In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df=pd.read_csv('./data/ab_data.csv')

In [3]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
#查看详细信息
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [5]:
df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

### 2.数据分析

#### 2.1数据清洗

- 查询并处理缺失值；查询并处理重复数据
- 确保在实验中给treatment组为新页面，control为旧业面

In [7]:
#检查是否分组与页面展示不符的现象
t_old_error=len(df[(df['group']=='treatment') & (df['landing_page']=='old_page')])
c_new_error=len(df[(df['group']=='control')&(df['landing_page']=='new_page')])
print('实验组看到老页面人数：{},对照组看到的新页面的人数：{}'.format(t_old_error,c_new_error))

实验组看到老页面人数：1965,对照组看到的新页面的人数：1928


- 将不符合的数据排除

In [8]:
df=df[((df['group']=='treatment')&(df['landing_page']=='new_page'))|((df['group']=='control')&(df['landing_page']=='old_page'))]
df.shape

(290585, 5)

In [9]:
#查找重复值
df.drop_duplicates(subset='user_id').head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [10]:
#删除重复值
df.drop_duplicates(subset=['user_id'],inplace=True,keep='first')
df=df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290584 entries, 0 to 290583
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         290584 non-null  int64 
 1   user_id       290584 non-null  int64 
 2   timestamp     290584 non-null  object
 3   group         290584 non-null  object
 4   landing_page  290584 non-null  object
 5   converted     290584 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 13.3+ MB


- 查看流量分配占比 ：看到新页面的用户占比是多少

In [11]:
percent=df[df['landing_page']=='new_page'].shape[0]/df.shape[0]
percent

0.5000619442226688

### 2.2假设验证

- 提出零假设和备选假设
  - 设旧业面的转化率为p1,新页面为p2
  - 零假设 p1>p2 即p1-p2>0
  - 备选假设 p1<p2 即p1-p2<0

- 给定检验α取0.05

In [14]:
#旧版，新版用户数
n_old=df.query('group=="control"').shape[0]
n_new=df.query('group=="treatment"').shape[0]
#旧版，新版转化用户数
convert_old=df.query('group=="control"&converted==1').shape[0]
convert_new=df.query('group=="treatment"&converted==1').shape[0]
#旧版，新版转化率
p_old=convert_old/n_old
p_new=convert_new/n_new
p_c=(convert_old+convert_new)/(n_old+n_new)
z=(p_old-p_new)/np.sqrt(p_c*(1-p_c)*(1/n_old+1/n_new))


In [15]:
print(p_old,p_new,p_c,z)

0.1203863045004612 0.12299222352212512 0.12168942543292129 -2.1484056695589


- 可以看出新界面的转化率高