In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Load the raw A/B-test log and preview the first rows.
AB_DATA_PATH = '../data/ab_data.csv'
page_df = pd.read_csv(AB_DATA_PATH)
page_df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [13]:
# Compare total rows with distinct users: a gap means some user_ids appear more than once.
total_rows = len(page_df)
distinct_users = page_df['user_id'].nunique()
print(f'数据行数： {total_rows}')
print(f'独立用户数: {distinct_users}')

数据行数： 294478
独立用户数: 290584


In [6]:
# Preview (first 10 rows, ordered by user_id) of users that occur more than once.
# keep=False flags every occurrence of a repeated user_id, not only the later ones.
repeat_user_mask = page_df['user_id'].duplicated(keep=False)
page_df.loc[repeat_user_mask].sort_values(by='user_id').head(10)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
230259,630052,2017-01-17 01:16:05.208766,treatment,new_page,0
213114,630052,2017-01-07 12:25:54.089486,treatment,old_page,1
22513,630126,2017-01-14 13:35:54.778695,treatment,old_page,0
251762,630126,2017-01-19 17:16:00.280440,treatment,new_page,0
183371,630137,2017-01-20 02:08:49.893878,control,old_page,0
11792,630137,2017-01-22 14:59:22.051308,control,new_page,0
207211,630320,2017-01-07 18:02:43.626318,control,old_page,0
255753,630320,2017-01-12 05:27:37.181803,treatment,old_page,0
96929,630471,2017-01-07 02:14:17.405726,control,new_page,0
110634,630471,2017-01-23 01:42:51.501851,control,old_page,0


In [14]:
# A row is inconsistent when exactly one of "group is treatment" / "page is new_page" holds.
is_treatment = page_df['group'] == 'treatment'
got_new_page = page_df['landing_page'] == 'new_page'
mismatch = is_treatment ^ got_new_page  # XOR on boolean Series is the element-wise "!="
print(f'不匹配数：{mismatch.sum()}')

不匹配数：3893


In [15]:
# Keep only rows whose group and landing page agree; .copy() detaches the result from page_df.
match_df = page_df.loc[~mismatch].copy()

In [17]:
# Re-check rows vs. distinct users on the group/page-consistent subset.
matched_rows = len(match_df)
matched_users = match_df['user_id'].nunique()
print(f'数据行数： {matched_rows}')
print(f'独立用户数: {matched_users}')

数据行数： 290585
独立用户数: 290584


In [27]:
# List every row of match_df whose user_id occurs more than once (all occurrences shown).
still_repeated = match_df['user_id'].duplicated(keep=False)
match_df.loc[still_repeated]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [29]:
# Collapse repeated user_ids to one row each, keeping the last occurrence in row order.
match_df = match_df.drop_duplicates(subset='user_id', keep='last')

In [30]:
# After de-duplication the row count and the distinct-user count should coincide.
dedup_rows = len(match_df)
dedup_users = match_df['user_id'].nunique()
print(f'数据行数： {dedup_rows}')
print(f'独立用户数: {dedup_users}')

数据行数： 290584
独立用户数: 290584


In [31]:
# Count missing values in each column (isna is the same check as isnull).
match_df.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [32]:
# Share of the cleaned rows that were served the new page.
# Plain int / int division keeps the result a built-in float.
new_page_rows = len(match_df.loc[match_df['landing_page'] == 'new_page'])
new_page_rows / len(match_df)

0.5000619442226688

In [33]:
# Boolean masks for the two arms and for converted users.
in_control = match_df['group'] == 'control'
in_treatment = match_df['group'] == 'treatment'
did_convert = match_df['converted'] == 1

# Sample size per arm (cast to built-in int so later ratios are built-in floats).
n_old = int(in_control.sum())
n_new = int(in_treatment.sum())

# Converted users per arm.
convert_old = int((in_control & did_convert).sum())
convert_new = int((in_treatment & did_convert).sum())

# Observed conversion rate per arm.
p_old = convert_old / n_old
p_new = convert_new / n_new

print(
    '旧版总受试用户数:', n_old,
    '旧版转化用户数:', convert_old,
    '旧版转化率:', p_old,
)
print(
    '新版总受试用户数:', n_new,
    '新版转化用户数:', convert_new,
    '新版转化率:', p_new,
)




旧版总受试用户数: 145274 旧版转化用户数: 17489 旧版转化率: 0.1203863045004612
新版总受试用户数: 145310 新版转化用户数: 17872 新版转化率: 0.12299222352212512


In [34]:
# Pooled conversion rate: all conversions over all users, both arms combined.
total_converted = convert_old + convert_new
total_users = n_old + n_new
p_c = total_converted / total_users
print('转化率的联合估计:', p_c)

转化率的联合估计: 0.12168942543292129


In [35]:
# Two-proportion z statistic; the standard error is built from the pooled rate p_c.
# Sign convention: numerator is (p_old - p_new), so z < 0 when the new page converts better.
pooled_se = np.sqrt(p_c * (1 - p_c) * (1 / n_old + 1 / n_new))
z = (p_old - p_new) / pooled_se
print('检验统计量z:', z)

检验统计量z: -2.1484056695589


In [36]:
from scipy.stats import norm

In [39]:
# Critical value for a one-sided (left-tailed) z test at significance level alpha.
# The rejection region is { z < z_alpha }.
alpha = 0.05
z_alpha = norm.ppf(alpha)  # for a two-sided test use norm.ppf(alpha / 2) instead
z_alpha

# Result: z_alpha = -1.6448536269514729 (conventionally rounded to -1.645).
# The observed z = -2.15 is below z_alpha, i.e. it falls in the rejection region,
# so the null hypothesis is rejected at significance level alpha = 0.05.

-1.6448536269514729