In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../00_dataset/ab_data.csv')
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [6]:
df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [7]:
# 查看数据中group和landing_page是否有不一致的情况
df.loc[(df['group'] == 'treatment') != (df['landing_page'] == 'new_page')].count()

user_id         3893
timestamp       3893
group           3893
landing_page    3893
converted       3893
dtype: int64

In [10]:
# 源数据中可以看到有3893条记录的数据出错，故对其进行剔除
df2 = df.loc[~((df['group'] == 'treatment') != (df['landing_page'] == 'new_page'))]
df2.count()

user_id         290585
timestamp       290585
group           290585
landing_page    290585
converted       290585
dtype: int64

In [16]:
# 判断现有数据中是否有重复数据
# df2.user_id.nunique()
df2['user_id'].nunique()

290584

In [21]:
# 删除重复值
df2[df2['user_id'].duplicated(keep=False)]    # 查看重复的用户信息，将两条记录同时展现
df2 = df2.drop_duplicates(subset=['user_id'], keep='first')

df2.shape

(290584, 5)

In [33]:
# 计算用户转化率
# 整体转化率
total_converted = df2['converted'].mean()  # 整体转化率
# 新页面转化率
newPage_converted = df2[df2['landing_page'] == 'new_page']['converted'].mean()
# 旧页面转化率
oldPage_converted = df2[df2['landing_page'] == 'old_page']['converted'].mean()
# AB test的流量分流比例--用户收到新页面的概率
getNew_pro = df2[df2['landing_page'] == 'new_page'].shape[0] / df2.shape[0]

print('目前页面的整体转化率为:  %.2f%%, \n新页面转化率为：%.2f%%, \n旧页面的转化率为：%.2f%%, \n新旧页面的分流比例为：%.2f%%' 
      % (total_converted * 100, newPage_converted * 100, oldPage_converted * 100, getNew_pro * 100))

目前页面的整体转化率为:  11.96%, 
新页面转化率为：11.88%, 
旧页面的转化率为：12.04%, 
新旧页面的分流比例为：50.01%


通过计算可以发现，新旧页面的分流比例为50%，但新页面的转化率11.88%是低于旧页面12.04%的转化率，看似旧页面的效果要优于新页面，究竟是本身就如此还是由于一些随机因素导致的呢？对以上结果进行显著性检验：

#### AB test
目标：新页面的转化率高于旧页面的转化率， 故设计原假设与备择假设如下：
- H0：P_new - P_old <= 0
- H1：P_new - P_old > 0

In [37]:
new_num = df2[df2['landing_page'] == 'new_page'].shape[0]
old_num = df2[df2['landing_page'] == 'old_page'].shape[0]
p_new = newPage_converted
p_old = oldPage_converted

In [43]:
from scipy.stats import norm

# 计算显著性检验z值
z_score = (p_old - p_new) / np.sqrt(p_old*(1-p_old)/old_num + p_new*(1-p_new)/new_num)
print("z值为:", z_score)

# 计算置信区间，显著性水平α=0.05
z_t = norm.ppf(1-0.05)
print('查t表得到显著性水平为0.05的z值为：', z_t)

if z_score < z_t:
    print('按α=0.05的水准，z_score<z_t，该结果落在95%的置信区间内，无法拒绝原假设H0，故新页面的转化率无法判断比旧页面好，因此建议延长测试时间，继续观察数据。')

z值为: 1.3109271488301917
查t表得到显著性水平为0.05的z值为： 1.6448536269514722
按α=0.05的水准，该结果落在95%的置信区间内，无法拒绝原假设H0，故新页面的转化率无法判断比旧页面好，因此建议延长测试时间，继续观察数据。
