In [88]:
# import libraries
import numpy as np
import pandas as pd
import statsmodels.stats.api as sm
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
import matplotlib.pyplot as plt 

In [10]:
# import data into pandas
df = pd.read_csv('ab_data.csv')

# Data Cleaning

In [11]:
# check top 10 data in pandas
df.head(n=10)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
7,719014,2017-01-17 01:48:29.539573,control,old_page,0
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1


In [15]:
# understand data
df.describe()

Unnamed: 0,user_id,converted
count,294478.0,294478.0
mean,787974.124733,0.119659
std,91210.823776,0.324563
min,630000.0,0.0
25%,709032.25,0.0
50%,787933.5,0.0
75%,866911.75,0.0
max,945999.0,1.0


In [16]:
# check null value
df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [19]:
# check duplicate users
len(df)-df['user_id'].nunique()

3894

In [26]:
# remove duplicate data
df_unique = df.drop_duplicates(subset=['user_id'])
len(df_unique)-df_unique['user_id'].nunique()


0

In [27]:
# check groups
pd.crosstab(df_unique['group'], df_unique['landing_page'])

# other way of using the same
df_unique.groupby(['group', 'landing_page'])['landing_page'].count().unstack().fillna(0)

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1006,144226
treatment,144314,1038


In [56]:
# removing the users from control group seeing the new_page
t_index = df_unique[((df_unique['group']=='treatment') & (df['landing_page']=='old_page')) == True].index;
c_index = df_unique[((df_unique['group']=='control') & (df['landing_page']=='new_page')) == True].index;
df_cleaned = df_unique.drop(t_index).drop(c_index);
pd.crosstab(df_cleaned['group'], df_cleaned['landing_page'])

  t_index = df_unique[((df_unique['group']=='treatment') & (df['landing_page']=='old_page')) == True].index;
  c_index = df_unique[((df_unique['group']=='control') & (df['landing_page']=='new_page')) == True].index;


landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0,144226
treatment,144314,0


# EDA

In [58]:
#check the distribution of groups
df_cleaned['user_id'].groupby(df_cleaned['group']).count()/len(df_cleaned)*100

group
control      49.984751
treatment    50.015249
Name: user_id, dtype: float64

In [77]:
#baseline conversion rate
control_and_converted = df_cleaned[(df_cleaned['converted'] == 1) & (df_cleaned['group'] == "control")]['converted'].count()
control = df_cleaned[(df_cleaned['group'] == "control")]['converted'].count()
bcr = control_and_converted/control

In [78]:
#treatment conversion rate
treatment_and_converted = df_cleaned[(df_cleaned['converted'] == 1) & (df_cleaned['group'] == "treatment")]['converted'].count()
treatment = df_cleaned[(df_cleaned['group'] == "treatment")]['converted'].count()
tcr = treatment_and_converted/treatment

# sample size

In [84]:
# calculate effective size
initial_rate = bcr
expected_rate = 0.14 #increase by 2%
effect_size = sm.proportion_effectsize(initial_rate,expected_rate)   

#calculating the sample size
sample_size = sm.NormalIndPower().solve_power(
    effect_size, 
    power=0.8, 
    alpha=0.05, 
    ratio=1
    )                                                  
sample_size = round(sample_size)                                                  
print(sample_size) 

4569


# Prepare data with 4569 samples in each group

In [87]:

control_group = df_cleaned[df_cleaned['group'] == 'control'].sample(n=sample_size)
treatment_group = df_cleaned[df_cleaned['group'] == 'treatment'].sample(n=sample_size)

ab_test_df = pd.concat([control_group, treatment_group], axis=0)
ab_test_df.reset_index(drop=True, inplace=True)
pd.crosstab(ab_test_df['group'], ab_test_df['landing_page'])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0,4569
treatment,4569,0


# Running experiment

In [93]:
control_results = ab_test_df[ab_test_df['group'] == 'control']['converted']
treatment_results = ab_test_df[ab_test_df['group'] == 'treatment']['converted']
n_control = control_results.count()
n_treatment = treatment_results.count()
coversion_list = [control_results.sum(), treatment_results.sum()]
n_list = [n_control, n_treatment]
z_stat, pval = proportions_ztest(coversion_list, n_list)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(coversion_list, n_list, alpha=0.05)

In [94]:
print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')

z statistic: -0.38
p-value: 0.701
ci 95% for control group: [0.111, 0.130]
ci 95% for treatment group: [0.114, 0.133]
