In [1]:
import pandas as pd
import numpy as np

In [6]:
#This data is from Kaggle: https://www.kaggle.com/zhangluyuan/ab-testing
df=pd.read_csv("C:\\datasets\\ab_data.csv")

In [7]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [9]:
df.shape

(294478, 5)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [13]:
df["group"].value_counts()

treatment    147276
control      147202
Name: group, dtype: int64

In [27]:
conversion = df.groupby('group')['converted']
c_rates=conversion.agg([np.mean])
c_rates

Unnamed: 0_level_0,mean
group,Unnamed: 1_level_1
control,0.120399
treatment,0.11892


In [24]:
control_results = df[df['group'] == 'control']['converted']
treatment_results = df[df['group'] == 'treatment']['converted']
control_nconverted = control_results.count()
treatment_nconverted = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]

print("Number of converts in the control group: ",control_nconverted)
print("Number of converts in the treatment group: ",treatment_nconverted)


Number of converts in the control group:  147202
Number of converts in the treatment group:  147276


In [25]:
#import stats models so that we test the hypotheses
import statsmodels.stats.api as sms
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

tot=[control_nconverted,treatment_nconverted]
z_stat, pval = proportions_ztest(successes, nobs=tot)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=tot, alpha=0.05)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: 1.24
p-value: 0.216
ci 95% for control group: [0.119, 0.122]
ci 95% for treatment group: [0.117, 0.121]


### *Interpretation*

Using a significance level of %5 we fail to reject our null hypothesis that the treatment group performs differently than the control group because the p-value is >0.05. Looking at the confidence intervals we can say with 95% confidence that the control group will have a conversion rate between 11.9% and 12.2% and the treatment group will have a conversion rate between 11.7% and 12.1%  

**note* This is a data set ready-made for A/B testing so all that is exhibited here is execution of the actual test.  Additionally, the testing was done without first performing any sort of EDA or cleaning which could change the results.