In [1]:
import pandas as pd

In [2]:
# Load the A/B test dataset from the local CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="ab_testing_dataset/AB_testing_exercise.csv")

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,uid,country,gender,spent,purchases,date,group,device
0,11115722,MEX,F,1595,5,2016-03-08,GRP B,I
1,11122053,USA,M,498,2,2017-07-14,GRP B,I
2,11128688,USA,F,2394,6,2017-09-17,GRP A,I
3,11130578,USA,F,1197,3,2017-11-30,GRP A,I
4,11130759,ESP,M,1297,3,2018-01-10,GRP B,A


In [4]:
# Split the rows by experiment arm, then count the non-null uid entries per arm.
df_grouped = df.groupby(['group'], as_index=False)
df_grouped['uid'].count()

Unnamed: 0,group,uid
0,GRP A,23009
1,GRP B,22874


In [5]:
# Break the rows down by country, gender, device and experiment arm,
# then count the non-null uid entries in every combination.
df_grouped = df.groupby(
    ['country', 'gender', 'device', 'group'],
    as_index=False,
)
df_grouped['uid'].count()

Unnamed: 0,country,gender,device,group,uid
0,AUS,F,A,GRP A,134
1,AUS,F,A,GRP B,130
2,AUS,F,I,GRP A,130
3,AUS,F,I,GRP B,133
4,AUS,M,A,GRP A,133
5,AUS,M,A,GRP B,122
6,AUS,M,I,GRP A,109
7,AUS,M,I,GRP B,126
8,BRA,F,A,GRP A,1044
9,BRA,F,A,GRP B,1135


In [6]:
# Per group: number of rows (paywall viewers) and total number of purchases.
test_results_summary = (
    df.groupby(['group'], as_index=False)
      .agg({'purchases': ['count', 'sum']})
)
test_results_summary

Unnamed: 0_level_0,group,purchases,purchases
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,GRP A,23009,104377
1,GRP B,22874,104133


In [None]:
# Rebuild the per-group summary (row count and total purchases) from df,
# so this cell produces the same table every time it is re-run.
test_results_summary = (
    df.groupby(['group'], as_index=False)
      .agg({'purchases': ['count', 'sum']})
)

# Store total purchases divided by row count for each group as 'conversion_rate'.
# NOTE(review): the summary output above shows sums (e.g. 104377) larger than
# counts (e.g. 23009), so this ratio is above 1 -- it is purchases per row,
# not a proportion of rows that converted. Confirm this is the intended metric.
test_results_summary['conversion_rate'] = (
    test_results_summary[('purchases', 'sum')]
    / test_results_summary[('purchases', 'count')]
)
test_results_summary

## p-value Function

In [None]:
from scipy import stats
from scipy.stats import norm

In [None]:
# Calculate the p-value from group conversion rates and group sizes
def get_pvalue(con_conv, test_conv, con_size, test_size):
    """Return the two-sided p-value for the difference between two rates.

    The difference ``test_conv - con_conv`` is compared against a normal
    distribution centred on zero whose standard deviation is
    ``sqrt(p_c * (1 - p_c) / n_c + p_t * (1 - p_t) / n_t)``.

    Parameters
    ----------
    con_conv : float
        Rate observed in the control group.
    test_conv : float
        Rate observed in the test group.
    con_size : int or float
        Number of observations in the control group.
    test_size : int or float
        Number of observations in the test group.

    Returns
    -------
    float
        Twice the lower-tail probability of ``-|test_conv - con_conv|``.

    Notes
    -----
    Both rates must lie in [0, 1]: outside that range ``p * (1 - p)`` is
    negative and the square root below is no longer a real number.
    """
    # Variance contribution of each group's rate.
    con_var = con_conv * (1 - con_conv) * (1 / con_size)
    test_var = test_conv * (1 - test_conv) * (1 / test_size)
    std_err = (con_var + test_var) ** 0.5

    # Use the negative absolute difference so the lower tail can be doubled.
    neg_abs_diff = -abs(test_conv - con_conv)
    return 2 * stats.norm.cdf(neg_abs_diff, loc=0, scale=std_err)

In [None]:
# Inputs for the significance test, copied from the GRP A (control) and
# GRP B (test) rows of the summary table.
# NOTE(review): these rates are purchases sum / row count (about 4.5, i.e.
# above 1) and the sizes are the purchase totals rather than the row counts
# (23009 / 22874). get_pvalue and get_ci compute p * (1 - p), which is negative
# for rates above 1 -- confirm the intended metric before trusting the results.

# Control group
con_conv = 4.536355  # control group rate (purchases per row)
con_size = 104_377   # control group size as used here (total purchases)

# Test group
test_conv = 4.552461  # test group rate (purchases per row)
test_size = 104_133   # test group size as used here (total purchases)

In [None]:
# Two-sided p-value for the difference between the control and test rates.
p_val = get_pvalue(
    con_conv=con_conv,
    test_conv=test_conv,
    con_size=con_size,
    test_size=test_size,
)

In [None]:
# Display the p-value returned by get_pvalue.
print(p_val)

## Confidence Interval Function

In [None]:
def get_ci(test_conv, con_conv, test_size, con_size, ci):
    """Return a confidence interval for the difference ``test_conv - con_conv``.

    Note the argument order: the test group comes first here, whereas
    ``get_pvalue`` takes the control group first.

    Parameters
    ----------
    test_conv : float
        Rate observed in the test group.
    con_conv : float
        Rate observed in the control group.
    test_size : int or float
        Number of observations in the test group.
    con_size : int or float
        Number of observations in the control group.
    ci : float
        Confidence level, e.g. ``0.95``.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` of the interval around the difference.

    Notes
    -----
    Both rates must lie in [0, 1]: outside that range ``p * (1 - p)`` is
    negative and the square root below is no longer a real number.
    """
    # Standard deviation of the difference between the two rates.
    std_err = ((test_conv * (1 - test_conv)) / test_size +
               con_conv * (1 - con_conv) / con_size) ** 0.5

    # Critical value leaving (1 - ci) / 2 probability in the upper tail.
    critical = stats.norm.isf((1 - ci) / 2)

    diff = test_conv - con_conv
    margin = critical * std_err
    return (diff - margin, diff + margin)

In [None]:
# 95% confidence interval for the difference between the test and control rates.
get_ci(
    test_conv=test_conv,
    con_conv=con_conv,
    test_size=test_size,
    con_size=con_size,
    ci=0.95,
)

## Confirming our test results

In [None]:
# Number of distinct uids inside each experiment group
results = df.groupby('group').agg({'uid': 'nunique'})

# Total number of distinct uids across the whole dataset
unique_users = len(df['uid'].unique())

# Express each group's distinct-uid count as a percentage of that total
results = results.div(unique_users).mul(100)
print(results)

In [None]:
# Number of distinct uids inside each (group, device, gender) combination
results = (
    df.groupby(['group', 'device', 'gender'])
      .agg({'uid': 'nunique'})
)

# Total number of distinct uids across the whole dataset
unique_users = len(df['uid'].unique())

# Express each combination's distinct-uid count as a percentage of that total
results = results.div(unique_users).mul(100)
print(results)