In [17]:
%pylab inline
import math as mt
import numpy as np
import pandas as pd
from scipy.stats import norm
import os

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
# https://www.kaggle.com/tammyrotem/ab-tests-with-python

In [3]:
# Baseline information before the experiment start
baseline = {"Cookies":40000, 
           "Clicks":3200,
           "Enrollments": 660,
           "CTP": 0.08,
           "GConversion":0.20625,
           "Retention": 0.53,
           "NConversion":0.109313}

In [4]:
# Scale the counts estimates
baseline["Cookies"] =5000
baseline["Clicks"] = baseline["Clicks"] *(5000/40000)
baseline["Enrollments"] = baseline["Enrollments"] *(5000/40000)

In [5]:
print(baseline)

{'Cookies': 5000, 'Clicks': 400.0, 'Enrollments': 82.5, 'CTP': 0.08, 'GConversion': 0.20625, 'Retention': 0.53, 'NConversion': 0.109313}


In [6]:
# Gross Conversion sd calculation
GC = {}
GC["dmin"] = 0.01
GC["p"] = baseline["GConversion"]
GC["n"] = baseline["Clicks"] # the unit of diversion = unit of analysis
GC["sd"] = round(mt.sqrt((GC["p"]*(1-GC["p"]))/GC["n"]),4)
GC["sd"]

0.0202

In [7]:
# Retention sd
R = {}
R["dmin"] = 0.01
R['p'] = baseline["Retention"]
R['n'] = baseline['Enrollments'] # unit of diversion != analysis 
# Calculate empirical sd
R['sd'] = round(mt.sqrt((R["p"]*(1-R["p"]))/R["n"]),4)
R['sd']

0.0549

In [8]:
# Net conversion sd
NC={}
NC["dmin"]=0.0075
NC["p"]=baseline["NConversion"]
NC["n"]=baseline["Clicks"]
NC["sd"]=round(mt.sqrt((NC["p"]*(1-NC["p"]))/NC["n"]),4)
NC["sd"]

0.0156

In [9]:

def get_sds(p,d):
    sd1 = mt.sqrt(2*p*(1-p))
    sd2 = mt.sqrt(p*(1-p) + (p+d)*(1-(p+d)))
    x = [sd1,sd2]
    return x

def get_z(alpha):
    return norm.ppf(alpha)

def get_sample_size(sds,alpha,beta,d):
    n = pow((get_z(1-alpha/2)*sds[0] + get_z(1-beta)*sds[1]),2)/pow(d,2)
    return n


In [10]:
GC["SampSize"] = round(get_sample_size(get_sds(GC["p"],GC['dmin']),0.05,0.2,GC['dmin']),2)
GC["SampSize"] 
# at least these amount of cookies that click the free trail per group

25834.7

In [11]:
GC['SampSize'] = GC['SampSize']/(400/5000)*2
GC['SampSize']

645867.5

In [12]:
R["SampSize"]=round(get_sample_size(get_sds(R["p"],R["dmin"]),0.05,0.2,R["dmin"]))
R["SampSize"]

39087.0

In [13]:
R["SampSize"] = R["SampSize"]/0.08/0.20625*2
R["SampSize"]
# the result is way too high. don't use this metric

4737818.181818182

In [14]:
NC["SampSize"]=round(get_sample_size(get_sds(NC["p"],NC["dmin"]),0.05,0.2,NC["dmin"]))
NC["SampSize"]

27413.0

In [15]:
NC["SampSize"]=NC["SampSize"]/0.08*2
NC["SampSize"]

685325.0

In [60]:
# import control and expriment data
os.getcwd()
control = pd.read_csv('Final Project Results - Control.csv')
experiment = pd.read_csv('Final Project Results - Experiment.csv')

control.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7723,687,134.0,70.0
1,"Sun, Oct 12",9102,779,147.0,70.0
2,"Mon, Oct 13",10511,909,167.0,95.0
3,"Tue, Oct 14",9871,836,156.0,105.0
4,"Wed, Oct 15",10014,837,163.0,64.0


In [40]:
# Sanity check for the difference between total counts

# Page view difference
PVcon = control['Pageviews'].sum()
PVexp = experiment['Pageviews'].sum()
PVtol = PVcon + PVexp
p = 0.5 #hypothetically, half in control and half in experiment 
alpha = 0.05
ME = round(get_z(1-(alpha/2))*(mt.sqrt(p*(1-p)/PVtol)),4)
print(p-ME, p+ME,round(PVcon/PVtol,4))

# Clicks difference
Ccon = control['Clicks'].sum()
Cexp = experiment['Clicks'].sum()
Ctot = Ccon + Cexp
ME = round(get_z(1-(alpha/2))*(mt.sqrt(p*(1-p)/Ctot)),4)
print(p-ME, p+ME,round(Ccon/Ctot,4))



0.4988 0.5012 0.5006
0.4959 0.5041 0.5005


In [52]:
# sanity check for the CTP

CTPcon = round(Ccon/PVcon,4)
CTPexp = round(Cexp/PVexp,4)

ppool = round(Ctot/PVtol,4)
ctpsd = round(sqrt((ppool*(1-ppool)*(1/PVcon+1/PVexp))),4)
ctpme = round(get_z(1-alpha/2)*ctpsd,4)
print(0+ctpme,0-ctpme,round(CTPcon-CTPexp,4))


0.0014 -0.0014 -0.0001


In [67]:
# Effect size for gross Conversion and net conversion
control = control.dropna()
experiment = experiment.dropna()

In [75]:
# GROSS CONVERSION
Econ=control["Enrollments"].sum()
Ccon=control["Clicks"].sum()
Eexp=experiment["Enrollments"].sum()
Cexp=experiment["Clicks"].sum()
GCcon =Econ/Ccon
GCexp = Eexp/Cexp
GCpool = (Econ + Eexp)/(Ccon+Cexp)

GCsd = sqrt(GCpool*(1-GCpool)*(1/Ccon+1/Cexp))
GCme = get_z(1-0.05/2)*GCsd

GCdiff = GCexp - GCcon

print(GCdiff - GCme,GCdiff+GCme,GC["dmin"])

# it is statistically(!=0) but is not practical significant the change is negative

-0.02912320088750467 -0.011986548273218463 0.01


In [78]:
# Net CONVERSION
Pcon=control["Payments"].sum()
Ccon=control["Clicks"].sum()
Pexp=experiment["Payments"].sum()
Cexp=experiment["Clicks"].sum()
NCcon =Pcon/Ccon
NCexp = Pexp/Cexp
NCpool = (Pcon + Pexp)/(Ccon+Cexp)

NCsd = sqrt(NCpool*(1-NCpool)*(1/Ccon+1/Cexp))
NCme = get_z(1-0.05/2)*NCsd

NCdiff = round(NCexp - NCcon,4)

print(NCdiff - NCme,NCdiff+NCme,NC["dmin"],NCdiff)

-0.011630778003449567 0.001830778003449567 0.0075 -0.0049


In [79]:
# Sign test that checks the difference on each day

# Join the dataset
full = control.join(other = experiment, how = "inner",lsuffix = "_cont",rsuffix = '_exp')
full.count()

Date_cont           23
Pageviews_cont      23
Clicks_cont         23
Enrollments_cont    23
Payments_cont       23
Date_exp            23
Pageviews_exp       23
Clicks_exp          23
Enrollments_exp     23
Payments_exp        23
dtype: int64

In [83]:
# Add column creates binary outcome of each metric
x = full['Enrollments_cont']/full['Clicks_cont']
y = full['Enrollments_exp']/full['Clicks_exp']
full["GC"] = np.where(x<y,1,0)
a = full['Payments_cont']/full['Clicks_cont']
b = full['Payments_exp']/full['Clicks_exp']
full["NC"] = np.where(a<b,1,0)
full.head()

Unnamed: 0,Date_cont,Pageviews_cont,Clicks_cont,Enrollments_cont,Payments_cont,Date_exp,Pageviews_exp,Clicks_exp,Enrollments_exp,Payments_exp,GC,NC
0,"Sat, Oct 11",7723,687,134.0,70.0,"Sat, Oct 11",7716,686,105.0,34.0,0,0
1,"Sun, Oct 12",9102,779,147.0,70.0,"Sun, Oct 12",9288,785,116.0,91.0,0,1
2,"Mon, Oct 13",10511,909,167.0,95.0,"Mon, Oct 13",10480,884,145.0,79.0,0,0
3,"Tue, Oct 14",9871,836,156.0,105.0,"Tue, Oct 14",9867,827,138.0,92.0,0,0
4,"Wed, Oct 15",10014,837,163.0,64.0,"Wed, Oct 15",9793,832,140.0,94.0,0,1


In [90]:
GC_x = full["GC"].sum()
NC_x = full['NC'].sum()
n = len(full.index)
print(GC_x,NC_x,n)

4 10 23


In [91]:
#first a function for calculating probability of x=number of successes
def get_prob(x,n):
    p=round(mt.factorial(n)/(mt.factorial(x)*mt.factorial(n-x))*0.5**x*0.5**(n-x),4)
    return p
#next a function to compute the pvalue from probabilities of maximum x
def get_2side_pvalue(x,n):
    p=0
    for i in range(0,x+1):
        p=p+get_prob(i,n)
    return 2*p

In [92]:
print(get_2side_pvalue(GC_x,n),get_2side_pvalue(NC_x,n))

0.0026000000000000003 0.6774
