# Feature: Free Shipping Bar with $50 Threshold

### 1. Import packages and load dataset

In [1]:
import pandas as pd
import math
import scipy.stats
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Load the three input tables. Shared copies of the source files:
# 1.1 AB Category: https://drive.google.com/file/d/1F7-jE-fOPEg-ytnbwG5YfsrbsoHau6u2/view?usp=sharing
# 1.2 AB Rev Data: https://drive.google.com/file/d/1CQjswN9xbipO6x-Ec3MnGcc7eSbEjFCE/view?usp=sharing
# 1.3 AB Visit Data Test1_2: https://drive.google.com/file/d/1L8VRtjmGXfY_3HCJSKYlW1_-dRug5LKJ/view?usp=sharing
# The CSV files are read by relative path.
df_Test_Overall = pd.read_csv(r'1.3 AB Visit Data Test1_2.csv')
df_rev = pd.read_csv(r'1.2 AB Rev Data.csv')
df_categorymap=pd.read_csv(r'1.1 AB Category Mapping.csv')

FileNotFoundError: [Errno 2] File 1.3 AB Visit Data Test1_2.csv does not exist: '1.3 AB Visit Data Test1_2.csv'

In [None]:
# Preview the first three rows of the visit data.
df_Test_Overall.head(3)

### 2. Filter data for test2, and check traffic.

In [None]:
# Keep only the rows that belong to test 2, then show (rows, columns).
df_Test=df_Test_Overall[df_Test_Overall['testid']==2]
df_Test.shape

#### Test 2 has 6,870,668 rows of data (about 6.9 million)

In [None]:
# Sanity check: number of distinct sessions in each test group.
# Observation from the run: the two groups are almost equal in size.
(
    df_Test
    .drop_duplicates(['ControlGroup','SessionID'])
    .groupby(['ControlGroup'])['SessionID']
    .count()
)

In [None]:
# Daily traffic of test 2: number of distinct sessions per date.
# Observation from the run: the first four days and the last day look irregular.
(
    df_Test
    .drop_duplicates(['Date','SessionID'])
    .groupby(['Date'])['SessionID']
    .count()
)

In [None]:
# Drop the irregular days: keep only rows dated strictly after 2019-06-17
# and strictly before 2019-07-12.
visit_dates=pd.to_datetime(df_Test['Date'])
inside_window=(visit_dates>pd.to_datetime('2019-06-17'))&(visit_dates<pd.to_datetime('2019-07-12'))
df_Test=df_Test[inside_window]

In [None]:
# Re-check the daily distinct-session counts to confirm the irregular days
# were excluded successfully.
(
    df_Test
    .drop_duplicates(['Date','SessionID'])
    .groupby(['Date'])['SessionID']
    .count()
)

In [None]:
# Run a garbage-collection pass (presumably to free memory held by the
# rows filtered out above).
import gc
gc.collect()


### 3.1 Analyze metrics

#### Count customer behavior at the session level first; the same calculation is automated for both session and customer level later.

In [None]:
# Distinct sessions per test group, stored as a one-column DataFrame
# (df_Result) that the metric counts are joined onto afterwards.
df_Result=(
    df_Test
    .drop_duplicates(['SessionID','ControlGroup'])
    .groupby('ControlGroup')['SessionID']
    .count()
    .to_frame()
)

In [None]:
# Display the session counts per test group.
df_Result

In [None]:
metrics=['Bounced','SawProduct','AddedToCart','ReachedCheckout','Converted']
# For every metric, count the distinct sessions per test group whose flag
# equals 1 and append that count to df_Result as a column named after the metric.
for metric in metrics:
    flagged_rows=df_Test[df_Test[metric]==1]
    sessions_with_flag=(
        flagged_rows
        .drop_duplicates(['SessionID','ControlGroup'])
        .groupby('ControlGroup')['SessionID']
        .count()
    )
    df_Result=df_Result.join(sessions_with_flag.rename(metric))

In [None]:
# Display the session counts together with the per-metric counts.
df_Result

#### Create a z_test calculation tool

In [None]:
def z_test_calculator(df,denominator,numerator,two_sided=False):
    """Two-proportion z-test between the control and the variation row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of absolute counts. The row labelled 1 is read as the control
        group and the row labelled 0 as the variation group.
    denominator : str
        Column holding the base count of each group (e.g. total sessions).
    numerator : str
        Column holding the count of observations in which the metric occurred.
    two_sided : bool, default False
        When False (the default) the returned p-value is the
        upper-tail probability of ``abs(z)``. When True that value is doubled,
        giving the two-sided p-value.

    Returns
    -------
    tuple
        ``(p_value, perc_lift, abs_lift)`` where the lifts are measured as
        variation relative to control.
    """
    # Raw counts for each group.
    control_denominator=df.loc[1,denominator]
    var_denominator=df.loc[0,denominator]
    control_numerator=df.loc[1,numerator]
    var_numerator=df.loc[0,numerator]

    # Observed rate of each group.
    control_rate=control_numerator/control_denominator
    var_rate=var_numerator/var_denominator

    # Standard error of each rate: sqrt(p*(1-p)/N).
    control_sd=math.sqrt(control_rate*(1-control_rate)/control_denominator)
    var_sd=math.sqrt(var_rate*(1-var_rate)/var_denominator)

    # z score = difference of the two rates divided by the combined standard
    # error, i.e. the square root of the sum of the two squared standard errors.
    z_score=(control_rate-var_rate)/math.sqrt(pow(control_sd,2)+pow(var_sd,2))

    # Upper-tail probability of |z| under the standard normal distribution.
    p_value=scipy.stats.norm.sf(abs(z_score))
    if two_sided:
        p_value=2*p_value

    # Lift of variation over control, relative and absolute.
    perc_lift=(var_rate-control_rate)/control_rate
    abs_lift=(var_rate-control_rate)

    return (p_value,perc_lift,abs_lift)

#### Real Calculation part, calculate all metrics on both levels together

In [None]:
# Compute every funnel KPI at both the session level and the customer level.
# dic_final maps the level name ('SessionID' / 'CusID') to its KPI table.
user_types=['SessionID','CusID']
dic_final={}
metrics=['Bounced','SawProduct','AddedToCart','ReachedCheckout','Converted']

for user_type in user_types:
    # Denominator: distinct ids of this level per test group.
    df_Result_any=pd.DataFrame(df_Test.drop_duplicates([user_type,'ControlGroup']).groupby('ControlGroup')[user_type].count())

    # Numerators: distinct ids per test group whose metric flag equals 1.
    for metric in metrics:
        a=df_Test[df_Test[metric]==1].drop_duplicates([user_type,'ControlGroup']).groupby('ControlGroup')[user_type].count()
        a.name=metric
        df_Result_any=df_Result_any.join(a)

    # (denominator column, numerator column) pairs: every metric as a share of
    # all ids, followed by the step-to-step funnel ratios.
    KPIs=[(user_type,metric) for metric in metrics]+[
          ('AddedToCart','ReachedCheckout'),
          ('ReachedCheckout','Converted'),
          ('AddedToCart','Converted')]

    # z_test_calculator reads the rows labelled 1 and 0; without both of them
    # there is nothing to compare.
    if 0 not in df_Result_any.index or 1 not in df_Result_any.index:
        continue

    kpi_rows=[]
    for denominator,numerator in KPIs:
        p_value,perc_lift,abs_lift=z_test_calculator(df_Result_any,denominator,numerator)
        kpi_rows.append({'denominator':denominator,
                         'numerator':numerator,
                         'p_value':p_value,
                         'perc_lift':perc_lift,
                         'abs_lift':abs_lift})

    dic_final[user_type]=pd.DataFrame(kpi_rows,columns=['denominator','numerator','p_value','perc_lift','abs_lift'])


In [None]:
# Inspect df_Result_any: it holds the customer-level counts at this point,
# because 'CusID' is the last entry of user_types.
df_Result_any

In [None]:
# Inspect the KPI table stored under the key 'CusID' (customer-level results).
dic_final['CusID']

#### Export result into excel file, in multiple sheets.

In [None]:
# Write one sheet per analysis level. The context manager saves and closes
# the workbook on exit (ExcelWriter.save() no longer exists in pandas 2.x).
with pd.ExcelWriter('3.1 Final_data_Test2.xlsx') as writer:
    for key,df_kpis in dic_final.items():
        df_kpis.to_excel(writer, sheet_name=key)

#### Re-calculate everything for different cuts (category of the item purchased, platform used, and visitor type)

In [None]:
# Set-up for the per-cut analysis.
# Working copy of the filtered test-2 visit data.
df_test_data = df_Test.copy()
# Result container; filled by the per-cut loop with one KPI table per key.
dic_final_cuts_usertype = {}
# Analysis levels and the columns whose values define the cuts.
user_types = ['SessionID','CusID']
cuts = ['CategoryID', 'PlatformID','VisitorTypeID']

In [None]:
# Funnel KPIs for every analysis level, every cut column and every value of
# that column. Results are stored in dic_final_cuts_usertype under the key
# '<user_type>_<cut><value>', e.g. 'SessionID_CategoryID2'.
metrics = ['Bounced','SawProduct','AddedToCart','ReachedCheckout','Converted']

for user_type in user_types:
    print(user_type)

    # (denominator column, numerator column) pairs: every metric as a share of
    # all ids, followed by the step-to-step funnel ratios.
    KPIs=[(user_type,metric) for metric in metrics]+[
          ('AddedToCart','ReachedCheckout'),
          ('ReachedCheckout','Converted'),
          ('AddedToCart','Converted')]

    for cut in cuts:
        print(cut)

        # Missing values are dropped up front: NaN never matches the equality
        # filter below, so it could only produce an empty slice.
        for p in df_test_data[cut].dropna().unique():

            df_Test_cut = df_test_data[df_test_data[cut]==p]

            # Denominator: distinct ids of this level per test group.
            df_Result_any1 = pd.DataFrame(df_Test_cut.drop_duplicates([user_type,'ControlGroup']).groupby('ControlGroup')[user_type].count())

            # Numerators: distinct ids per test group whose metric flag equals 1.
            for metric in metrics:
                a = df_Test_cut[df_Test_cut[metric]==1].drop_duplicates([user_type,'ControlGroup']).groupby('ControlGroup')[user_type].count()
                a.name = metric
                df_Result_any1 = df_Result_any1.join(a)

            # z_test_calculator reads the rows labelled 1 and 0; a cut that
            # lacks one of the groups cannot be compared and is skipped.
            if 0 not in df_Result_any1.index or 1 not in df_Result_any1.index:
                continue

            kpi_rows=[]
            for denominator,numerator in KPIs:
                p_value,perc_lift,abs_lift=z_test_calculator(df_Result_any1,denominator,numerator)
                kpi_rows.append({'denominator':denominator,
                                 'numerator':numerator,
                                 'p_value':p_value,
                                 'perc_lift':perc_lift,
                                 'abs_lift':abs_lift})

            dic_final_cuts_usertype[user_type+'_'+cut+str(p)]=pd.DataFrame(kpi_rows,columns=['denominator','numerator','p_value','perc_lift','abs_lift'])

In [None]:
# Example: session-level KPI table for the cut CategoryID == 2.
dic_final_cuts_usertype['SessionID_CategoryID2']

#### Export result into excel file, in multiple sheets.

In [None]:
# Excel workbook containing every cut, one sheet per key. The context manager
# saves and closes the workbook on exit (ExcelWriter.save() no longer exists
# in pandas 2.x).
with pd.ExcelWriter('3.2 Final_cuts_data_Test2.xlsx') as writer:
    for key,df_kpis in dic_final_cuts_usertype.items():
        df_kpis.to_excel(writer, sheet_name=key)

### 3.2 Analyze revenue

In [None]:
# Lookup from (SessionID, ControlGroup) to customer, category and visitor type.
# Identical rows are dropped so that repeated visit rows for one session do
# not multiply its revenue rows when this table is merged with the revenue data.
# NOTE(review): a session that maps to several distinct
# CusID/CategoryID/VisitorTypeID combinations still yields several rows here,
# so its revenue would still be repeated after the merge - confirm this is intended.
df_cusid=df_Test_Overall[['SessionID','ControlGroup','CusID','CategoryID','VisitorTypeID']].drop_duplicates()
df_cusid.head(3)

In [None]:
# Attach customer, category and visitor-type attributes to the revenue rows,
# matching on session and test group (pd.merge defaults to an inner join).
df_Rev=pd.merge(df_rev,df_cusid, on = ['SessionID','ControlGroup'])

In [None]:
# Keep only the revenue rows of test 2 and preview them.
# NOTE(review): the date window applied to df_Test is not applied to df_Rev,
# which is built from df_Test_Overall - confirm whether revenue from the
# excluded days should be dropped as well.
df_Rev=df_Rev[df_Rev['testid']==2]
df_Rev.head(3)

In [None]:
# Number of distinct sessions with revenue rows in each test group.
df_Rev.drop_duplicates(['ControlGroup','SessionID']).groupby(['ControlGroup'])['SessionID'].count()

In [None]:
# Row-level revenue of each test group as arrays:
# ControlGroup == 1 goes to Control_Rev, ControlGroup == 0 to Var_Rev.
is_control=df_Rev['ControlGroup']==1
is_variation=df_Rev['ControlGroup']==0
Control_Rev=df_Rev.loc[is_control,'TotalRevenue'].array
Var_Rev=df_Rev.loc[is_variation,'TotalRevenue'].array
Var_Rev

In [None]:
# 95th percentile of the row-level revenue in each group (P1: control,
# P2: variation); used as the upper cut-off when the revenue sums are built
# for the z-test.
P1=np.percentile(Control_Rev,95)
P2=np.percentile(Var_Rev,95)

In [None]:
# Rank-based comparison of the revenue distributions of the two groups.
# The alternative is stated explicitly because the default of this argument
# differs between SciPy versions.
scipy.stats.mannwhitneyu(Control_Rev,Var_Rev,alternative='two-sided')

In [None]:
def z_test_calculator_continuous(df,denominator,numerator,numerator_sq,two_sided=False):
    """Z-test on the difference of two means, computed from aggregated sums.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated table. The row labelled 1 is read as the control group and
        the row labelled 0 as the variation group.
    denominator : str
        Column holding the number of observations of each group.
    numerator : str
        Column holding the sum of the measured value (e.g. revenue).
    numerator_sq : str
        Column holding the sum of the squared measured value.
    two_sided : bool, default False
        When False (the default) the returned p-value is the
        upper-tail probability of ``abs(z)``. When True that value is doubled,
        giving the two-sided p-value.

    Returns
    -------
    tuple
        ``(p_value, perc_lift, abs_lift)`` where the lifts are measured as
        variation relative to control.
    """
    # Aggregates for each group.
    control_denominator=df.loc[1,denominator]
    var_denominator=df.loc[0,denominator]
    control_numerator=df.loc[1,numerator]
    var_numerator=df.loc[0,numerator]

    # Mean value per observation.
    control_rate=control_numerator/control_denominator
    var_rate=var_numerator/var_denominator

    # Variance of each group via E[x^2] - (E[x])^2.
    control_var=df.loc[1,numerator_sq]/control_denominator-control_rate**2
    var_var=df.loc[0,numerator_sq]/var_denominator-var_rate**2

    # z score = difference of the means divided by the standard error of that
    # difference, sqrt(var_c/N_c + var_v/N_v).
    z_score=(control_rate-var_rate)/math.sqrt(control_var/control_denominator+var_var/var_denominator)

    # Upper-tail probability of |z| under the standard normal distribution.
    p_value=scipy.stats.norm.sf(abs(z_score))
    if two_sided:
        p_value=2*p_value

    # Lift of variation over control, relative and absolute.
    perc_lift=(var_rate-control_rate)/control_rate
    abs_lift=(var_rate-control_rate)

    return (p_value,perc_lift,abs_lift)

In [None]:
# Revenue z-test at the session level and at the customer level.
# dic_rev_final maps the level name to (p_value, perc_lift, abs_lift).
user_types=['SessionID','CusID']
dic_rev_final={}

# Revenue rows below the 95th-percentile cut-off of their own group. These
# do not depend on the analysis level, so they are prepared once.
control_trimmed=np.asarray(Control_Rev[Control_Rev<P1])
var_trimmed=np.asarray(Var_Rev[Var_Rev<P2])

for user_type in user_types:
    print(user_type)

    # Denominator: distinct ids of this level per test group.
    # NOTE(review): the ids are counted over all of df_Rev, while the revenue
    # sums below exclude rows at or above the cut-off - confirm this mismatch
    # is intended.
    df_Result = pd.DataFrame(df_Rev.drop_duplicates([user_type,'ControlGroup']).groupby('ControlGroup')[user_type].count())

    # Sum and sum of squares of the trimmed revenue (row 1: control, row 0: variation).
    df_Result.loc[1,'Rev']=control_trimmed.sum()
    df_Result.loc[0,'Rev']=var_trimmed.sum()
    df_Result.loc[1,'Rev_sq']=(control_trimmed**2).sum()
    df_Result.loc[0,'Rev_sq']=(var_trimmed**2).sum()

    dic_rev_final[user_type]=z_test_calculator_continuous(df_Result,user_type,'Rev','Rev_sq')

In [None]:
# Display the revenue test results per analysis level.
dic_rev_final

In [None]:
# Turn the result dict into a table with one row per analysis level and one
# named column per statistic.
statistic_names={0:'p_value',1:'perc_lift',2:'abs_lift'}
dic_rev_final=pd.DataFrame(dic_rev_final).T.rename(columns=statistic_names)
dic_rev_final

In [None]:
# Export the revenue results. The context manager saves and closes the
# workbook on exit (ExcelWriter.save() no longer exists in pandas 2.x).
with pd.ExcelWriter('3.3 Final_revenue_data_Test2.xlsx') as writer:
    dic_rev_final.to_excel(writer, sheet_name='revenue_data_Test2')

In [None]:
# Set-up for the per-cut revenue analysis.
# Working copy of the test-2 revenue data.
df_rev_data = df_Rev.copy()
# Result container; filled by the per-cut revenue loop.
dic_rev_final_cuts_usertype = {}
# Analysis levels and the columns whose values define the cuts
# (only columns that were selected into df_cusid are available here).
user_types = ['SessionID','CusID']
cuts = ['CategoryID','VisitorTypeID']

In [None]:
# Revenue z-test for every analysis level, every cut column and every value
# of that column. Results are stored in dic_rev_final_cuts_usertype under the
# key '<user_type>_<cut><value>' as (p_value, perc_lift, abs_lift).
for user_type in user_types:
    print(user_type)
    for cut in cuts:
        print(cut)

        # Missing values are dropped up front: NaN never matches the equality
        # filter below, so it could only produce an empty slice.
        for p in df_rev_data[cut].dropna().unique():

            df_Rev_cut = df_rev_data[df_rev_data[cut]==p]
            Control_Rev_cut=df_Rev_cut[df_Rev_cut['ControlGroup']==1]['TotalRevenue'].to_numpy()
            Var_Rev_cut=df_Rev_cut[df_Rev_cut['ControlGroup']==0]['TotalRevenue'].to_numpy()

            # Denominator: distinct ids of this level per test group.
            df_Result_any = pd.DataFrame(df_Rev_cut.drop_duplicates([user_type,'ControlGroup']).groupby('ControlGroup')[user_type].count())

            # Sum and sum of squares of revenue (row 1: control, row 0: variation).
            # NOTE(review): no percentile cut-off is applied here, unlike the
            # overall revenue test - confirm whether the cuts should be trimmed too.
            df_Result_any.loc[1,'Rev']=Control_Rev_cut.sum()
            df_Result_any.loc[0,'Rev']=Var_Rev_cut.sum()
            df_Result_any.loc[1,'Rev_sq']=(Control_Rev_cut**2).sum()
            df_Result_any.loc[0,'Rev_sq']=(Var_Rev_cut**2).sum()

            # The test is run on this cut's own table (df_Result_any).
            dic_rev_final_cuts_usertype[user_type+'_'+cut+str(p)]=z_test_calculator_continuous(df_Result_any,user_type,'Rev','Rev_sq')

In [None]:
# Display the per-cut revenue test results.
dic_rev_final_cuts_usertype

In [None]:
# Turn the per-cut result dict into a table with one row per key and one
# named column per statistic.
statistic_names={0:'p_value',1:'perc_lift',2:'abs_lift'}
dic_rev_final_cuts_usertype=pd.DataFrame(dic_rev_final_cuts_usertype).T.rename(columns=statistic_names)
dic_rev_final_cuts_usertype

In [None]:
# Export the per-cut revenue results. The context manager saves and closes
# the workbook on exit (ExcelWriter.save() no longer exists in pandas 2.x).
with pd.ExcelWriter('3.4 Final_revenue_cuts_data_Test2.xlsx') as writer:
    dic_rev_final_cuts_usertype.to_excel(writer, sheet_name='revenue_cuts_data_Test2')