In [None]:
#####################################################
# AB TESTING PROJECT
#####################################################

In [None]:
#####################################################
# DATASET
#####################################################
# A COMPANY DECIDED TO FOLLOW A NEW ADVERTISEMENT STRATEGY IN ORDER TO INCREASE
# THE NUMBER OF CLICKS TO PURCHASE ITEMS ON THE WEB PAGE FOLLOWING THE NEW DESIGNED WEB ADVERTS

# THE AIM OF THE PROJECT IS TO DETERMINE IF THERE HAS BEEN A STATISTICALLY SIGNIFICANT DIFFERENCE ON PURCHASES
# FOLLOWING THE NEW ADVERTISEMENT STRATEGY

# Dataset is  ab_testing Excel file consist of 2 Excel sheets having both test and control groups

# each Excel sheet has 4 variables as below:

# impression: view count on adverts
# Click: Click count on adverts
# Purchase: Purchase count on items following the advert
# Earning: Revenues after the purchases


In [1]:
###############################################################
# TASK 1: DATA PREPROCESSING and INITIAL ANALYSIS
###############################################################

import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import shapiro, levene, ttest_ind


pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

dataframe_control = pd.read_excel("ab_testing.xlsx" , sheet_name="Control Group")
dataframe_test = pd.read_excel("ab_testing.xlsx" , sheet_name="Test Group")

df_control = dataframe_control.copy()
df_test = dataframe_test.copy()

In [2]:
# INITIAL ANALYSIS


def check_df(dataframe, head=5):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head())
    print("##################### Tail #####################")
    print(dataframe.tail())
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

check_df(df_control)

##################### Shape #####################
(40, 4)
##################### Types #####################
Impression    float64
Click         float64
Purchase      float64
Earning       float64
dtype: object
##################### Head #####################
    Impression      Click  Purchase    Earning
0  82529.45927 6090.07732 665.21125 2311.27714
1  98050.45193 3382.86179 315.08489 1742.80686
2  82696.02355 4167.96575 458.08374 1797.82745
3 109914.40040 4910.88224 487.09077 1696.22918
4 108457.76263 5987.65581 441.03405 1543.72018
##################### Tail #####################
     Impression      Click  Purchase    Earning
35 132064.21900 3747.15754 551.07241 2256.97559
36  86409.94180 4608.25621 345.04603 1781.35769
37 123678.93423 3649.07379 476.16813 2187.72122
38 101997.49410 4736.35337 474.61354 2254.56383
39 121085.88122 4285.17861 590.40602 1289.30895
##################### NA #####################
Impression    0
Click         0
Purchase      0
Earning       0
dtype: int6

In [3]:
check_df(df_test)

##################### Shape #####################
(40, 4)
##################### Types #####################
Impression    float64
Click         float64
Purchase      float64
Earning       float64
dtype: object
##################### Head #####################
    Impression      Click  Purchase    Earning
0 120103.50380 3216.54796 702.16035 1939.61124
1 134775.94336 3635.08242 834.05429 2929.40582
2 107806.62079 3057.14356 422.93426 2526.24488
3 116445.27553 4650.47391 429.03353 2281.42857
4 145082.51684 5201.38772 749.86044 2781.69752
##################### Tail #####################
     Impression      Click  Purchase    Earning
35  79234.91193 6002.21358 382.04712 2277.86398
36 130702.23941 3626.32007 449.82459 2530.84133
37 116481.87337 4702.78247 472.45373 2597.91763
38  79033.83492 4495.42818 425.35910 2595.85788
39 102257.45409 4800.06832 521.31073 2967.51839
##################### NA #####################
Impression    0
Click         0
Purchase      0
Earning       0
dtype: int6

In [7]:
# Merging control and test groups
df_control["group"] = "control"
df_test["group"] = "test"

df = pd.concat([df_control,df_test], axis=0,ignore_index=False)
df.head()

Unnamed: 0,Impression,Click,Purchase,Earning,group
0,82529.45927,6090.07732,665.21125,2311.27714,control
1,98050.45193,3382.86179,315.08489,1742.80686,control
2,82696.02355,4167.96575,458.08374,1797.82745,control
3,109914.4004,4910.88224,487.09077,1696.22918,control
4,108457.76263,5987.65581,441.03405,1543.72018,control


In [18]:
df.tail()

Unnamed: 0,Impression,Click,Purchase,Earning,group
35,79234.91193,6002.21358,382.04712,2277.86398,test
36,130702.23941,3626.32007,449.82459,2530.84133,test
37,116481.87337,4702.78247,472.45373,2597.91763,test
38,79033.83492,4495.42818,425.3591,2595.85788,test
39,102257.45409,4800.06832,521.31073,2967.51839,test


In [14]:
#####################################################
# TASK2:  Create a  A/B Testing Hypothesis
#####################################################
# H0 : M1 = M2 (No significant difference.)
# H1 : M1!= M2


# Control mean of purchases for both control and test groups
df.groupby("group")[["Purchase"]].mean()

Unnamed: 0_level_0,Purchase
group,Unnamed: 1_level_1
control,550.89406
test,582.1061


In [15]:
#####################################################
# TASK 3: hypothesis testing
#####################################################


################NORMALITY TEST#######################
# Run normality test to determine if "purchase" variable well-modeled by a normal distribution

# Null hypothesis      (H0): Data follows a normal distribution
# Alternate Hypothesis (H1): Data does not follow a normal distribution
test_stat, pvalue = shapiro(df.loc[df["group"] == "control", "Purchase"])
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.9773, p-value = 0.5891


In [None]:
# "purchase" variable follows a normal distribution
# H0 is consistent with the data

In [16]:
#############  Testing Homogeneity of the Variances###########

# Test Homogeneity of the Variances to determine if "test" and "control" groups have the same or similar variance

# Null Hypothesis       (H0): The variances of the two groups are equal
# Alternative Hypothesis(H1): The variances are different

test_stat, pvalue = levene(df.loc[df["group"] == "control", "Purchase"],
                           df.loc[df["group"] == "test", "Purchase"])
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))


Test Stat = 2.6393, p-value = 0.1083


In [None]:
# "test" and "control" groups have the same or similar variance
# H0 is consistent with the data


#  RESULT: data is normally distributed and the variance across groups are homogeneous.
#  Parametric (T-TEST) will be used

In [17]:
######################### T-TEST##########################

# H0 : M1 = M2 (No significant difference.)
# H1 : M1!= M2

test_stat, pvalue = ttest_ind(df.loc[df["group"] == "control", "Purchase"],
                              df.loc[df["group"] == "test", "Purchase"],
                              equal_var=True)

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = -0.9416, p-value = 0.3493


In [None]:
###################FINAL COMMENTS#########################

In [None]:
# H0 is consistent with the data
# No statistically significant difference between "test" and "control" groups