# 1. Import Libraries

In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.stats.api as sms
from scipy.stats import ttest_1samp, shapiro, levene, ttest_ind, mannwhitneyu, pearsonr, spearmanr, kendalltau, \
    f_oneway, kruskal
from statsmodels.stats.proportion import proportions_ztest
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# any that SciPy or pandas may raise about the statistical tests used later on.
warnings.filterwarnings('ignore')
import os
# List every file found under the Kaggle input directory, one full path per line.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_inputs:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 2. Data Preprocessing

In [3]:
# Location of the raw A/B-test export. The path can be overridden through the
# GROCERY_AB_DATA environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "GROCERY_AB_DATA",
    "/Users/hayaayu/Users/hayaayu/Career Thingy/Portfolio/AB-testing-grocery/grocerywebsiteabtestdata.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,RecordID,LoggedInFlag,ServerID,VisitPageFlag
count,184588.0,184588.0,184588.0,184588.0
mean,92294.5,0.503657,1.999691,0.058184
std,53286.110082,0.499988,0.816719,0.234091
min,1.0,0.0,1.0,0.0
25%,46147.75,0.0,1.0,0.0
50%,92294.5,1.0,2.0,0.0
75%,138441.25,1.0,3.0,0.0
max,184588.0,1.0,3.0,1.0


In [5]:
# Check the data shape: (number of rows, number of columns)

df.shape

(184588, 5)

## 2.1 Reducing Visit Value

In [6]:
# Count the rows whose IP Address already appeared in an earlier row
# (i.e. repeat records coming from the same IP address).
df['IP Address'].duplicated().sum()

85072

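The same IP address can appear in more than one record (85,072 of the 184,588 rows repeat an IP address that was already seen). So that repeat visits do not skew the results, I aggregate the records and reduce the visit value to 1 for users with multiple visits.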

In [7]:
# Total the visit flags for every distinct (IP Address, LoggedInFlag, ServerID) combination.
# NOTE(review): an IP address that occurs with more than one LoggedInFlag/ServerID
# combination keeps one row per combination, so IPs are not guaranteed to be unique afterwards.
group_keys = ["IP Address", "LoggedInFlag", "ServerID"]
df = df.groupby(group_keys)["VisitPageFlag"].sum()

In [8]:
# Turn the grouped Series back into a flat DataFrame, storing the totals in "VisitPageFlagSum".
df = df.rename("VisitPageFlagSum").reset_index()
df.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageFlagSum
0,0.0.108.2,0,1,0
1,0.0.109.6,1,1,0
2,0.0.111.8,0,3,0
3,0.0.160.9,1,2,0
4,0.0.163.1,0,2,0


In [9]:
# Collapse the visit total to a binary indicator: 1 when the total is non-zero, otherwise 0.
has_visit = df["VisitPageFlagSum"].ne(0)
df["VisitPageFlag"] = has_visit.astype("int64")
df.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageFlagSum,VisitPageFlag
0,0.0.108.2,0,1,0,0
1,0.0.109.6,1,1,0,0
2,0.0.111.8,0,3,0,0
3,0.0.160.9,1,2,0,0
4,0.0.163.1,0,2,0,0


## 2.2 Splitting Treatment and Control Group

Then I need to split the dataset into Treatment and Control groups. I will perform this operation with the help of ServerID. I set ServerID 1 as Treatment group and ServerID 2 and 3 as Control group.

In [10]:
# Label each row with its experiment arm (ServerID 1 -> Treatment, ServerID 2 and 3 -> Control),
# then discard the helper columns that are no longer needed.
server_to_group = {1: 'Treatment', 2: 'Control', 3: 'Control'}
df = (
    df.assign(group=df['ServerID'].map(server_to_group))
      .drop(columns=['ServerID', 'VisitPageFlagSum'])
)

In [11]:
# Preview the frame after the group labels were added and the helper columns dropped.
df.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.108.2,0,0,Treatment
1,0.0.109.6,1,0,Treatment
2,0.0.111.8,0,0,Control
3,0.0.160.9,1,0,Control
4,0.0.163.1,0,0,Control


In [12]:
# Control dataset: the rows labelled 'Control', re-indexed from 0
is_control = df['group'] == 'Control'
df_control = df.loc[is_control].reset_index(drop=True)

In [14]:
# Treatment (test) dataset: the rows labelled 'Treatment', re-indexed from 0
is_treatment = df['group'] == 'Treatment'
df_treatment = df.loc[is_treatment].reset_index(drop=True)

In [15]:
# Preview the first rows of the control subset.
df_control.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.111.8,0,0,Control
1,0.0.160.9,1,0,Control
2,0.0.163.1,0,0,Control
3,0.0.178.9,1,0,Control
4,0.0.185.4,1,0,Control


In [30]:
# Preview the first rows of the treatment subset.
df_treatment.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.108.2,0,0,Treatment
1,0.0.109.6,1,0,Treatment
2,0.0.169.1,1,0,Treatment
3,0.0.181.9,0,1,Treatment
4,0.0.195.5,1,0,Treatment


In [18]:
# Calculate important metrics for each arm of the experiment
in_treatment = df['group'] == "Treatment"
in_control = df['group'] == "Control"
visited = df['VisitPageFlag'] == 1

n_t = df.loc[in_treatment, 'IP Address'].count()  # number of people in treatment group
n_c = df.loc[in_control, 'IP Address'].count()  # number of people in control group
converted_t = df.loc[in_treatment & visited, 'IP Address'].count()  # number of clicks in treatment group
converted_c = df.loc[in_control & visited, 'IP Address'].count()  # number of clicks in control group

t_r = converted_t / n_t  # click ratio for treatment
c_r = converted_c / n_c  # click ratio for control

print(f"""number of people in treatment group={n_t}
number of people in control group={n_c}
number of clicks in treatment group={converted_t}
number of clicks in control group={converted_c}
click ratio for treatment={t_r}
click ratio for control={c_r}
""")

number of people in treatment group=33303
number of people in control group=66460
number of clicks in treatment group=3847
number of clicks in control group=6131
click ratio for treatment=0.11551511875806984
click ratio for control=0.09225097803189888



# 3. Checking for Normality

In order to choose the appropriate statistical test, it is necessary to first assess the normality of the data.

- **Parametric tests** are based on the assumption that the data is normally distributed, i.e. roughly symmetrical and bell-shaped. If the data is not normally distributed, the results of a parametric test may not be accurate. Examples: the two-sample z-test and the two-sample t-test.

- **Nonparametric tests** do not assume that the data follows a particular distribution, so they can be used even if the data is not normally distributed. However, nonparametric tests are generally less powerful than parametric tests when the parametric assumptions hold. Example: the Mann-Whitney U test.

$H_0$: The data is normally distributed.

$H_1$: The data is not normally distributed.

In [31]:
# Per-row visit flags (0/1) for each experiment arm
visit_flags = df['VisitPageFlag']
control_group = visit_flags[df['group'] == "Control"]
treatment_group = visit_flags[df['group'] == "Treatment"]

In [22]:
# Shapiro-Wilk normality check for the treatment group
# NOTE(review): VisitPageFlag only takes the values 0 and 1, and SciPy documents that the
# Shapiro-Wilk p-value may be inaccurate for samples larger than 5000 - interpret with care.
shapiro_treatment = stats.shapiro(treatment_group)
statistic_treatment, pvalue_treatmen = shapiro_treatment.statistic, shapiro_treatment.pvalue

print(f"""statistic_treatment=={statistic_treatment:.4f}
pvalue_treatmen={pvalue_treatmen:.4f}""")

statistic_treatment==0.3711
pvalue_treatmen=0.0000


In [23]:
# Shapiro-Wilk normality check for the control group
# NOTE(review): VisitPageFlag only takes the values 0 and 1, and SciPy documents that the
# Shapiro-Wilk p-value may be inaccurate for samples larger than 5000 - interpret with care.
shapiro_control = stats.shapiro(control_group)
statistic_control, pvalue_control = shapiro_control.statistic, shapiro_control.pvalue

print(f"""statistic_control=={statistic_control:.4f}
\npvalue_control={pvalue_control:.4f}""")

statistic_control==0.3266

pvalue_control=0.0000


In [24]:
# Reject normality if EITHER group's Shapiro-Wilk p-value is below the significance level.
# Each p-value must be compared explicitly: `a or b < 0.05` parses as `a or (b < 0.05)`,
# which tests the truthiness of `a` instead of comparing it with the threshold.
alpha = 0.05
if pvalue_treatmen < alpha or pvalue_control < alpha:
    print("Data is not normally distributed")
else:
    print("Data is normally distributed")

Data is not normally distributed


Because the data is not normally distributed, a **non-parametric test** will be used.

# 4. Statistical Testing for Click Rates (Mann-Whitney U test)

- $H_0$: There is no difference in click rates between the two groups.
- $H_1$: There is a difference in click rates between the two groups.

In [25]:
# Mann-Whitney U test comparing the visit flags of the treatment and control groups
mwu_result = stats.mannwhitneyu(treatment_group, control_group)
statistic, p_value = mwu_result.statistic, mwu_result.pvalue
print(f"""statistic=={statistic}
p_value=={p_value}""")

# Decide at the 5% significance level and report the outcome
verdict = "\nWe can reject the null hypothesis." if p_value < 0.05 else '\nwe can not reject null hypothesis'
print(verdict)


statistic==1132404153.5
p_value==7.41454867554153e-31

We can reject the null hypothesis.


In other words, the difference in click rates between the two groups is statistically significant.

# 5. Conclusion & Recommendation

**Conclusion**

We can reject the null hypothesis that there is no difference in click rate between the two groups. The results of the Mann-Whitney U test show that there is a statistically significant difference in click rates between the two groups, with the treatment group having a higher click rate (about 11.6%) than the control group (about 9.2%).

**Recommendation**

Based on the results of the A/B test, the treatment group shows a statistically significant lift in click rate of about 2.3 percentage points. I recommend that the client continue running the A/B test for a longer period of time to see if the effect size increases. If the effect size does not increase, then the client may want to consider keeping the current homepage design or making other changes to the website to see if they can improve the click rate.