In [2]:
# Import statsmodel for statistical calculations and 
# TTestIndPower class to calculate the parameters.
import statsmodels.stats.api as sms
from statsmodels.stats.power import TTestIndPower

# Parameters for the power analysis.
alpha = 0.05   # significance level
power = 0.80   # desired statistical power
# The old page has a 50% conversion rate and we hope for an increase of
# 5 percentage points, to 55%. proportion_effectsize() converts the two
# proportions into a standardised effect size.
effect = sms.proportion_effectsize(0.50, 0.55)

# Power-analysis solver for two independent samples.
analysis = TTestIndPower()

# Solve for the sample size: nobs1 is the unknown, so it is passed as None.
# ratio=1.0 makes the two groups (A and B) the same size.
result = analysis.solve_power(
    effect_size=effect,
    nobs1=None,
    alpha=alpha,
    power=power,
    ratio=1.0,
)

# Print the output. It is the minimum number of observations per group for
# the chosen significance level and power, so it must be rounded UP to a
# whole number: 1565.490 -> 1566 per group.
print('Sample Size: %.3f' % result)

Sample Size: 1565.490


In [4]:
# Import necessary libraries, packages and classes.
import pandas as pd
import math
import numpy as np
import statsmodels.stats.api as sms
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt

In [22]:
# Read the CSV file (bike_shop.csv) into a DataFrame.
df = pd.read_csv('bike_shop.csv')

# View the first rows of the DataFrame.
df.head()

Unnamed: 0,RecordID,IPAddress,LoyaltyPage,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [23]:
# Check the metadata: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184588 entries, 0 to 184587
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RecordID       184588 non-null  int64 
 1   IPAddress      184588 non-null  object
 2   LoyaltyPage    184588 non-null  int64 
 3   ServerID       184588 non-null  int64 
 4   VisitPageFlag  184588 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ MB


In [24]:
# Check the dimensions of the DataFrame as (rows, columns).
df.shape

(184588, 5)

In [25]:
# Check for duplicates.
# duplicated() flags every row whose IPAddress already appeared in an
# earlier row (the first occurrence itself is not flagged), i.e. addresses
# that visited more than once. Print the flagged rows.
repeat_visit_mask = df['IPAddress'].duplicated()
print(df[repeat_visit_mask])

        RecordID     IPAddress  LoyaltyPage  ServerID  VisitPageFlag
275          276    191.4.97.7            0         2              0
394          395     79.9.70.7            1         3              0
703          704    175.1.81.8            1         3              0
809          810    125.0.30.9            1         2              0
889          890  207.14.157.6            1         3              0
...          ...           ...          ...       ...            ...
184582    184583    90.4.224.4            0         3              0
184583    184584   114.8.104.1            0         1              0
184585    184586   170.13.31.9            0         2              0
184586    184587   195.14.92.3            0         3              0
184587    184588  172.12.115.8            0         2              1

[85072 rows x 5 columns]


In [48]:
# Remove every row whose IPAddress occurs more than once. keep=False drops
# ALL occurrences of a repeated address, not only the later ones.
df = df.drop_duplicates(subset='IPAddress', keep=False)

# Drop the RecordID and VisitPageFlag columns; neither is used in the
# analysis that follows.
df_final = df.drop(columns=['RecordID', 'VisitPageFlag'])

# Summarise the result: dimensions, first rows and metadata.
print(df_final.shape)
print(df_final.head())
df_final.info()

(39608, 3)
       IPAddress  LoyaltyPage  ServerID
7     97.6.126.6            0         3
12   188.13.62.2            0         3
14   234.1.239.1            0         2
15  167.15.157.7            0         2
16  123.12.229.8            0         1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39608 entries, 7 to 184584
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IPAddress    39608 non-null  object
 1   LoyaltyPage  39608 non-null  int64 
 2   ServerID     39608 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ MB


In [52]:
# Remove the columns that are not needed for the A/B test.
# RecordID and VisitPageFlag are dropped column-wise; drop() returns a new
# DataFrame, so df itself is left unchanged.
df_final = df.drop(columns=['RecordID', 'VisitPageFlag'])

# Check the dimensions of the resulting DataFrame. (Only the last expression
# of a cell is displayed, so a bare head() call before it would show nothing.)
df_final.shape

(39608, 3)

In [53]:
# Label each row by its ServerID: server 1 is the treatment group, servers
# 2 and 3 together form the control group. Any other ServerID value would
# map to NaN.
server_to_group = {1: 'Treatment', 2: 'Control', 3: 'Control'}
df_final['Group'] = df_final['ServerID'].map(server_to_group)

# Check the dimensions and the first rows, including the new Group column.
print(df_final.shape)
df_final.head()

(39608, 4)


Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
7,97.6.126.6,0,3,Control
12,188.13.62.2,0,3,Control
14,234.1.239.1,0,2,Control
15,167.15.157.7,0,2,Control
16,123.12.229.8,0,1,Treatment


In [54]:
# Count the number of rows in each group.
df_final['Group'].value_counts()

Control      26310
Treatment    13298
Name: Group, dtype: int64

In [55]:
# Draw a simple random sample of n = 1566 rows from each group
# (presumably the power-analysis result of 1565.490 rounded up).
# random_state=42 is an arbitrary seed that makes the draw repeatable.

# Split the data into the two groups to sample from.
control_pool = df_final[df_final['Group'] == 'Control']
treatment_pool = df_final[df_final['Group'] == 'Treatment']

# Simple random sample for the control group.
control_sample = control_pool.sample(n=1566, random_state=42)

# Simple random sample for the treatment group.
treatment_sample = treatment_pool.sample(n=1566, random_state=42)

In [56]:
# Stack the control sample on top of the treatment sample. ignore_index=True
# discards the original row labels and numbers the rows 0..n-1.
ab_test = pd.concat(
    [control_sample, treatment_sample],
    axis=0,
    ignore_index=True,
)

# Display the combined sample table.
ab_test

Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
0,25.16.126.2,1,3,Control
1,106.13.67.3,1,3,Control
2,169.11.137.7,0,2,Control
3,164.9.86.8,1,2,Control
4,112.12.25.7,0,2,Control
...,...,...,...,...
3127,187.4.117.9,1,1,Treatment
3128,134.0.112.5,1,1,Treatment
3129,7.3.242.7,0,1,Treatment
3130,118.14.226.4,0,1,Treatment


In [57]:
# Calculate basic statistics per group.
# sem computes the standard error of the mean.
from scipy.stats import sem


def STD_p(x):
    """Standard deviation of the proportion (ddof=0)."""
    return np.std(x, ddof=0)


def SE_p(x):
    """Standard error of the proportion (ddof=0)."""
    return sem(x, ddof=0)


# Group the ab_test sample by Group and select the LoyaltyPage outcome.
loyalty_by_group = ab_test.groupby('Group')['LoyaltyPage']

# Named aggregation: each keyword becomes an output column, so the columns
# do not have to be renamed afterwards. The mean of the LoyaltyPage column
# is the conversion rate of the group.
conversion_rates = loyalty_by_group.agg(
    conversion_rate='mean',
    std_deviation=STD_p,
    std_error=SE_p,
)

# Round the output to 3 decimal places.
# 53.2% conversions in the control group
# 48.3% conversions in the treatment group
# Is the difference statistically significant?
conversion_rates.style.format('{:.3f}')

Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,0.532,0.499,0.013
Treatment,0.483,0.5,0.013


In [58]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# Pull out the LoyaltyPage outcomes of each group.
group_labels = ab_test['Group']
control_results = ab_test.loc[group_labels == 'Control', 'LoyaltyPage']
treatment_results = ab_test.loc[group_labels == 'Treatment', 'LoyaltyPage']

# Number of (non-null) observations in each group.
n_con = control_results.count()
n_treat = treatment_results.count()

# 'successes' holds the sum of the LoyaltyPage column per group and 'nobs'
# the matching observation counts; both lists are ordered control first,
# treatment second.
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

# Test statistic and p-value of the z-test on the two proportions.
z_stat, pval = proportions_ztest(successes, nobs=nobs)

# proportion_confint returns the lower bounds followed by the upper bounds;
# each is ordered like 'successes' (control, treatment). alpha=0.05 gives
# 95% intervals.
ci_lower, ci_upper = proportion_confint(successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = ci_lower
upper_con, upper_treat = ci_upper

# Print the outputs (with lead-in text). The .3f sets 3 decimal places.
print(f"Z test stat: {z_stat:.3f}")
print(f"P-value: {pval:.3f}")
print(f"Confidence Interval of 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]")
print(f"Confidence Interval of 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]")

Z test stat: 2.716
P-value: 0.007
Confidence Interval of 95% for control group: [0.507, 0.557]
Confidence Interval of 95% for treatment group: [0.459, 0.508]


# 7. Summarise results and explain your answers

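The change to the homepage decreased the click-through rate to the login page: 48.3% in the treatment group versus 53.2% in the control group, a drop of about 5 percentage points.
The p-value (0.007) is smaller than the alpha value of 0.05, so we reject the null hypothesis $H_0$ that both groups have the same conversion rate; the decrease is statistically significant.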