### A/B Testing
##### Source: Scaler
##### https://www.scaler.com/topics/data-science/a-b-testing-in-python/
##### data source: https://www.kaggle.com/datasets/zhangluyuan/ab-testing?resource=download

In [2]:
!python3 -m pip install --upgrade pip
!pip install statsmodels



In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import seaborn as sns
from math import ceil

In [4]:
# Load dataset
df = pd.read_csv('ab_data.csv')
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [14]:
df.info(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


(None,
              user_id      converted
 count  294478.000000  294478.000000
 mean   787974.124733       0.119659
 std     91210.823776       0.324563
 min    630000.000000       0.000000
 25%    709032.250000       0.000000
 50%    787933.500000       0.000000
 75%    866911.750000       0.000000
 max    945999.000000       1.000000)

In [5]:
# Users with multiple session counts
session_counts = df['user_id'].value_counts(ascending = False)
multi_users = session_counts[session_counts > 1].count()

In [6]:
session_counts.head()

user_id
805339    2
754884    2
722274    2
783176    2
898232    2
Name: count, dtype: int64

In [8]:
multi_users

3894

In [15]:
# dropping with multiple session counts
users_to_drop = session_counts[session_counts > 1].index

df = df[~ df['user_id'].isin(users_to_drop)]

In [24]:
users_to_drop, len(df)

(Index([805339, 754884, 722274, 783176, 898232, 899018, 904570, 842042, 881123,
        858720,
        ...
        721188, 902463, 713355, 827417, 655283, 847058, 889392, 664150, 835401,
        736955],
       dtype='int64', name='user_id', length=3894),
 286690)

In [25]:
# Sample control and treatment group
control_sample = df[df['group'] == 'control'].sample(n = 5000, random_state = 12)
treatment_sample = df[df['group'] == 'treatment'].sample(n = 5000, random_state = 12)
ab_test = pd.concat([control_sample, treatment_sample], axis=0)
ab_test.reset_index(drop=True, inplace=True)

In [46]:
ab_test.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,722876,2017-01-03 16:24:40.954412,control,old_page,0
1,630561,2017-01-16 12:06:13.597159,control,old_page,0
2,704591,2017-01-21 06:42:13.850395,control,old_page,0
3,651659,2017-01-23 00:08:59.556011,control,old_page,0
4,733154,2017-01-04 11:05:36.544602,control,old_page,0


In [40]:
# np.random.seed(1)
# pd.DataFrame(range(10)).sample(n=3)

In [41]:
# pd.DataFrame(range(10)).sample(n=3, random_state=12)

In [53]:
# Define functions for standard deviation and standard error
std_dev = lambda x : np.std(x, ddof = 0)
std_error = lambda x : stats.sem(x, ddof = 0)
conversion_rate = ab_test.groupby('group')['converted'].agg([np.mean, std_dev, std_error])
conversion_rate.columns = ['conversion_rate', 'std_deviation', 'std_error']
conversion_rate

  conversion_rate = ab_test.groupby('group')['converted'].agg([np.mean, std_dev, std_error])


Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,0.1144,0.318296,0.004501
treatment,0.1218,0.327055,0.004625


In [55]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [56]:
control_results = ab_test[ab_test['group'] == 'control']['converted']
treatment_results = ab_test[ab_test['group'] == 'treatment']['converted']

In [58]:
num_control = control_results.count()
num_treatment = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [num_control, num_treatment]

In [60]:
num_control, num_treatment

(5000, 5000)

In [61]:
successes

[572, 609]

In [62]:
nobs

[5000, 5000]

In [63]:
z_stat, pval = proportions_ztest(successes, nobs = nobs)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)

print(f'Z Statistic - {z_stat:.2f}')
print(f'P-Value - {pval:.3f}')
print(f'CI 95% for control group - [{lower_con:.3f}, {upper_con:.3f}]')
print(f'CI 95% for treatment group - [{lower_treat:.3f}, {upper_treat:.3f}]')

Z Statistic - -1.15
P-Value - 0.252
CI 95% for control group - [0.106, 0.123]
CI 95% for treatment group - [0.113, 0.131]


In [67]:
# As you can see, p-value for the A/B testing results is 0.252. If we set the significance level at 0.05, 
# then we can’t reject the NULL hypothesis at this p-value. It means that observed results have occurred due to random chance and
#  are not statistically significant to say that the landing page of the treatment group works better.

# Conclusion
# A/B testing compares two versions of a product, website, app, or marketing campaign to determine which one performs better. 
# It is useful in a situation when you want to test the effectiveness of incremental changes in a product, website, or app.
# Post conducting A/B testing, we need to prove that the results are statistically significant. It means that the observed results are not due to some random chance.