<a href="https://colab.research.google.com/github/josephcabezas/ecom-a-b-testing/blob/main/ecom_a_b_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the e-commerce A/B-testing dataset from Kaggle and keep the local
# directory it was extracted to in `path`.
# NOTE(review): the handle carries no version, so the latest published
# version is requested; pin a version if results must be reproducible.
DATASET_HANDLE = "ahmedmohameddawoud/ecommerce-ab-testing"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ahmedmohameddawoud/ecommerce-ab-testing?dataset_version_number=1...


100%|██████████| 3.29M/3.29M [00:00<00:00, 112MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/ahmedmohameddawoud/ecommerce-ab-testing/versions/1





In [2]:
# Third-party analysis and plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed Python's built-in RNG so `random`-based steps repeat across runs.
# NOTE(review): this does not seed NumPy's generator; confirm whether any
# later cell draws from np.random and needs its own seed.
import random
random.seed(42)

# Silence every warning category from this point on.
# NOTE(review): a blanket "ignore" also hides deprecation and
# chained-assignment warnings; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [4]:
import os

# os.listdir returns entries in arbitrary, filesystem-dependent order; sort
# them so the displayed listing is identical on every run and machine.
dataset_files = sorted(os.listdir(path))
dataset_files

['ab_test.csv', 'countries_ab.csv']

In [5]:
# Read ab_test.csv from the downloaded dataset directory and preview the
# first five rows.
csv_file = os.path.join(path, 'ab_test.csv')
df = pd.read_csv(filepath_or_buffer=csv_file)
df.head(5)

Unnamed: 0,id,time,con_treat,page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [6]:
# Rename the raw columns to descriptive names using an explicit mapping.
# Assigning a positional list to `df.columns` would silently mislabel the
# data if the CSV's column order ever changed; a mapping renames by name and
# is a no-op when this cell is re-run.
COLUMN_RENAMES = {
    'id': 'user_id',
    'time': 'timestamp',
    'con_treat': 'group',
    'page': 'landing_page',
}
df = df.rename(columns=COLUMN_RENAMES)

# Fail loudly if the file did not contain the columns the analysis relies on.
missing_columns = (
    {'user_id', 'timestamp', 'group', 'landing_page', 'converted'} - set(df.columns)
)
assert not missing_columns, f'missing expected columns: {sorted(missing_columns)}'

df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [7]:
# Compare the total row count with the number of distinct users; a gap means
# some user_ids appear on more than one row.
n_rows = len(df)
n_unique_users = df['user_id'].nunique()
print(f'Number of rows: {n_rows}')
print(f'Number of unique users: {n_unique_users}')

Number of rows: 294478
Number of unique users: 290584


In [8]:
# Print each column's dtype and non-null count, plus the frame's memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [9]:
# Count missing values per column: isna() flags each missing cell and sum()
# totals the flags column by column.

df.isna().sum()

Unnamed: 0,0
user_id,0
timestamp,0
group,0
landing_page,0
converted,0


In [10]:
# Compare how many rows belong to the treatment group with how many rows were
# served new_page; the two counts are expected to match.
# Note: this is a net difference, so mismatches in opposite directions cancel
# each other out and a small value can hide a larger number of bad rows.
is_treatment = df['group'].eq('treatment')
is_new_page = df['landing_page'].eq('new_page')

n_treat = int(is_treatment.sum())
n_new_page = int(is_new_page.sum())
difference = n_treat - n_new_page

pd.DataFrame(
    {
        'N treatment': [n_treat],
        'N new_page': [n_new_page],
        'Difference': [difference],
    }
)

Unnamed: 0,N treatment,N new_page,Difference
0,147276,147239,37


In [11]:
# The counts disagree, so look at the rows where the treatment group was
# served the old page.
df.loc[df['group'].eq('treatment') & df['landing_page'].eq('old_page')]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
308,857184,34:59.8,treatment,old_page,0
327,686623,26:40.7,treatment,old_page,0
357,856078,29:30.4,treatment,old_page,0
685,666385,11:54.8,treatment,old_page,0
713,748761,47:44.4,treatment,old_page,0
...,...,...,...,...,...
293773,688144,34:50.5,treatment,old_page,1
293817,876037,15:09.0,treatment,old_page,1
293917,738357,37:55.7,treatment,old_page,0
294014,813406,25:33.2,treatment,old_page,0


In [18]:
# Collect every row whose group and landing page disagree, in either direction.
treatment_on_old = (df['group'] == 'treatment') & (df['landing_page'] == 'old_page')
control_on_new = (df['group'] == 'control') & (df['landing_page'] == 'new_page')
df_mismatch = df[treatment_on_old | control_on_new]

n_mismatch = len(df_mismatch)

# Share of the full dataset, as a percentage rounded to two decimals.
percent_mismatch = round(n_mismatch / len(df) * 100, 2)
print(f'Number of mismatched rows: {n_mismatch}')
print(f'Percentage of mismatched rows: {percent_mismatch}')

Number of mismatched rows: 3893
Percentage of mismatched rows: 1.32


In [19]:
# Go ahead and remove the mismatched rows; 1.32% is a negligible amount.
# Only rows with a consistent pairing are kept:
# treatment -> new_page and control -> old_page.
treatment_on_new = (df['group'] == 'treatment') & (df['landing_page'] == 'new_page')
control_on_old = (df['group'] == 'control') & (df['landing_page'] == 'old_page')
df2 = df[treatment_on_new | control_on_old]

len(df2)

290585

In [20]:
# Preview the first rows of the filtered frame.
df2.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [21]:
# Re-run the mismatch check on the filtered frame; both figures should now
# be zero.
df_mismatch = df2[
    ((df2['group'] == 'treatment') & (df2['landing_page'] == 'old_page'))
    | ((df2['group'] == 'control') & (df2['landing_page'] == 'new_page'))
]

n_mismatch = df_mismatch.shape[0]

# Divide by len(df2), the frame actually being checked. Dividing by len(df),
# the unfiltered frame, would understate any non-zero share. The guard avoids
# a ZeroDivisionError if the filtered frame is empty.
percent_mismatch = round(n_mismatch / len(df2) * 100, 2) if len(df2) else 0.0
print(f'Number of mismatched rows: {n_mismatch}')
print(f'Percentage of mismatched rows: {percent_mismatch}')

Number of mismatched rows: 0
Percentage of mismatched rows: 0.0


In [22]:
# Count surplus rows: total rows minus distinct user_ids. Zero means every
# user appears exactly once.
n_rows_filtered = df2.shape[0]
n_users_filtered = df2['user_id'].nunique()
n_rows_filtered - n_users_filtered

1

In [24]:
# Keep the first row seen for each user_id and discard later repeats.
df2 = df2.drop_duplicates(subset=['user_id'], keep='first')

# Verify: rows minus distinct users should now be 0.
df2.shape[0] - df2['user_id'].nunique()

0