In [1]:
import pandas as pd
# Location of the raw A/B test export; kept in one named constant so it is easy to retarget.
DATA_PATH = "/content/ab_data.csv"

# Load the raw experiment log and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [2]:
# Show the column labels of the loaded frame.
df.columns

Index(['user_id', 'timestamp', 'group', 'landing_page', 'converted'], dtype='object')

In [3]:
# Frame dimensions together with the count of missing values in each column.
missing_per_column = df.isna().sum()
df.shape, missing_per_column

((294478, 5),
 user_id         0
 timestamp       0
 group           0
 landing_page    0
 converted       0
 dtype: int64)

In [4]:
# Per-group conversion summary on the raw (unfiltered) data.
# `users` is the number of non-null `converted` rows in the group (a row count,
# not a distinct-user count); `conversions` is their sum; `conversion_rate` their mean.
converted_by_group = df.groupby("group")["converted"]
summary = converted_by_group.agg(
    users="count",
    conversions="sum",
    conversion_rate="mean",
)
summary

Unnamed: 0_level_0,users,conversions,conversion_rate
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,147202,17723,0.120399
treatment,147276,17514,0.11892


In [5]:
# Count rows for every (group, landing_page) combination in the raw data.
pd.crosstab(index=df["group"], columns=df["landing_page"])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


In [6]:
# Keep only rows whose assigned group agrees with the page recorded for them:
# control -> old_page, treatment -> new_page. Every other combination is dropped.
control_on_old = df["group"].eq("control") & df["landing_page"].eq("old_page")
treatment_on_new = df["group"].eq("treatment") & df["landing_page"].eq("new_page")

# .copy() detaches the result from `df` so later edits do not touch the raw frame.
# NOTE(review): this filter does not check that user_id is unique — confirm that no
# user appears more than once before treating each row as an independent user.
df_clean = df.loc[control_on_old | treatment_on_new].copy()

df_clean.shape

(290585, 5)

In [7]:
# Per-group conversion summary on the filtered data (`df_clean`).
# `users` counts non-null `converted` rows, `conversions` sums them,
# and `conversion_rate` is their mean.
clean_converted_by_group = df_clean.groupby("group")["converted"]
summary_clean = clean_converted_by_group.agg(
    users="count",
    conversions="sum",
    conversion_rate="mean",
)
summary_clean

Unnamed: 0_level_0,users,conversions,conversion_rate
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,145274,17489,0.120386
treatment,145311,17264,0.118807


In [8]:
# Sanity check on the filtered frame: count rows per (group, landing_page) combination.
pd.crosstab(index=df_clean["group"], columns=df_clean["landing_page"])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0,145274
treatment,145311,0


In [10]:
import numpy as np
from math import sqrt
from scipy.stats import norm

# Two-proportion z-test comparing treatment vs. control conversion rates,
# using the per-group counts in `summary_clean`.

# Two-sided significance level; the confidence interval below has level 1 - ALPHA.
ALPHA = 0.05

# counts (n_* = rows in the group, x_* = converted rows in the group)
n_c = summary_clean.loc["control", "users"]
x_c = summary_clean.loc["control", "conversions"]
n_t = summary_clean.loc["treatment", "users"]
x_t = summary_clean.loc["treatment", "conversions"]

# observed conversion rates and their difference
p_c = x_c / n_c
p_t = x_t / n_t
diff = p_t - p_c  # treatment - control

# pooled proportion for z-test (H0: p_t = p_c)
p_pool = (x_t + x_c) / (n_t + n_c)
se_pool = sqrt(p_pool * (1 - p_pool) * (1/n_t + 1/n_c))
z = diff / se_pool
# Use the survival function rather than 1 - cdf: for large |z| the subtraction
# cancels to exactly 0.0, whereas sf keeps the tail probability accurate.
p_value = 2 * norm.sf(abs(z))  # two-sided

# (1 - ALPHA) CI using unpooled SE (common reporting choice).
# The critical value comes from the normal quantile instead of a hard-coded 1.96,
# so the interval always matches ALPHA.
z_crit = norm.ppf(1 - ALPHA / 2)
se_unpooled = sqrt(p_t * (1 - p_t) / n_t + p_c * (1 - p_c) / n_c)
ci_low = diff - z_crit * se_unpooled
ci_high = diff + z_crit * se_unpooled

print(f"Control conversion rate:   {p_c:.6f}")
print(f"Treatment conversion rate: {p_t:.6f}")
print(f"Difference (T - C):        {diff:.6f}")
print(f"z-statistic:               {z:.3f}")
print(f"p-value (two-sided):       {p_value:.4f}")
# The label is derived from ALPHA so it cannot drift from the computed interval.
print(f"{1 - ALPHA:.0%} CI for (T - C):        [{ci_low:.6f}, {ci_high:.6f}]")

Control conversion rate:   0.120386
Treatment conversion rate: 0.118807
Difference (T - C):        -0.001579
z-statistic:               -1.312
p-value (two-sided):       0.1897
95% CI for (T - C):        [-0.003939, 0.000781]
