In [5]:
# Husayn El Sharif
comment = """
Project: AB Testing and XGBoost Predictive Analytics

Scenario: CDC Outreach for Vaccine Booster Scheduling (Synthetic experiment)

A public health team ran a randomized experiment to improve booster appointment scheduling among eligible adults.

Variant A (Control): standard reminder message
Variant B (Treatment): personalized message with social proof + a friction-reducing link

Primary outcome (A/B test):
scheduled_7d = whether the person scheduled an appointment within 7 days

Secondary outcome:
completed_30d = whether they completed within 30 days (harder outcome; correlated with scheduling)

Predictive analytics goal (XGBoost):
Predict scheduled_7d (or completed_30d) using demographics + risk + history + engagement variables + message variant.

Use python environment: datasci_xgb_skl_env001
"""

In [36]:
# Imports
import sys
sys.path.append("..") # Add parent directory to path for imports

import os

import numpy as np
import pandas as pd

# function for generating synthetic dataset
from src.ab_test_data_generator import generate_cdc_ab_dataset


# for making figures
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Generate synthetic "CDC" style outreach dataset
# see src/ab_test_data_generator.py for details on data generation process

n = 20000

df = generate_cdc_ab_dataset(n=n, seed=42)
df.head()

Unnamed: 0,person_id,age,sex,region,risk_score,barriers_index,channel,weekday,send_hour,prior_cdc_interactions_90d,prior_appointments_1y,missed_appointments_1y,message_variant,opened,clicked,scheduled_7d,completed_30d
0,1,24,M,ATL-Core,0.198,1.255,Email,3,9,0,2,1,A,1,1,0,0
1,2,70,M,ATL-Metro,0.412,0.012,SMS,2,14,1,3,0,B,0,1,0,0
2,3,62,F,South-GA,0.513,0.778,SMS,6,20,1,2,2,A,0,0,0,0
3,4,47,F,ATL-Core,0.373,0.103,SMS,5,14,1,1,0,A,0,1,1,1
4,5,47,F,ATL-Core,0.327,0.942,SMS,0,20,1,0,0,B,1,0,0,0


In [4]:
# export generated dataset to CSV
df.to_csv("data/cdc_outreach_ab_synthetic.csv", index=False)
print("\nSaved: data/cdc_outreach_ab_synthetic.csv")


Saved: data/cdc_outreach_ab_synthetic.csv


In [None]:
# Analyze treatment (message_variant) effect on scheduling rate
ab_df = (
    df.groupby("message_variant")
      .agg(scheduled_7d=("scheduled_7d", "sum"),
           total=("scheduled_7d", "count"))
      .reset_index()
)

# calculate scheduling rate
ab_df["scheduling_rate"] = ab_df["scheduled_7d"] / ab_df["total"]

# display results
ab_df

Unnamed: 0,message_variant,scheduled_7d,total,scheduling_rate
0,A,2636,10021,0.263048
1,B,3065,9979,0.307145


In [17]:
# calcuate observed lift
control_rate = ab_df.loc[ab_df["message_variant"] == "A", "scheduling_rate"].values[0]
treatment_rate = ab_df.loc[ab_df["message_variant"] == "B", "scheduling_rate"].values[0]

nA = ab_df.loc[ab_df["message_variant"] == "A", "total"].values[0]
nB = ab_df.loc[ab_df["message_variant"] == "B", "total"].values[0]

delta = treatment_rate - control_rate
lift = (treatment_rate - control_rate) / control_rate

print(f"\nControl rate: {control_rate:.2%} (n={nA})")
print(f"Treatment rate: {treatment_rate:.2%} (n={nB})")
print(f"Absolute lift: {delta:.2%}")
print(f"\nRelative lift: {lift:.2%}")


Control rate: 26.30% (n=10021)
Treatment rate: 30.71% (n=9979)
Absolute lift: 4.41%

Relative lift: 16.76%


In [33]:
# Create permutation to generate null distribution of lift under no effect
n_permutations = 10000
perm_deltas = []
perm_lifts = []

for i in range(n_permutations):
    # shuffle message_variant labels
    shuffled = df["message_variant"].sample(frac=1, replace=False).reset_index(drop=True)
    
    # calculate scheduling rates for shuffled data
    shuffled_df = df.copy()
    shuffled_df["message_variant"] = shuffled
    
    perm_ab_df = (
        shuffled_df.groupby("message_variant")
                   .agg(scheduled_7d=("scheduled_7d", "sum"),
                        total=("scheduled_7d", "count"))
                   .reset_index()
    )
    
    perm_ab_df["scheduling_rate"] = perm_ab_df["scheduled_7d"] / perm_ab_df["total"]
    
    # calculate lift for this permutation
    perm_control_rate = perm_ab_df.loc[perm_ab_df["message_variant"] == "A", "scheduling_rate"].values[0]
    perm_treatment_rate = perm_ab_df.loc[perm_ab_df["message_variant"] == "B", "scheduling_rate"].values[0]
    
    perm_delta = perm_treatment_rate - perm_control_rate
    perm_lift = (perm_treatment_rate - perm_control_rate) / perm_control_rate
    
    perm_deltas.append(perm_delta)
    perm_lifts.append(perm_lift)

# convert to numpy array for easier calculations
perm_lifts = np.array(perm_lifts)

In [34]:
# calculate the one sided p-value for observed lift
p_value = np.sum(perm_lifts >= lift)/len(perm_lifts)
p_value

np.float64(0.0)

In [None]:
comment = """
Because the permuted lifts are all around 0 (no effect), 
the observed lift of 16.8 percent is highly significant (p-value close to 0). 
This suggests the treatment message had a strong positive effect on scheduling rates.
"""

In [38]:
# Create histogram of permuted lifts
fig = px.histogram(
    perm_lifts,
    nbins=60,
    title="Permutation Test: Null Distribution of Relative Lift (B vs A)",
    labels={"value": "Relative Lift (Treatment vs Control)"},
    opacity=0.8
)

# Add vertical line for observed lift
fig.add_vline(
    x=lift,
    line_width=3,
    line_dash="dash",
    annotation_text=f"Observed lift = {lift:.2%}",
    annotation_position="top right"
)

# Optional: Add zero reference line (null hypothesis)
fig.add_vline(
    x=0,
    line_width=2,
    line_dash="dot",
    annotation_text="Null (no effect)",
    annotation_position="top left"
)

fig.update_layout(
    bargap=0.05,
    xaxis_title="Relative Lift",
    yaxis_title="Count",
    template="plotly_white",
    showlegend=False,
)

fig.show()