In [2]:
# ==========================================
# 🚀 FULL AUTO ADVANCED AB TESTING RUNNER
# ==========================================

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests

# =====================================================
# ADVANCED AB TESTING CLASS
# =====================================================

class AdvancedABTesting:
    """Two-sample A/B test helpers with effect sizes and Holm correction.

    Each ``*_test`` method returns ``(pvalue, result_dict)`` so callers can
    collect the raw p-values for a later multiple-testing correction while
    still having a printable summary per metric.
    """

    def __init__(self, alpha=0.05):
        """Store the significance level used by every test and correction."""
        self.alpha = alpha

    @staticmethod
    def _drop_missing(values):
        """Return *values* as a NumPy array with NaN/None/NA entries removed."""
        arr = np.asarray(values)
        return arr[~pd.isna(arr)]

    def interpret_cramers_v(self, v):
        """Map a Cramér's V value to a qualitative effect-size label.

        Thresholds: < 0.1 "Negligible", < 0.3 "Small", < 0.5 "Medium",
        otherwise "Large".
        """
        if v < 0.1:
            return "Negligible"
        elif v < 0.3:
            return "Small"
        elif v < 0.5:
            return "Medium"
        else:
            return "Large"

    def chi_square_test(self, control, treatment, metric_name):
        """Chi-square test of independence between group and outcome.

        Args:
            control: Outcome values for the control group (array-like).
            treatment: Outcome values for the treatment group (array-like).
            metric_name: Label copied into the result under ``"metric"``.

        Returns:
            ``(pvalue, result)`` where ``result`` holds the metric name, test
            name, p-value, significance flag at ``self.alpha``, Cramér's V
            and its qualitative interpretation.

        Raises:
            ValueError: If both groups are empty after removing missing
                values.
        """
        # Drop missing outcomes up front. pd.crosstab silently discards NaN
        # keys, so counting them in ``n`` would deflate Cramér's V.
        control = self._drop_missing(control)
        treatment = self._drop_missing(treatment)

        combined = np.concatenate([control, treatment])
        labels = np.concatenate([np.zeros(len(control)), np.ones(len(treatment))])

        if combined.size == 0:
            raise ValueError(
                f"No non-missing observations for metric {metric_name!r}"
            )

        contingency_table = pd.crosstab(combined, labels)

        # NOTE: chi2_contingency is called with its defaults, so SciPy's
        # continuity correction for 1-dof tables applies; V below is derived
        # from that (possibly corrected) statistic.
        chi2, pvalue, dof, _ = chi2_contingency(contingency_table)

        # ``n`` now equals the number of observations actually tabulated.
        n = len(combined)
        min_dim = min(contingency_table.shape) - 1
        # A table with a single row or column carries no association signal.
        cramers_v = float(np.sqrt(chi2 / (n * min_dim))) if min_dim > 0 else 0.0

        return pvalue, {
            "metric": metric_name,
            "test": "Chi-Square",
            "pvalue": pvalue,
            "significant": pvalue < self.alpha,
            "cramers_v": cramers_v,
            "effect_size": self.interpret_cramers_v(cramers_v),
        }

    def mann_whitney_test(self, control, treatment, metric_name):
        """Two-sided Mann-Whitney U test comparing treatment against control.

        Args:
            control: Numeric values for the control group (array-like).
            treatment: Numeric values for the treatment group (array-like).
            metric_name: Label copied into the result under ``"metric"``.

        Returns:
            ``(pvalue, result)`` where ``result`` holds the metric name, test
            name, p-value, significance flag at ``self.alpha`` and the median
            of each group (computed after removing missing values).
        """
        # Coerce to float arrays so lists and integer inputs are accepted.
        control = self._drop_missing(control).astype(float)
        treatment = self._drop_missing(treatment).astype(float)

        # Be explicit about the alternative: the default has differed between
        # SciPy releases, and an A/B test here has no assumed direction.
        stat, pvalue = stats.mannwhitneyu(
            treatment, control, alternative="two-sided"
        )

        return pvalue, {
            "metric": metric_name,
            "test": "Mann-Whitney U",
            "pvalue": pvalue,
            "significant": pvalue < self.alpha,
            "control_median": np.median(control),
            "treatment_median": np.median(treatment),
        }

    def multiple_testing(self, pvals):
        """Apply Holm's step-down correction to a sequence of p-values.

        Args:
            pvals: Raw p-values, one per hypothesis.

        Returns:
            ``(reject, corrected)`` as returned by
            ``multipletests(..., method="holm")``: the per-hypothesis
            rejection flags at ``self.alpha`` and the adjusted p-values, in
            the same order as *pvals*.
        """
        reject, corrected, _, _ = multipletests(
            pvals, alpha=self.alpha, method="holm"
        )

        return reject, corrected


# =====================================================
# CONFIG
# =====================================================

# Directory prefix prepended (by plain string concatenation) to each CSV
# file name below, so it must end with a path separator.
BASE_PATH = "../raw/"

# Experiment display name -> (CSV file name, metric column to analyse).
# Each CSV is also expected to contain a "variant" column identifying the arm.
EXPERIMENTS = {
    "Menu Design": ("test1_menu.csv", "added_to_cart"),
    "Novelty Slider": ("test2_novelty_slider.csv", "products_added_from_novelties"),
    "Product Sliders": ("test3_product_sliders.csv", "add_to_cart_rate"),
    "Reviews": ("test4_reviews.csv", "converted"),
    "Search Engine": ("test5_search_engine.csv", "converted"),
}

# Shared tester; alpha is used both per test and for the Holm correction.
tester = AdvancedABTesting(alpha=0.05)

# Per-experiment result dicts and raw p-values, appended in EXPERIMENTS order
# so they can be matched back to experiment names by position.
all_results = []
all_pvalues = []

# =====================================================
# RUN ALL EXPERIMENTS
# =====================================================

# Run one test per configured experiment, collecting the result dict and raw
# p-value of each for the multiple-testing correction that follows.
for exp_name, (file, metric) in EXPERIMENTS.items():

    print("\n==============================")
    print(f"Running: {exp_name}")
    print("==============================")

    df = pd.read_csv(BASE_PATH + file)

    # Ignore missing variant labels: NaN never compares equal to itself, so a
    # NaN "variant" would select zero rows and produce an empty group.
    variants = df["variant"].dropna().unique()
    if len(variants) < 2:
        raise ValueError(
            f"{exp_name}: expected two variants in {file!r}, "
            f"found {len(variants)}"
        )
    if len(variants) > 2:
        # Only a pairwise comparison is implemented; make the omission visible
        # instead of silently dropping the remaining arms.
        print(
            f"WARNING: {len(variants)} variants found; comparing only "
            f"{variants[0]!r} vs {variants[1]!r}"
        )

    # NOTE(review): "control" is simply the first label encountered in the
    # file, "treatment" the second — confirm this matches the data layout.
    control = df[df["variant"] == variants[0]][metric].values
    treatment = df[df["variant"] == variants[1]][metric].values

    observed_values = df[metric].dropna().unique()
    if len(observed_values) == 0:
        # An empty set is a subset of {0, 1}; without this guard an all-missing
        # metric would be routed to the chi-square test and fail obscurely.
        raise ValueError(f"{exp_name}: metric {metric!r} has no observed values")

    # Auto detect binary vs continuous
    if set(observed_values) <= {0, 1}:
        pval, result = tester.chi_square_test(control, treatment, metric)
    else:
        pval, result = tester.mann_whitney_test(control, treatment, metric)

    all_results.append(result)
    all_pvalues.append(pval)

    for k, v in result.items():
        print(f"{k}: {v}")

# =====================================================
# MULTIPLE TESTING CORRECTION
# =====================================================

# Report raw vs. Holm-adjusted p-values for every experiment.
print("\n\n==============================")
print("MULTIPLE TESTING CORRECTION (Holm)")
print("==============================")

reject, corrected = tester.multiple_testing(all_pvalues)

# all_pvalues, corrected and reject are all in EXPERIMENTS insertion order,
# so walk them in lock-step rather than indexing by position.
report_rows = zip(EXPERIMENTS, all_pvalues, corrected, reject)
for exp_name, raw_p, adjusted_p, still_significant in report_rows:
    print(f"{exp_name}")
    print(f"Original p-value: {raw_p:.6f}")
    print(f"Corrected p-value: {adjusted_p:.6f}")
    print(f"Significant after correction: {still_significant}")
    print("----------------------------------")



Running: Menu Design
metric: added_to_cart
test: Chi-Square
pvalue: 1.5602004541879748e-48
significant: True
cramers_v: 0.17498171934780538
effect_size: Small

Running: Novelty Slider
metric: products_added_from_novelties
test: Chi-Square
pvalue: 1.4184381444847933e-05
significant: True
cramers_v: 0.03431851922902354
effect_size: Negligible

Running: Product Sliders
metric: add_to_cart_rate
test: Chi-Square
pvalue: 1.0
significant: False
cramers_v: 0.0
effect_size: Negligible

Running: Reviews
metric: converted
test: Chi-Square
pvalue: 0.7885083661897574
significant: False
cramers_v: 0.0013089166166280867
effect_size: Negligible

Running: Search Engine
metric: converted
test: Chi-Square
pvalue: 0.38678302164976075
significant: False
cramers_v: 0.006278750915845323
effect_size: Negligible


MULTIPLE TESTING CORRECTION (Holm)
Menu Design
Original p-value: 0.000000
Corrected p-value: 0.000000
Significant after correction: True
----------------------------------
Novelty Slider
Original p-