This notebook contains simulations for the Beyond Basic A/B Testing paper.

The simulations are organized in two main sections: the zero-trimmed Mann-Whitney U-test (Type I error rate and power comparison) and regression adjustment (Type I error rate and power comparison).

## Zero-trimmed Mann-Whitney U-test

Data generating process:

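$y_{0i} = (1-D_i)\,y_{0i}'$, where $D_i \sim \textrm{Bernoulli}(p_0)$ is the zero indicator and $y_{0i}' \sim f(0, \sigma)$. Likewise $y_{1j} = (1-D_j)\,y_{1j}'$, where $D_j \sim \textrm{Bernoulli}(p_0 + p_{\Delta})$ and $y_{1j}' \sim f(\mu, \sigma)$, for $p_{\Delta} \leq 0$ and $\mu \geq 0$ (as enforced by the assertions in `generate_data`), so the treatment group has at least as many positive values as the control group. Here $f(\cdot)$ denotes a heavy-tailed distribution (either LogNormal or Positive Cauchy). In the code, the Positive Cauchy case draws $|C|$ for control and $|C + \mu|$ for treatment with $C$ standard Cauchy, so $\sigma$ is only used by the LogNormal case.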

In [1]:
import numpy as np
from scipy import stats
from itertools import product
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Test functions

In [2]:
# Test functions
def modified_wilcoxon(x, y):
    """
    Modified (zero-trimmed) Wilcoxon rank-sum test for zero-inflated data.

    Zeros are trimmed so that both samples keep the same proportion of
    observations: each sample retains round(n * p_hat) values, where p_hat is
    the larger of the two observed positive proportions. The rank-sum
    statistic of the trimmed samples is then referred to a normal
    distribution.

    Parameters:
    -----------
    x, y : array-like
        Non-empty samples with non-negative values

    Returns:
    --------
    float
        Two-sided p-value. When neither sample contains a positive value the
        statistic is 0/0, so 1.0 (no evidence of a difference) is returned
        instead of NaN.

    Raises:
    -------
    AssertionError
        If a sample is empty or contains a negative value
    """
    x, y = np.asarray(x), np.asarray(y)
    n0, n1 = len(x), len(y)

    # Validate inputs: values must be non-negative and samples non-empty
    assert np.all(x >= 0), "All values in x must be non-negative."
    assert np.all(y >= 0), "All values in y must be non-negative."
    assert n0 > 0 and n1 > 0, "Both input arrays must be non-empty."

    # Positive observations and their proportions
    x_nonzero, y_nonzero = x[x > 0], y[y > 0]
    n_plus0, n_plus1 = len(x_nonzero), len(y_nonzero)
    if n_plus0 == 0 and n_plus1 == 0:
        # Both samples are all zeros: W and its variance are both 0
        return 1.0
    p_hat0, p_hat1 = n_plus0 / n0, n_plus1 / n1
    p_hat = max(p_hat0, p_hat1)

    # Number of observations each sample keeps after trimming zeros
    n_prime_0, n_prime_1 = int(round(n0 * p_hat)), int(round(n1 * p_hat))

    # Pad the positive values with the zeros that survive the trimming
    x_trun = np.concatenate([np.zeros(n_prime_0 - n_plus0), x_nonzero])
    y_trun = np.concatenate([np.zeros(n_prime_1 - n_plus1), y_nonzero])

    # Rank-sum of y (descending ranks), centered at its null expectation
    combined = np.concatenate([y_trun, x_trun])
    descending_ranks = stats.rankdata(-combined, method='average')
    R1 = np.sum(descending_ranks[:len(y_trun)])
    W = R1 - len(y_trun) * (len(combined) + 1) / 2

    # Variance: one component from the estimated positive proportions,
    # one from the ranks of the positive values
    var_comp1 = (n1**2 * n0**2 / 4) * (p_hat**2) * (
        (p_hat0 * (1 - p_hat0) / n0) + (p_hat1 * (1 - p_hat1) / n1)
    )
    var_comp2 = (n_plus0 * n_plus1 * (n_plus0 + n_plus1)) / 12
    var_W = var_comp1 + var_comp2

    # Two-sided p-value from the normal approximation
    Z = W / np.sqrt(var_W)
    return 2 * stats.norm.cdf(-abs(Z))

def standard_wilcoxon(x, y, use_correction=False):
    """
    Two-sided Mann-Whitney U / Wilcoxon rank-sum test.

    With use_correction=True the p-value comes from scipy's mannwhitneyu
    (1.0 is returned, after printing a message, if scipy raises ValueError).
    Otherwise the p-value is a plain normal approximation whose variance
    n0 * n1 * (n0 + n1 + 1) / 12 carries no tie adjustment.
    """
    if not use_correction:
        sample_x, sample_y = np.asarray(x), np.asarray(y)
        size_x, size_y = len(sample_x), len(sample_y)

        # Descending average ranks of the pooled sample, y first
        pooled = np.concatenate([sample_y, sample_x])
        ranks_desc = stats.rankdata(-pooled, method='average')
        rank_sum_y = np.sum(ranks_desc[:size_y])

        # Center the rank sum at its null mean and scale by the null variance
        centered = rank_sum_y - size_y * (len(pooled) + 1) / 2
        null_variance = (size_y * size_x * (size_y + size_x + 1)) / 12
        z_score = centered / np.sqrt(null_variance)
        return 2 * stats.norm.cdf(-abs(z_score))

    try:
        # scipy's implementation has tie adjustment by default
        _, p_value = stats.mannwhitneyu(x, y, alternative='two-sided')
    except ValueError:
        print("Value Error in Standard Wilcoxon test")
        return 1.0
    return p_value


def t_test(x, y):
    """
    Welch's (unequal-variance) two-sample t-test.

    Returns the two-sided p-value from scipy's ttest_ind. If scipy raises an
    error for the given samples, 1.0 is returned so the caller counts the
    comparison as a non-rejection. Only Exception subclasses are caught, so
    KeyboardInterrupt and SystemExit still propagate and a long simulation
    can be interrupted.
    """
    try:
        _, p_value = stats.ttest_ind(x, y, equal_var=False)
    except Exception:
        return 1.0
    return p_value

### Simulation util functions

In [3]:
def plot_distributions(x, y, bins=30, figsize=(10, 6)):
    """
    Draw overlaid histograms (with KDE curves) of two samples using Seaborn.

    Parameters:
    -----------
    x, y : array-like
        The samples, labelled 'Group X' and 'Group Y' in the plot
    bins : int
        Number of histogram bins
    figsize : tuple
        Figure size (width, height) in inches
    """
    plt.figure(figsize=figsize)

    # Long-format frame: one row per observation, tagged with its group
    pooled_values = np.concatenate([x, y])
    group_labels = np.concatenate([
        np.repeat('Group X', len(x)),
        np.repeat('Group Y', len(y)),
    ])
    long_df = pd.DataFrame({'Value': pooled_values, 'Group': group_labels})

    sns.histplot(data=long_df, x='Value', hue='Group', bins=bins, kde=True, alpha=0.5)

    # Report the share of exact zeros in each sample in the title
    share_zero_x, share_zero_y = np.mean(x == 0), np.mean(y == 0)
    plt.title(f'Distribution Comparison\nZeros: Group X = {share_zero_x:.2f}, Group Y = {share_zero_y:.2f}')

    plt.tight_layout()
    plt.show()

# Data generation function
def generate_data(n, zero_prop_x, effect, dist_type, zero_prop_delta=0, sigma=5):
    """
    Generate two zero-inflated samples from a heavy-tailed distribution.

    Each observation is zero with its group's zero probability; otherwise it
    is drawn from the chosen distribution. The random draws are made in the
    order x mask, y mask, x values, y values.

    Parameters:
    -----------
    n : int
        Sample size for each group
    zero_prop_x : float
        Probability of a zero in the x group (between 0 and 1)
    effect : float
        Non-negative location shift for the y group: the log-scale mean for
        "lognormal", or the shift added before taking the absolute value for
        "positive_cauchy"
    dist_type : str
        Type of distribution ("lognormal" or "positive_cauchy")
    zero_prop_delta : float
        Non-positive change in zero probability for the y group;
        zero_prop_y = zero_prop_x + zero_prop_delta, clipped to [0, 1]
    sigma : float
        Scale parameter for the lognormal distribution (not used for
        "positive_cauchy")

    Returns:
    --------
    x, y : numpy arrays
        Generated data for both groups

    Raises:
    -------
    AssertionError
        If zero_prop_x, zero_prop_delta or effect is outside its allowed range
    ValueError
        If dist_type is not one of the two supported names
    """
    # Validate inputs, ensure the alternative is y > x
    assert 0 <= zero_prop_x <= 1, "zero_prop_x must be between 0 and 1"
    assert zero_prop_delta <= 0, "zero_prop_delta must be non-positive, so that y has more positives than x"
    assert effect >= 0, "effect must be non-negative, so that y is shifted positively from x"
    if dist_type not in ("lognormal", "positive_cauchy"):
        # A misspelled name previously fell through to the Cauchy branch silently
        raise ValueError(
            f"Unknown dist_type {dist_type!r}; expected 'lognormal' or 'positive_cauchy'"
        )

    # Calculate y group zero proportion, ensuring it stays between 0 and 1
    zero_prop_y = np.clip(zero_prop_x + zero_prop_delta, 0, 1)

    # Masks marking which observations receive a non-zero value
    x_mask = np.random.random(n) > zero_prop_x
    y_mask = np.random.random(n) > zero_prop_y
    n_pos_x, n_pos_y = int(np.sum(x_mask)), int(np.sum(y_mask))

    x, y = np.zeros(n), np.zeros(n)

    # A size-0 draw returns an empty array, so empty masks need no special case
    if dist_type == "lognormal":
        x[x_mask] = np.random.lognormal(0, sigma, n_pos_x)
        y[y_mask] = np.random.lognormal(effect, sigma, n_pos_y)
    else:  # positive cauchy
        x[x_mask] = np.abs(np.random.standard_cauchy(n_pos_x))
        y[y_mask] = np.abs(np.random.standard_cauchy(n_pos_y) + effect)

    return x, y
    

# Main simulation function
def run_simulation(n_iter=5000):
    """
    Estimate rejection rates of the three tests over the simulation grid.

    For every combination of distribution, sample size, zero-proportion delta
    and effect size, n_iter datasets are generated with generate_data and
    tested with modified_wilcoxon, standard_wilcoxon and t_test at level
    alpha = 0.05.

    Parameters:
    -----------
    n_iter : int
        Number of Monte Carlo replications per configuration (must be > 0)

    Returns:
    --------
    pandas.DataFrame
        One row per configuration holding the settings, a 'null_true' flag
        (no effect and no zero-proportion difference) and the rejection rate
        of each test

    Raises:
    -------
    ValueError
        If n_iter is not positive
    """
    if n_iter <= 0:
        raise ValueError("n_iter must be a positive integer")

    # Configuration parameters
    sample_sizes = [50, 200]
    alpha = 0.05
    distributions = ["lognormal", "positive_cauchy"]
    base_zero_prop = 0.8
    # Deltas must be <= 0: generate_data asserts that y has no more zeros than x
    zero_prop_deltas = [0.0, -0.1]
    base_effect_size = 0.0
    effect_size_deltas = [0.0, 0.05, 1.0, 2.0]  # Location shifts

    results = []

    grid = product(distributions, sample_sizes, zero_prop_deltas, effect_size_deltas)
    for dist, sample_size, zero_prop_delta, effect_size_delta in grid:
        print(f"Running: dist={dist}, sample_size={sample_size}, zero_prop_delta={zero_prop_delta}, effect_size_delta={effect_size_delta}")

        # Constant within a configuration, so computed once outside the hot loop
        effect_size = base_effect_size + effect_size_delta

        mod_rejects = 0
        std_rejects = 0
        t_rejects = 0

        for _ in range(n_iter):
            x, y = generate_data(sample_size, base_zero_prop, effect_size, dist, zero_prop_delta=zero_prop_delta, sigma=5)

            # Count rejections of each test at level alpha
            mod_rejects += (modified_wilcoxon(x, y) < alpha)
            std_rejects += (standard_wilcoxon(x, y) < alpha)
            t_rejects += (t_test(x, y) < alpha)

        # Record rejection rates for this configuration
        results.append({
            'sample_size': sample_size,
            'alpha': alpha,
            'distribution': dist,
            'positive_prop_x': 1-base_zero_prop,
            # 0.0 - delta instead of -delta so a zero delta is stored as 0.0, not -0.0
            'postive_prop_delta': 0.0 - zero_prop_delta,
            'effect_size': effect_size,
            'null_true': (effect_size == 0.0 and zero_prop_delta == 0.0),
            'modified_wilcoxon': mod_rejects / n_iter,
            'standard_wilcoxon': std_rejects / n_iter,
            't_test': t_rejects / n_iter
        })

    return pd.DataFrame(results)

### Run the Simulation

In [7]:
# Seed NumPy's global RNG before the run so the rejection rates are repeatable
np.random.seed(42)

# 1000 Monte Carlo replications per configuration
results = run_simulation(n_iter=1000)

# Null configurations estimate Type I error; all other configurations estimate power
null_mask = results['null_true'].astype(bool)
type1_error = results[null_mask]
power = results[~null_mask]


Running: dist=lognormal, sample_size=50, zero_prop_delta=0.0, effect_size_delta=0.0
Running: dist=lognormal, sample_size=50, zero_prop_delta=0.0, effect_size_delta=0.05
Running: dist=lognormal, sample_size=50, zero_prop_delta=0.0, effect_size_delta=1.0
Running: dist=lognormal, sample_size=50, zero_prop_delta=0.0, effect_size_delta=2.0
Running: dist=lognormal, sample_size=50, zero_prop_delta=-0.1, effect_size_delta=0.0
Running: dist=lognormal, sample_size=50, zero_prop_delta=-0.1, effect_size_delta=0.05
Running: dist=lognormal, sample_size=50, zero_prop_delta=-0.1, effect_size_delta=1.0
Running: dist=lognormal, sample_size=50, zero_prop_delta=-0.1, effect_size_delta=2.0
Running: dist=lognormal, sample_size=200, zero_prop_delta=0.0, effect_size_delta=0.0
Running: dist=lognormal, sample_size=200, zero_prop_delta=0.0, effect_size_delta=0.05
Running: dist=lognormal, sample_size=200, zero_prop_delta=0.0, effect_size_delta=1.0
Running: dist=lognormal, sample_size=200, zero_prop_delta=0.0, eff

In [8]:
# Type I error table: one row per null configuration, one column per test
type1_index = ['distribution', 'positive_prop_x', 'postive_prop_delta', 'sample_size']
test_columns = ['modified_wilcoxon', 'standard_wilcoxon', 't_test']

print("Type I Error Rates:")
type1_error.pivot_table(values=test_columns, index=type1_index, columns='alpha')


Type I Error Rates:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,modified_wilcoxon,standard_wilcoxon,t_test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,alpha,0.05,0.05,0.05
distribution,positive_prop_x,postive_prop_delta,sample_size,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
lognormal,0.2,-0.0,50,0.053,0.008,0.0
lognormal,0.2,-0.0,200,0.055,0.003,0.003
positive_cauchy,0.2,-0.0,50,0.062,0.004,0.018
positive_cauchy,0.2,-0.0,200,0.054,0.008,0.027


In [10]:
# Lift pandas' display limits so the whole power table is shown
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)

# Power table: one row per non-null configuration, one column per test
power_index = ['distribution', 'positive_prop_x', 'postive_prop_delta', 'sample_size', 'effect_size']
power_columns = ['modified_wilcoxon', 'standard_wilcoxon', 't_test']

print("\nPower Results:")
power.pivot_table(values=power_columns, index=power_index, columns='alpha')


Power Results:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,modified_wilcoxon,standard_wilcoxon,t_test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,alpha,0.05,0.05,0.05
distribution,positive_prop_x,postive_prop_delta,sample_size,effect_size,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
lognormal,0.2,-0.0,50,0.05,0.063,0.004,0.002
lognormal,0.2,-0.0,50,1.0,0.07,0.006,0.0
lognormal,0.2,-0.0,50,2.0,0.077,0.001,0.003
lognormal,0.2,-0.0,200,0.05,0.044,0.005,0.007
lognormal,0.2,-0.0,200,1.0,0.069,0.004,0.002
lognormal,0.2,-0.0,200,2.0,0.166,0.008,0.005
lognormal,0.2,0.1,50,0.0,0.213,0.075,0.002
lognormal,0.2,0.1,50,0.05,0.209,0.085,0.003
lognormal,0.2,0.1,50,1.0,0.269,0.074,0.003
lognormal,0.2,0.1,50,2.0,0.325,0.1,0.005


## Regression Adjustment

Data generating process:



$$ w_i \sim \mathcal{N}(0,1),$$

$$z_i|w_i \sim \textrm{Bernoulli}\left( \frac{1}{1+e^{-\gamma w_i}} \right),$$ 

$$y_i|z_i,w_i \sim \textrm{Poisson}\left( e^{2 + \beta_z z_i + \beta_w w_i}\right)$$


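Here $\gamma \geq 0$ controls the degree of confounding ($\gamma = 0$ implies no confounding). In the code below, $w_i$, $z_i$ and $y_i$ are the columns `X`, `T` and `Y`; $\gamma$ is `conf_strength`; and $\beta_z$, $\beta_w$ are `beta_t`, `beta_x`.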

In [None]:
# !pip install statsmodels


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings

# Suppress FutureWarnings for the rest of the session
# NOTE(review): presumably this hides the pandas warning triggered by the
# positional `pvalues[1]` lookups in the functions below - confirm
warnings.filterwarnings("ignore", category=FutureWarning)

# Seed NumPy's global random number generator for reproducibility
np.random.seed(42)

def generate_data(n_samples, beta_t=0, beta_x=1, conf_strength=0.5):
    """Simulate a Poisson outcome Y with binary treatment T and confounder X.

    X is standard normal, T is Bernoulli with a logistic probability in
    conf_strength * X, and Y is Poisson with mean
    exp(2 + beta_t * T + beta_x * X). Returns a DataFrame with columns
    'Y', 'T' and 'X'.
    """
    confounder = np.random.normal(0, 1, n_samples)

    # Treatment assignment depends on the confounder through a logistic link
    logit = conf_strength * confounder
    treat_prob = 1 / (1 + np.exp(-logit))
    treatment = np.random.binomial(1, treat_prob, n_samples)

    # Log-linear Poisson mean
    log_rate = 2 + beta_t * treatment + beta_x * confounder
    outcome = np.random.poisson(np.exp(log_rate), n_samples)

    return pd.DataFrame({'Y': outcome, 'T': treatment, 'X': confounder})

def run_simulation(conf_strength, n_samples=200, n_sims=5000, alpha=0.05):
    """Estimate Type I error rates of the raw and regression-adjusted tests.

    Each replication generates data under the null (beta_t = 0, beta_x = 0.05)
    and fits two Poisson GLMs: Y ~ T (raw) and Y ~ T + X (adjusted). A
    rejection is counted when the p-value of the T coefficient is below alpha.

    Parameters:
    - conf_strength: Strength of the dependence of T on X passed to generate_data
    - n_samples: Number of observations per simulated dataset
    - n_sims: Number of simulation runs
    - alpha: Significance level

    Returns:
    - (type1_unadj, type1_adj): rejection rates of the raw and adjusted tests
    """
    reject_unadj = 0
    reject_adj = 0

    for _ in range(n_sims):
        # Generate data under null hypothesis (beta_t = 0)
        data = generate_data(n_samples, beta_t=0, beta_x=0.05, conf_strength=conf_strength)

        # Unadjusted model (raw test)
        model_unadj = sm.GLM(data['Y'], sm.add_constant(data['T']), family=sm.families.Poisson())
        results_unadj = model_unadj.fit(disp=0)
        # Look the treatment p-value up by label: integer keys on a
        # label-indexed Series are a deprecated positional fallback in pandas
        if results_unadj.pvalues['T'] < alpha:
            reject_unadj += 1

        # Adjusted model (regression adjustment)
        model_adj = sm.GLM(data['Y'], sm.add_constant(pd.DataFrame({'T': data['T'], 'X': data['X']})),
                          family=sm.families.Poisson())
        results_adj = model_adj.fit(disp=0)
        if results_adj.pvalues['T'] < alpha:
            reject_adj += 1

    # Calculate Type I error rates
    type1_unadj = reject_unadj / n_sims
    type1_adj = reject_adj / n_sims

    return type1_unadj, type1_adj

# Grid of confounding strengths (gamma) to evaluate
conf_strengths = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

# One row per confounding strength: [gamma, raw rate, adjusted rate]
results = []
for conf in conf_strengths:
    print(f"Running simulation with confounding strength = {conf}")
    type1_unadj, type1_adj = run_simulation(conf)
    results += [[conf, type1_unadj, type1_adj]]

# Tabulate the rates and express the raw rate relative to the nominal 0.05 level
column_names = ['Confounding Strength', 'Type I Error (Raw)', 'Type I Error (RA)']
results_df = pd.DataFrame(results, columns=column_names)
results_df['Inflation Factor'] = results_df['Type I Error (Raw)'].div(0.05)

print("\nType I Error Comparison: Raw Test vs. Regression Adjustment")
print(results_df.to_string(index=False, float_format="{:.4f}".format))

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so the power simulation in this
# cell can be reproduced
np.random.seed(42)

def generate_data(n_samples, beta_t, beta_x=1, conf_strength=0):
    """
    Simulate a Poisson outcome Y with binary treatment T and predictor X

    Parameters:
    - n_samples: Number of observations
    - beta_t: True treatment effect on the log mean of Y
    - beta_x: Effect of X on the log mean of Y
    - conf_strength: Strength of confounding (0 = no confounding)

    Returns:
    - DataFrame with columns 'Y', 'T' and 'X'
    """
    # Standard normal predictor
    x_values = np.random.normal(0, 1, n_samples)

    # Logistic treatment probability; conf_strength = 0 gives probability 0.5 for everyone
    neg_logit = -(conf_strength * x_values)
    t_values = np.random.binomial(1, 1 / (1 + np.exp(neg_logit)), n_samples)

    # Poisson outcome with a log-linear mean
    poisson_mean = np.exp(2 + beta_t * t_values + beta_x * x_values)
    y_values = np.random.poisson(poisson_mean, n_samples)

    columns = {'Y': y_values, 'T': t_values, 'X': x_values}
    return pd.DataFrame(columns)

def calculate_power(n_samples, beta_t, beta_x=1, conf_strength=0, n_sims=1000, alpha=0.05):
    """
    Calculate power for both unadjusted and adjusted models

    Each replication generates a dataset with generate_data and fits two
    Poisson GLMs: Y ~ T (raw) and Y ~ T + X (regression adjustment). A
    rejection is counted when the p-value of the T coefficient is below alpha.

    Parameters:
    - n_samples: Number of observations
    - beta_t: True treatment effect (non-zero for power calculation)
    - beta_x: Effect of X on outcome
    - conf_strength: Strength of confounding
    - n_sims: Number of simulation runs
    - alpha: Significance level

    Returns:
    - (power_unadj, power_adj): rejection rates of the raw and adjusted tests
    """
    # Rejection counters
    reject_unadj = 0
    reject_adj = 0

    for _ in range(n_sims):
        # Generate data with specified confounding and treatment effect
        data = generate_data(n_samples, beta_t=beta_t, beta_x=beta_x, conf_strength=conf_strength)

        # Unadjusted model (raw test)
        model_unadj = sm.GLM(data['Y'], sm.add_constant(data['T']), family=sm.families.Poisson())
        results_unadj = model_unadj.fit(disp=0)
        # Look the treatment p-value up by label: integer keys on a
        # label-indexed Series are a deprecated positional fallback in pandas
        if results_unadj.pvalues['T'] < alpha:
            reject_unadj += 1

        # Adjusted model (regression adjustment)
        model_adj = sm.GLM(data['Y'],
                          sm.add_constant(pd.DataFrame({'T': data['T'], 'X': data['X']})),
                          family=sm.families.Poisson())
        results_adj = model_adj.fit(disp=0)
        if results_adj.pvalues['T'] < alpha:
            reject_adj += 1

    # Calculate power
    power_unadj = reject_unadj / n_sims
    power_adj = reject_adj / n_sims

    return power_unadj, power_adj

# Simulation grid for the power study
n_samples = 200
n_sims = 1000
effect_sizes = [0.1 + 0.01 * step for step in range(11)]
conf_strengths = [0.0, 0.1]  # No confounding and moderate confounding

# Estimate power for every (confounding strength, effect size) pair
results = []
for conf in conf_strengths:
    if conf == 0.0:
        conf_label = "No Confounding"
    else:
        conf_label = "With Confounding"
    print(f"\nRunning simulations for {conf_label} (conf_strength = {conf})")

    for effect in effect_sizes:
        print(f"  Effect size: {effect}")
        power_unadj, power_adj = calculate_power(
            n_samples=n_samples, beta_t=effect, beta_x=1.0,
            conf_strength=conf, n_sims=n_sims,
        )

        # Percent gain of RA over the raw test; infinite when the raw test never rejects
        if power_unadj > 0:
            relative_gain = ((power_adj / power_unadj) - 1) * 100
        else:
            relative_gain = float('inf')

        # Keep every detail of this configuration in one record
        record = {
            'Confounding': conf_label,
            'Confounding Strength': conf,
            'Treatment Effect': effect,
            'Power (Raw)': power_unadj,
            'Power (RA)': power_adj,
            'Power Difference': power_adj - power_unadj,
            'Relative Improvement': relative_gain,
        }
        results.append(record)

# Collect all simulation records into one table
results_df = pd.DataFrame(results)

# Header for the printed report
print("\nPower Comparison: Raw Test vs. Regression Adjustment")
print(f"Sample Size: {n_samples}, Simulations: {n_sims}")

# Print one sub-table per confounding scenario
report_columns = ['Treatment Effect', 'Power (Raw)', 'Power (RA)', 'Power Difference']
for conf_label in ("No Confounding", "With Confounding"):
    subset = results_df.loc[results_df['Confounding'] == conf_label]
    gamma_value = subset['Confounding Strength'].iloc[0]
    print(f"\n{conf_label} (γ = {gamma_value}):")

    formatted_subset = subset[report_columns]
    print(formatted_subset.to_string(index=False, float_format="{:.4f}".format))

# LaTeX table format for the comprehensive results
print("\nLaTeX Table Format:")
latex_table = """
\\begin{table}[htbp]
\\centering
\\begin{tabular}{cccccc}
\\hline
Confounding ($\\gamma$) & Treatment Effect ($\\beta_T$) & Power (Raw) & Power (RA) & Difference & Improvement (\\%) \\\\
\\hline"""

for scenario_idx, conf in enumerate(conf_strengths):
    subset = results_df[results_df['Confounding Strength'] == conf]
    for _, row in subset.iterrows():
        # Two decimals for the treatment effect: the effect grid moves in
        # steps of 0.01, so one decimal would print most rows as the same value
        latex_table += (
            f"\n{row['Confounding Strength']} & {row['Treatment Effect']:.2f}"
            f" & {row['Power (Raw)']:.4f} & {row['Power (RA)']:.4f}"
            f" & {row['Power Difference']:.4f} & {row['Relative Improvement']:.1f} \\\\"
        )

    # Add a separator between confounding scenarios (not after the last one,
    # which is closed by the final \hline below)
    if scenario_idx < len(conf_strengths) - 1:
        latex_table += "\n\\hline"

latex_table += """
\\hline
\\end{tabular}
\\caption{Power Comparison: Raw Test vs. Regression Adjustment}
\\label{tab:power}
\\end{table}
"""

print(latex_table)

# Side-by-side power curves: left panel without confounding, right panel with
plt.figure(figsize=(12, 8))

no_conf = results_df[results_df['Confounding'] == "No Confounding"]
with_conf = results_df[results_df['Confounding'] == "With Confounding"]

panels = [
    (no_conf, 'Power Comparison: No Confounding (γ = 0)'),
    (with_conf, f'Power Comparison: With Confounding (γ = {conf_strengths[1]})'),
]

for panel_position, (panel_data, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_position)
    # Blue: raw test, red: regression adjustment
    plt.plot(panel_data['Treatment Effect'], panel_data['Power (Raw)'], 'bo-', label='Raw Test')
    plt.plot(panel_data['Treatment Effect'], panel_data['Power (RA)'], 'ro-', label='Regression Adjustment')
    plt.xlabel('Treatment Effect Size')
    plt.ylabel('Power')
    plt.title(panel_title)
    plt.grid(True, alpha=0.3)
    plt.legend()

# Save the combined figure to disk, then display it
plt.tight_layout()
plt.savefig('power_comparison_combined.png')
plt.show()