In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import random
from scipy.stats import chisquare

In [None]:
# Generate synthetic data for the sample-ratio-mismatch (SRM) simulation.
# NOTE: the order of the np.random calls below determines the drawn values
# under the fixed seed, so keep it unchanged to reproduce the same data.
n = 1000
counties = ["Manhattan", "Brooklyn", "Queens", "Bronx"]
user_status = ["Dasher", "Customer", "Restaurant"]
day_time = ["Morning", "Afternoon", "Night"]
np.random.seed(42)  # fixed seed so a fresh run draws the same data
# Probabilities are aligned position-by-position with experiment_groups.
expected_distribution = [0.5, 0.5]
# 1 = treatment, 0 = control (matches the control/treatment split used later).
experiment_groups = [1, 0]

data = pd.DataFrame({
    "user_id": list(range(1, n + 1)),
    "county": np.random.choice(counties, size=n),
    "day_time": np.random.choice(day_time, size=n),
    "user_status": np.random.choice(user_status, size=n),
    "number_of_orders": np.random.randint(1, 101, size=n),
})

# Experiment 1: no imbalance -- assignment follows the expected 50/50 split.
data["experiment_1"] = np.random.choice(
    experiment_groups, size=n, p=expected_distribution
)

# Experiment 2: create imbalance -- rows in Manhattan, or from Dashers, or at
# Night are re-drawn with a 77/23 split; all other rows keep experiment_1.
# The mask is deliberately NOT named `filter`, which would shadow the builtin
# for the rest of the kernel session.
data["experiment_2"] = data["experiment_1"].copy()
imbalanced_rows = (
    (data["county"] == "Manhattan")
    | (data["user_status"] == "Dasher")
    | (data["day_time"] == "Night")
)
data.loc[imbalanced_rows, "experiment_2"] = np.random.choice(
    experiment_groups, size=int(imbalanced_rows.sum()), p=[0.77, 0.23]
)

# Experiment 3: only imbalance for user_status -- Customer rows are re-drawn
# with a 67/33 split; all other rows keep experiment_1.
data["experiment_3"] = data["experiment_1"].copy()
customer_rows = data["user_status"] == "Customer"
data.loc[customer_rows, "experiment_3"] = np.random.choice(
    experiment_groups, size=int(customer_rows.sum()), p=[0.67, 0.33]
)




# First method: regress treatment assignment on the attributes that might affect it

In [None]:
def SRM_checker(data, experiment, expected_ratio=0.5,
                covariates=("county", "user_status", "day_time")):
    """Check for sample ratio mismatch (SRM) by regressing assignment on covariates.

    The assignment column is centered around the expected treatment ratio and
    regressed on the given covariates. A joint Wald test is then run for every
    term of the model: a small p-value on the intercept points to an overall
    imbalance, a small p-value on a covariate points to an imbalance that
    depends on that covariate.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the `experiment` column and every column in `covariates`.
        It is not modified.
    experiment : str
        Name of the assignment column (1 = treatment, 0 = control).
    expected_ratio : float, default 0.5
        Expected share of treatment rows; subtracted from the assignment column.
    covariates : str or iterable of str, default ("county", "user_status", "day_time")
        Column names used as regressors. They are inserted into the model
        formula verbatim, so they must be valid formula terms.

    Returns
    -------
    pandas.DataFrame
        One row per model term (including the intercept) with the Wald
        statistic, its p-value and the number of constraints tested.
    """
    # A lone string would otherwise be joined character by character.
    if isinstance(covariates, str):
        covariates = [covariates]

    # Build the centered outcome on a copy so the caller's frame does not
    # silently gain an `is_treatment` column.
    model_data = data.assign(is_treatment=data[experiment] - expected_ratio)

    # With the default covariates this yields exactly
    # "is_treatment ~ 1 + county + user_status + day_time".
    formula = "is_treatment ~ " + " + ".join(["1", *covariates])

    # Fit with heteroskedasticity-robust (HC1) standard errors.
    m = smf.glm(formula, data=model_data).fit(cov_type="HC1")

    # Joint Wald test per term; scalar=True keeps the statistics as plain numbers.
    return m.wald_test_terms(scalar=True).table

In [None]:
# experiment_1 was drawn with the expected 50/50 split and no injected imbalance.
SRM_checker(data, "experiment_1")

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,0.351115,0.553482,1
county,2.083597,0.555238,3
user_status,0.032565,0.983849,2
day_time,0.564852,0.753952,2


In [None]:
# experiment_2 re-draws assignment at 77/23 for Manhattan, Dasher, or Night rows.
SRM_checker(data, "experiment_2")

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,0.565668,0.4519855,1
county,17.851929,0.0004718854,3
user_status,39.76023,2.323677e-09,2
day_time,21.50834,2.135616e-05,2


In [None]:
# experiment_3 re-draws assignment at 67/33 for Customer rows only.
SRM_checker(data, "experiment_3")

Unnamed: 0,statistic,pvalue,df_constraint
Intercept,20.098404,7.355803e-06,1
county,0.160321,0.9837254,3
user_status,53.849654,2.026265e-12,2
day_time,1.08032,0.582655,2


# Second method: chi-squared goodness-of-fit test for each attribute level

In [None]:
# Rows assigned to group 0 (control) in experiment_2.
control = data.loc[data["experiment_2"] == 0, :]

In [None]:
# Rows assigned to group 1 (treatment) in experiment_2.
treatment = data.loc[data["experiment_2"] == 1, :]

In [None]:
# Distinct county values, in order of first appearance.
county_lst = data.loc[:, "county"].unique()

In [None]:
# Distinct user_status values, in order of first appearance.
user_lst = data.loc[:, "user_status"].unique()

In [None]:
# Distinct day_time values, in order of first appearance.
daytime_lst = data.loc[:, "day_time"].unique()

In [None]:
# Map each attribute column name to the levels observed in the data.
cat = dict(county=county_lst, user_status=user_lst, day_time=daytime_lst)
cat

{'county': array(['Queens', 'Bronx', 'Manhattan', 'Brooklyn'], dtype=object),
 'user_status': array(['Dasher', 'Customer', 'Restaurant'], dtype=object),
 'day_time': array(['Afternoon', 'Night', 'Morning'], dtype=object)}

In [None]:
# Collects one chi-squared p-value per attribute level, keyed by level name.
p_value_result = dict()

In [None]:
# For every level of every attribute, compare the observed control/treatment
# counts with an even split of that level's total and store the p-value.
for attribute, levels in cat.items():
    for level in levels:
        n_control = len(control[control[attribute] == level])
        n_treatment = len(treatment[treatment[attribute] == level])
        half = (n_control + n_treatment) / 2
        statistic, p_value = chisquare((n_control, n_treatment), f_exp=[half, half])
        p_value_result[f"{level}"] = p_value

In [None]:
# Show the raw per-level p-values collected by the loop.
p_value_result

{'Queens': 0.0016251517273099209,
 'Bronx': 1.4483625824765793e-07,
 'Manhattan': 8.583767625776217e-18,
 'Brooklyn': 2.059062794614967e-06,
 'Dasher': 5.500364298865125e-28,
 'Customer': 0.00010360947064091593,
 'Restaurant': 7.01628575055618e-05,
 'Afternoon': 1.7360072595924857e-06,
 'Night': 2.1579346614405287e-21,
 'Morning': 2.4612450008866864e-06}

In [None]:
# One-column table of p-values, indexed by attribute level.
result = pd.DataFrame(
    {"p_value": list(p_value_result.values())},
    index=list(p_value_result),
)

In [None]:
# Display the p-value table.
result

Unnamed: 0,p_value
Queens,0.001625152
Bronx,1.448363e-07
Manhattan,8.583768e-18
Brooklyn,2.059063e-06
Dasher,5.500364e-28
Customer,0.0001036095
Restaurant,7.016286e-05
Afternoon,1.736007e-06
Night,2.157935e-21
Morning,2.461245e-06
