In [None]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib
import os

In [None]:
import matplotlib.pyplot as plt
# Apply the seaborn "ticks" style with the Set2 colour palette as the plotting theme.
sns.set_theme(style="ticks", palette=sns.color_palette("Set2"))


In [None]:
# Root folder that holds the input parquet file and receives the figures.
working_folder = "."

# Keep the trailing separator (the empty last component adds it): later cells
# build file names with plain string concatenation, `figures_folder + name`.
figures_folder = os.path.join(working_folder, 'figures', '')

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(figures_folder, exist_ok=True)

# Summary statistics per A/B test snapshot; the columns used later in this
# notebook include experiment_id, variant_id, metric_id, time_since_start and
# mean/variance/count for the control (_c) and treatment (_t) groups.
abtest_metrics_df = pd.read_parquet(os.path.join(working_folder, 'abtest_metrics_anonymised.parquet'))

## parameters

In [None]:
# Significance level: p-values are compared against this threshold when the
# "significant" / "not significant" flags are computed further down.
alpha = 0.05

## function definitions

In [None]:
def mSPRT_vanilla_normal_p_value_aux(mean_x: float, mean_y: float,
                                     variance_x: float, variance_y: float,
                                     count_x: float, count_y: float,
                                     theta_0: float = 0, tau_sq: float = 0.0001) -> float:
    """Return the per-snapshot mSPRT quantity ``1 / max(1, statistic)``.

    With ``V = variance_x + variance_y`` and ``n`` the harmonic mean of the two
    sample counts, the statistic is::

        sqrt(V / (V + n * tau_sq))
            * exp(n**2 * tau_sq * (mean_y - mean_x - theta_0)**2
                  / (2 * V * (V + n * tau_sq)))

    The statistic is evaluated in log space so that very large effects
    underflow to a p-value of 0.0 instead of overflowing ``exp``.

    Parameters
    ----------
    mean_x, mean_y : sample means of the two groups.
    variance_x, variance_y : sample variances of the two groups.
    count_x, count_y : sample sizes of the two groups.
    theta_0 : difference in means under the null hypothesis.
    tau_sq : the tau^2 term of the statistic (presumably the variance of the
        mixing distribution - confirm against the mSPRT reference in use).

    Returns
    -------
    float
        A value in [0, 1]. ``1.0`` is returned when either count is zero, when
        the pooled variance is not strictly positive (the statistic is
        undefined there), or when the statistic does not exceed 1.
    """
    if (count_x == 0) or (count_y == 0):
        return 1.0

    total_variance = variance_x + variance_y
    # `not (... > 0)` also catches NaN; no evidence can be claimed in that case.
    if not (total_variance > 0):
        return 1.0

    # Harmonic mean of the two sample sizes.
    count_mean = 2 / (1 / count_x + 1 / count_y)
    inflated_variance = total_variance + count_mean * tau_sq

    log_statistic = (
        0.5 * np.log(total_variance / inflated_variance)
        + (count_mean ** 2.0 * tau_sq * (mean_y - mean_x - theta_0) ** 2.0)
        / (2.0 * total_variance * inflated_variance)
    )

    # statistic <= 1 (or NaN) maps to a p-value of exactly 1.
    if not (log_statistic > 0):
        return 1.0
    return float(np.exp(-log_statistic))


## quality checks (to delete)

In [None]:
# Sanity check: evaluate the helper on one hand-picked pair of group summaries
# (equal variances, unequal sample sizes) and display the result.
mSPRT_vanilla_normal_p_value_aux(0.5, 0.506, 0.25, 0.25, 200000, 100000, 0, 0.0001)

In [None]:
# Number of non-null values in each column of the data frame.
abtest_metrics_df.count()

In [None]:
# Same per-column count after dropping rows that repeat the
# (experiment_id, variant_id, metric_id, time_since_start) key; a smaller
# number than the plain count means duplicated snapshots exist.
abtest_metrics_df.drop_duplicates(['experiment_id', 'variant_id', 'metric_id', 'time_since_start']).count()

TODO: Find the 8 duplicate rows

## Dataset descriptive statistics

### Reproduce the effect size distribution plot in the report "ASOS experimentation meta-analysis"

In [None]:
# Latest snapshot time for every (experiment, variant, metric) combination.
experiment_keys_overall_df = (
    abtest_metrics_df
    .groupby(['experiment_id', 'variant_id', 'metric_id'])
    .agg({'time_since_start': 'max'})
)

# Pull the full row of that final snapshot. The join keys are spelled out
# rather than inferred from the common columns, and duplicated final snapshots
# are collapsed so that each combination contributes exactly one row (otherwise
# it would be double-counted here and would multiply rows when the result is
# merged back onto the full data frame).
t_test_data_df = (
    experiment_keys_overall_df
    .reset_index()
    .merge(abtest_metrics_df, how='inner',
           on=['experiment_id', 'variant_id', 'metric_id', 'time_since_start'])
    .drop_duplicates(subset=['experiment_id', 'variant_id', 'metric_id'])
    .reset_index(drop=True)
)


In [None]:
def ttest_lambda(x):
    """Return the p-value of an unequal-variance t-test from summary statistics.

    ``x`` is any mapping-like row exposing ``mean_c``, ``variance_c``,
    ``count_c`` (first sample) and ``mean_t``, ``variance_t``, ``count_t``
    (second sample). Variances are converted to standard deviations because
    that is what ``ttest_ind_from_stats`` expects.
    """
    control_std = np.sqrt(x['variance_c'])
    treatment_std = np.sqrt(x['variance_t'])
    result = scipy.stats.ttest_ind_from_stats(
        mean1=x['mean_c'], std1=control_std, nobs1=x['count_c'],
        mean2=x['mean_t'], std2=treatment_std, nobs2=x['count_t'],
        equal_var=False,
    )
    # Element 0 is the test statistic, element 1 the p-value.
    return result[1]

#scipy.stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True, alternative='two-sided')

In [None]:
# One t-test p-value per final snapshot row.
t_test_data_df['t-test-p-value'] = t_test_data_df.apply(ttest_lambda, axis=1)

# Broadcast that end-of-experiment p-value onto every snapshot row of the
# same (experiment, variant, metric) combination.
abtest_metrics_df = pd.merge(
    abtest_metrics_df,
    t_test_data_df[['experiment_id', 'variant_id', 'metric_id', 't-test-p-value']],
    on=['experiment_id', 'variant_id', 'metric_id'],
)

In [None]:
import matplotlib.pyplot as plt

# One histogram of end-of-experiment t-test p-values per metric.
for metric_id in t_test_data_df['metric_id'].unique():
    print('metric_id', metric_id)
    # displot is figure-level and creates its own figure, so no plt.figure()
    # call is needed (it would only leave an empty figure behind).
    grid = sns.displot(
        t_test_data_df[t_test_data_df['metric_id'] == metric_id]['t-test-p-value'],
        bins=np.linspace(0, 1, 20), rug=False, kde=False)
    grid.set(xlim=(0, 1))
    # Save before showing: once plt.show() has run the figure may already be
    # released, and saving afterwards writes an empty page.
    grid.savefig(figures_folder + 't-test_p-value_metric_' + str(metric_id) + '.pdf')
    plt.show()



Calculate mSPRT p-value for ALL rows  
Calculate Bayesian A/B test Bayes Factor + Posterior Odds for ALL rows  
Calculate % experiment progress (i.e. time_since_start / max(time_since_start))

In [None]:
# Evaluate the per-snapshot mSPRT quantity on every row, with the control group
# as "x" and the treatment group as "y".
abtest_metrics_df['mSPRT-vanilla-p-value-aux'] = abtest_metrics_df.apply(
    lambda snapshot: mSPRT_vanilla_normal_p_value_aux(
        snapshot['mean_c'], snapshot['mean_t'],
        snapshot['variance_c'], snapshot['variance_t'],
        snapshot['count_c'], snapshot['count_t'],
        theta_0=0, tau_sq=0.0001,
    ),
    axis=1,
)


In [None]:
# Order snapshots chronologically inside every (experiment, variant, metric)
# group: the running minimum below is only meaningful when rows are visited in
# time order, so time_since_start is part of the sort key.
abtest_metrics_df = abtest_metrics_df.sort_values(
    ['experiment_id', 'variant_id', 'metric_id', 'time_since_start'])

# Running minimum of the per-snapshot values within each group. With
# min_periods=2 the first snapshot of each group yields NaN ...
abtest_metrics_df['mSPRT-vanilla-p-value'] = (
    abtest_metrics_df
    .groupby(['experiment_id', 'variant_id', 'metric_id'])
    ['mSPRT-vanilla-p-value-aux']
    .transform(lambda series: series.expanding(min_periods=2).min())
)

# ... which is replaced by 1.0, i.e. "no evidence yet".
abtest_metrics_df['mSPRT-vanilla-p-value'] = (
    abtest_metrics_df['mSPRT-vanilla-p-value'].fillna(1.0)
)


In [None]:
# Attach, to every snapshot, the largest time_since_start observed for its
# (experiment, variant, metric) combination as "design_duration".
abtest_metrics_df = pd.merge(
    abtest_metrics_df,
    (
        abtest_metrics_df
        .groupby(['experiment_id', 'variant_id', 'metric_id'])['time_since_start']
        .max()
        .reset_index()
        .rename(columns={'time_since_start': 'design_duration'})
    ),
    on=['experiment_id', 'variant_id', 'metric_id'],
)

# Fraction of the experiment elapsed at each snapshot (1 at the final one).
abtest_metrics_df['time_progress'] = (
    abtest_metrics_df['time_since_start'].div(abtest_metrics_df['design_duration'])
)

# Single string key "<experiment_id>-<variant_id>".
abtest_metrics_df['experiment_variant_id'] = (
    abtest_metrics_df['experiment_id'].map(str)
    + '-'
    + abtest_metrics_df['variant_id'].map(str)
)

In [None]:
# Display the data frame to inspect the newly added design_duration,
# time_progress and experiment_variant_id columns.
abtest_metrics_df

In [None]:
# Metric and experiment variants whose running mSPRT p-values are plotted.
# The metric id is named once and reused for both the filter and the file
# name, so the saved file can no longer pick up a stale `metric_id` left over
# from a loop in an earlier cell.
plotted_metric_id = 1
plotted_experiment_variant_ids = ['c56288-1', 'a4386f-1', 'bac0d3-1', '08bcc2-1', '591c2c-1']

sns.lineplot(
    x="time_progress", y="mSPRT-vanilla-p-value",
    hue="experiment_variant_id",
    data=abtest_metrics_df[
        abtest_metrics_df['experiment_variant_id'].isin(plotted_experiment_variant_ids)
        & (abtest_metrics_df['metric_id'] == plotted_metric_id)
    ],
)

plt.savefig(figures_folder + 'time_progress_significance_metric_' + str(plotted_metric_id) + '.pdf')


In [None]:


# Cross-tabulate, per snapshot, the end-of-experiment t-test decision against
# the running mSPRT decision at significance level alpha. Rows with a missing
# p-value evaluate to False in all four flags.
abtest_metrics_df['Both significant'] = (
    abtest_metrics_df['t-test-p-value'].le(alpha)
    & abtest_metrics_df['mSPRT-vanilla-p-value'].le(alpha)
)
abtest_metrics_df['Only mSPRT significant'] = (
    abtest_metrics_df['t-test-p-value'].gt(alpha)
    & abtest_metrics_df['mSPRT-vanilla-p-value'].le(alpha)
)
abtest_metrics_df['Both not significant'] = (
    abtest_metrics_df['t-test-p-value'].gt(alpha)
    & abtest_metrics_df['mSPRT-vanilla-p-value'].gt(alpha)
)
abtest_metrics_df['Only t-test significant'] = (
    abtest_metrics_df['t-test-p-value'].le(alpha)
    & abtest_metrics_df['mSPRT-vanilla-p-value'].gt(alpha)
)


In [None]:
# Per metric, count the final snapshots (time_progress equal to 1) that fall
# into each of the four agreement categories.
(
    abtest_metrics_df
    .loc[abtest_metrics_df['time_progress'].eq(1)]
    .groupby('metric_id')
    [['Both significant', 'Only mSPRT significant', 'Both not significant', 'Only t-test significant']]
    .sum()
)

In [None]:
nbins = 10
labels = ['Both not significant', 'Only mSPRT significant', 'Only t-test significant', 'Both significant']

# Assign every snapshot to one of `nbins` equal-width, right-closed progress bins on (0, 1].
abtest_metrics_df['time_progress_bin'] = pd.cut(
    abtest_metrics_df['time_progress'], np.linspace(0, 1, nbins + 1), retbins=False)

# Count snapshots per (experiment variant, time bin) and keep the smallest
# count per variant. observed=False is explicit on purpose: empty bins must
# show up with a count of 0, otherwise the filter below can never reject a
# variant, and the pandas default for this flag differs between versions.
abtest_metrics_evol_df = (
    abtest_metrics_df
    .groupby(['experiment_variant_id', 'time_progress_bin'], observed=False)[['time_progress']]
    .count()
    .groupby('experiment_variant_id').min()
    .rename(columns={'time_progress': 'min_samples_in_time_bin'})
    .reset_index()
)

# Keep only the experiment variants that have at least one entry in every time bin.
abtest_metrics_evol_df = abtest_metrics_evol_df[
    abtest_metrics_evol_df['min_samples_in_time_bin'] >= 1][['experiment_variant_id']]

print('#experiments with at least one entry per time bin', len(abtest_metrics_evol_df['experiment_variant_id'].unique()))

# For the retained variants, take the last row of every (bin, variant, metric) group.
abtest_metric_evol_df = (
    abtest_metrics_df[['experiment_variant_id', 'metric_id', 'time_progress_bin'] + labels]
    .merge(abtest_metrics_evol_df, on='experiment_variant_id')
    .groupby(['time_progress_bin', 'experiment_variant_id', 'metric_id'], observed=False)
    .last()
    .reset_index()
)

# Replace each interval by its midpoint so the bins can serve as a numeric x axis.
abtest_metric_evol_df['time_progress_bin'] = [x.mid for x in abtest_metric_evol_df['time_progress_bin'].values]

# Average the category flags per (bin, metric). The flags are cast to float and
# selected explicitly: groups without data carry missing values, and the string
# column experiment_variant_id must not be passed to mean() (recent pandas
# versions raise instead of silently dropping it).
abtest_metric_evol_df[labels] = abtest_metric_evol_df[labels].astype(float)
abtest_metric_evol_df = (
    abtest_metric_evol_df
    .groupby(['time_progress_bin', 'metric_id'])[labels]
    .mean()
    .reset_index()
)


In [None]:
# Display the per-(time bin, metric) averages of the four category flags.
abtest_metric_evol_df

In [None]:
# One stacked-area chart per metric: share of each agreement category over experiment progress.
for metric_id in abtest_metric_evol_df['metric_id'].unique():
    print('metric_id', metric_id)
    # Take x and y from the same rows so their lengths always agree, even if a
    # metric has no value for some bin.
    metric_evol_df = abtest_metric_evol_df[abtest_metric_evol_df['metric_id'] == metric_id]
    x = metric_evol_df['time_progress_bin'].values
    y = metric_evol_df[labels].T.values

    # Fresh figure per metric so the areas of successive metrics never pile up
    # on the same axes.
    fig, ax = plt.subplots()
    ax.stackplot(x, y, labels=labels)
    ax.set_ylim(0, 1)
    ax.legend(loc='lower center')

    # Save before showing: saving after plt.show() writes an empty page.
    fig.savefig(figures_folder + 'time_progress_type_metric_' + str(metric_id) + '.pdf')
    plt.show()
    plt.close(fig)
    

Confusion matrix between Welch's t-test and mSPRT  
Confusion matrix between Welch's t-test and Bayesian A/B test



Cumulative plot of type-I, type-II and correct rejection or no-rejection

Show data spike experiment, comment on how it affects mSPRT(vanilla), mSPRT(in deployment) and Bayesian A/B Test.

TODO: use markdown instead of cell title for export


TODO: Fix effective sample size n
TODO: estimate sigma^2 via whatever means