In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import binomtest
from scipy.stats import ttest_ind
import scipy.stats as stats
%matplotlib inline

In [3]:
df_final = pd.read_csv("../data/clean/df_combined.csv")
df_exp = pd.read_csv("../data/clean/experiments.csv")

In [15]:
# Merge the two tables based on 'client_id'
df_merged = df_final.merge(df_exp[['client_id', 'variation']], on='client_id', how='left')
df_merged['variation'] = df_merged['variation'].fillna('Unknown')
# Drop null rows
df_merged = df_merged[df_merged['variation'] != 'Unknown']
df_merged.reset_index(drop=True, inplace=True)

In [19]:
df_merged = df_merged.drop(columns=['time_diff', 'error', 'step_numeric'])

In [21]:
df_merged.shape

(443897, 6)

In [8]:
# Filter data for control and test groups
control_group = df_merged[df_merged['variation'] == 'Control']
test_group = df_merged[df_merged['variation'] == 'Test']

In [10]:
# Calculate completion rate for control group
control_completed = control_group[control_group['process_step'] == 'confirm']['client_id'].nunique()
control_started = control_group[control_group['process_step'] == 'start']['client_id'].nunique()
control_completion_rate = control_completed / control_started
print(control_completion_rate)

0.6718126225914388


In [14]:
# Calculate completion rate for test group
test_completed = test_group[test_group['process_step'] == 'confirm']['client_id'].nunique()
test_started = test_group[test_group['process_step'] == 'start']['client_id'].nunique()
test_completion_rate = test_completed / test_started
print(test_completion_rate)

0.700438547171933


In [16]:
observed_increase = test_completion_rate - control_completion_rate
print(observed_increase)

0.028625924580494178


In [18]:
#date_time column in datetime format
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])
# Sort Data
df_merged = df_merged.sort_values(by=['client_id', 'visit_id', 'date_time'])

# Calculate time difference to the next step
df_merged['time_diff'] = df_merged.groupby(['client_id', 'visit_id'])['date_time'].diff()

# Set time_diff to 0 for the "start" process step
df_merged.loc[df_merged['process_step'] == 'start', 'time_diff'] = pd.Timedelta(seconds=0)

# Average time spent per step, separated control and test
completion_times_by_group = df_merged.groupby(['variation', 'process_step'])['time_diff'].mean().reset_index()

print(completion_times_by_group)

  variation process_step                 time_diff
0   Control      confirm 0 days 00:02:10.250405388
1   Control        start           0 days 00:00:00
2   Control       step_1 0 days 00:00:42.260355239
3   Control       step_2 0 days 00:00:40.950420604
4   Control       step_3 0 days 00:01:34.860185229
5      Test      confirm 0 days 00:02:09.595182350
6      Test        start           0 days 00:00:00
7      Test       step_1 0 days 00:00:37.896578700
8      Test       step_2 0 days 00:00:48.206424108
9      Test       step_3 0 days 00:01:37.191935040


In [22]:
threshold = 0.05

if observed_increase >= threshold:
    print(f"The observed increase is >= than the 5% threshold.")
else:
    print(f"The observed increase is <= than the 5% threshold.")


The observed increase is <= than the 5% threshold.


In [26]:
#Calculate completion rates (number of clients who completed each step)
clients_started = df_merged[df_merged['process_step'] == 'start']['client_id'].nunique()
completion_rates = (
    df_merged.groupby(['variation', 'process_step'])['client_id'].nunique() / clients_started
).reset_index(name='completion_rate')
print(completion_rates)

  variation process_step  completion_rate
0   Control      confirm         0.415817
1   Control        start         0.618948
2   Control       step_1         0.541120
3   Control       step_2         0.499329
4   Control       step_3         0.467506
5      Test      confirm         0.266904
6      Test        start         0.381052
7      Test       step_1         0.346602
8      Test       step_2         0.317908
9      Test       step_3         0.298240


In [28]:
step_order = { 'step_1': 0, 'step_2': 2, 'step_3': 3, 'confirm': 4}

df_merged['step_numeric'] = df_merged['process_step'].map(step_order)

# Ensure the data is sorted by client_id and date_time
df_merged = df_merged.sort_values(by=['client_id', 'date_time'])

# Calculate errors separately for Control and Test groups
df_merged['error'] = df_merged.groupby(['variation', 'client_id' , 'visitor_id' , 'visit_id' ])['step_numeric'].shift(-1) < df_merged['step_numeric']

# Calculate error rate for each group
error_rate_by_variation = df_merged.groupby('variation')['error'].mean().reset_index()

# Display results
print(error_rate_by_variation)


  variation     error
0   Control  0.033848
1      Test  0.032253


In [30]:
df_control_transitions = df_merged[df_merged['variation'] == 'Control']
total_test_transitions = len(df_control_transitions[df_control_transitions["process_step"] != "start"])

df_control_backwards = df_control_transitions[df_control_transitions['error'] == True]

control_transitions = 100 * len(df_control_backwards) / total_test_transitions
control_transitions



4.9814105918316764

In [32]:
df_test_transitions = df_merged[df_merged['variation'] == 'Test']

total_test_transitions = len(df_test_transitions[df_test_transitions["process_step"] != "start"])

df_test_backwards = df_test_transitions[df_test_transitions['error'] == True]

test_transitions = 100 * len(df_test_backwards) / total_test_transitions
test_transitions

4.712799563369333

In [34]:
step_order = {'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}
df_merged['step_numeric'] = df_merged['process_step'].map(step_order)
# Ensure the data is sorted by client_id and date_time
df_merged = df_merged.sort_values(by=['client_id', 'date_time'])
# Check if the next step is less than the current step
df_merged['error'] = df_merged.groupby('client_id')['step_numeric'].shift(-1) < df_merged['step_numeric']
error_rate = df_merged['error'].mean()
print(error_rate)

0.10226471456216195


In [36]:
error_rate_by_step = df_merged.groupby('process_step')['error'].mean()
print(error_rate_by_step)

process_step
confirm    0.063439
start      0.000000
step_1     0.163266
step_2     0.141296
step_3     0.218799
Name: error, dtype: float64


In [24]:
# Binomial test
successes = test_completed
total = test_started
p_null = control_completion_rate

binom_result = binomtest(successes, total, p=p_null, alternative='greater')
print(f"Binomial Test P-value : {binom_result.pvalue:.10e}")


Binomial Test P-value : 5.6838514065e-24


In [38]:
# Create completion arrays for Control and Test groups
control_completion = control_group['process_step'].apply(lambda x: 1 if x == 'confirm' else 0)
test_completion = test_group['process_step'].apply(lambda x: 1 if x == 'confirm' else 0)

In [40]:
t_stat, p_value = ttest_ind(control_completion, test_completion, equal_var=False)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.10e}")

T-statistic: -14.8044, P-value: 1.4182407520e-49


In [42]:
# Interpret the result
alpha = 0.05
if p_value < alpha:
    print("The difference in completion rates between Control and Test groups is statistically significant.")
else:
    print("The difference in completion rates between Control and Test groups is not statistically significant.")

The difference in completion rates between Control and Test groups is statistically significant.


In [44]:
df_merged['completion_rate_control'] = control_completion_rate 
df_merged['completion_rate_test'] = test_completion_rate        

In [46]:
df_merged['error_rate_control'] = error_rate_by_variation[error_rate_by_variation['variation'] == 'Control']['error'].values[0]
df_merged['error_rate_test'] = error_rate_by_variation[error_rate_by_variation['variation'] == 'Test']['error'].values[0]

In [48]:
# Error rates (overall)
error_rate_overall = pd.DataFrame({
    'variation': ['Control', 'Test'],
    'error_rate': [
        error_rate_by_variation.query("variation == 'Control'")['error'].values[0],
        error_rate_by_variation.query("variation == 'Test'")['error'].values[0]
    ]
})

error_rate_by_step = error_rate_by_step.rename('step_error_rate')

In [114]:
df_merged.to_csv("../data/clean/df_combined_exp.csv", index=False)