In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import binomtest
from scipy.stats import ttest_ind
import scipy.stats as stats
%matplotlib inline

In [2]:
# Read the two cleaned inputs: the combined table and the experiment roster
# (the roster supplies the 'client_id' -> 'variation' mapping used in the merge below).
# NOTE(review): the preview further down shows an 'Unnamed: 0' column, which usually
# means a CSV was written with its index - confirm whether that column is wanted.
df_final, df_exp = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/clean/df_combined.csv", "../data/clean/experiments.csv")
)

In [3]:
# Attach each client's experiment arm ('variation') to the rows of df_final.
# The left join keeps every df_final row, so a client_id that is absent from df_exp
# comes back with a missing 'variation'.
variation_lookup = df_exp[['client_id', 'variation']]
df_merged = pd.merge(df_final, variation_lookup, how='left', on='client_id')

# Label the missing assignments 'Unknown', then discard those rows and renumber
# the index from 0.
df_merged = df_merged.assign(variation=df_merged['variation'].fillna('Unknown'))
df_merged = df_merged.loc[df_merged['variation'].ne('Unknown')].reset_index(drop=True)

In [4]:
# Preview the first ten merged rows to sanity-check the join.
df_merged.head(n=10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,time_diff,step_numeric,error,variation
0,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,,0,False,Control
1,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,0 days 00:00:09,1,False,Control
2,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,0 days 00:00:46,2,False,Control
3,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,0 days 00:01:34,3,False,Control
4,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,0 days 00:01:04,4,False,Control
5,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,,0,False,Test
6,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,0 days 00:00:07,1,False,Test
7,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,0 days 00:00:32,2,False,Test
8,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,0 days 00:01:39,3,False,Test
9,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,0 days 00:00:20,4,False,Test


In [5]:
# Parse the 'date_time' column into pandas datetimes so it can be sorted and subtracted.
df_merged = df_merged.assign(date_time=pd.to_datetime(df_merged['date_time']))


In [6]:
# Order rows by client, then visit, then timestamp, so consecutive rows of one
# visit are in chronological order.
visit_order = ['client_id', 'visit_id', 'date_time']
df_merged = df_merged.sort_values(visit_order)


In [7]:
# Time spent on a row's step = timestamp of the NEXT row in the same client/visit
# minus this row's own timestamp (shift(-1) pulls the following row's value up).
next_event_time = df_merged.groupby(['client_id', 'visit_id'])['date_time'].shift(-1)

# The final row of each visit has no following row, so the difference is NaT;
# it is stored as a zero-length duration instead.
# NOTE(review): those zeros take part in any later average of 'time_spent' - confirm
# that this is intended.
df_merged['time_spent'] = (next_event_time - df_merged['date_time']).fillna(pd.Timedelta(seconds=0))

# A 'start' row therefore carries the time elapsed until the row that follows it.


In [14]:
# Average time spent on each process step, per experiment variation (exported for Tableau).
# pandas is already imported as `pd` in the first cell, so it is not re-imported here.

# Parse 'date_time' as datetimes; this is a no-op if the column is already converted.
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])

# Order rows chronologically within each client/visit so that the row after an
# event is the next event of the same visit.
df_merged = df_merged.sort_values(by=['client_id', 'visit_id', 'date_time'])

# Time spent on a step = timestamp of the NEXT event in the same visit minus this
# event's timestamp (shift(-1) pulls the following row's date_time up by one row).
df_merged['time_spent'] = (
    df_merged.groupby(['client_id', 'visit_id'])['date_time'].shift(-1)
    - df_merged['date_time']
)

# The last event of a visit has no successor (NaT); it is recorded as 0 seconds.
# NOTE(review): these zeros are included in the averages below and pull them down -
# confirm that this is intended.
df_merged['time_spent'] = df_merged['time_spent'].fillna(pd.Timedelta(seconds=0))

# Convert time_spent to total seconds
df_merged['time_spent_seconds'] = df_merged['time_spent'].dt.total_seconds()

# Exclude 'start' rows from the averages. The filtered rows go into a NEW frame
# rather than overwriting df_merged: later cells still need the 'start' rows, and
# overwriting made this cell non-idempotent and the notebook order-dependent.
df_steps_no_start = df_merged[df_merged['process_step'] != 'start']

# Mean seconds per (variation, process_step), with a descriptive column name.
average_time_spent = (
    df_steps_no_start
    .groupby(['variation', 'process_step'])['time_spent_seconds']
    .mean()
    .reset_index()
    .rename(columns={'time_spent_seconds': 'average_time_spent_seconds'})
)

# Save to CSV for importing into Tableau
average_time_spent.to_csv('../data/clean/average_time_spent_separated_no_start.csv', index=False)

# Print the result
print(average_time_spent)


  variation process_step  average_time_spent_seconds
0   Control      confirm                   22.565801
1   Control       step_1                   50.933991
2   Control       step_2                   84.878574
3   Control       step_3                  119.055850
4      Test      confirm                   38.354453
5      Test       step_1                   62.915688
6      Test       step_2                   84.544419
7      Test       step_3                  117.126354


In [18]:
# Average of the 'time_diff' column per variation and process step (exported for Tableau).

# Rows for the 'start' step get a zero-length time_diff.
is_start = df_merged['process_step'].eq('start')
df_merged.loc[is_start, 'time_diff'] = pd.Timedelta(seconds=0)

# Parse time_diff into timedeltas unless it already has that dtype; values that
# cannot be parsed are coerced to NaT and then replaced by zero.
raw_time_diff = df_merged['time_diff']
if pd.api.types.is_timedelta64_dtype(raw_time_diff):
    parsed_time_diff = raw_time_diff
else:
    parsed_time_diff = pd.to_timedelta(raw_time_diff, errors='coerce')
df_merged['time_diff'] = parsed_time_diff.fillna(pd.Timedelta(seconds=0))

# Numeric seconds are easier to average and to chart.
df_merged['time_diff_seconds'] = df_merged['time_diff'].dt.total_seconds()

# Mean seconds per (variation, process_step); the value column is named on reset.
average_time_diff = (
    df_merged
    .groupby(['variation', 'process_step'])['time_diff_seconds']
    .mean()
    .reset_index(name='average_time_diff_seconds')
)

# Write the summary where Tableau picks it up, then show it.
average_time_diff.to_csv('../data/clean/time_diff_sec.csv', index=False)

print(average_time_diff)

  variation process_step  average_time_diff_seconds
0   Control      confirm                 120.995046
1   Control        start                   0.000000
2   Control       step_1                  41.858131
3   Control       step_2                  40.762462
4   Control       step_3                  94.034644
5      Test      confirm                 112.433945
6      Test        start                   0.000000
7      Test       step_1                  37.584907
8      Test       step_2                  47.939642
9      Test       step_3                  96.180816
