In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [35]:
df_1 = pd.read_csv(r"..\..\data\clean\all_clients.csv")
df_2 = pd.read_csv(r"..\..\data\clean\control_filter.csv")
df_3 = pd.read_csv(r"..\..\data\clean\test_filter.csv")

In [36]:
"""
input: df
output: df

function to extract only the latest session per client based on visit_id

"""

def valid_session(df: pd.DataFrame) -> pd.DataFrame:
    df['date_time'] = pd.to_datetime(df['date_time'])
    
    most_recent_sessions = df.loc[df.groupby('client_id')['date_time'].idxmax()]
    
    df_recent = df[df['visit_id'].isin(most_recent_sessions['visit_id'])]
    
    df_recent = df_recent.sort_values(by=['client_id', 'visitor_id', 'date_time'])

    return df_recent


df_1 = valid_session(df_1)


step_mapping = {
        'start': 0,
        'step_1': 1,
        'step_2': 2,
        'step_3': 3,
        'confirm': 4
    }
"""
input: df
output: df

function to map "process_step" for easier sorting of process steps
"""

def step_map(df):
    df["process_step_num"] = df["process_step"].map(step_mapping)
    df = df.sort_values(by=["client_id", "process_step_num"], ascending = True)

    return df

step_map(df_1)

df_1.tail()

Unnamed: 0.1,Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,process_step_num
276145,276145,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,Test,2
276144,276144,9999729,834634258_21862004160,870243567_56915814033_814203,step_3,2017-05-08 16:09:19,Test,3
276143,276143,9999729,834634258_21862004160,870243567_56915814033_814203,confirm,2017-05-08 16:09:40,Test,4
244975,244975,9999832,145538019_54444341400,472154369_16714624241_585315,start,2017-05-16 16:46:03,Test,0
244974,244974,9999832,145538019_54444341400,472154369_16714624241_585315,step_1,2017-05-16 16:46:11,Test,1


In [38]:
"""
input: df
output: df

function for total average step time and start-to-confirm
"""

def calculate_avg_time(df):
    df['date_time'] = pd.to_datetime(df['date_time'])

    df.sort_values(by=['visit_id', 'date_time'], inplace=True)

    df['time_diff'] = df.groupby('visit_id')['date_time'].diff()

    # Filter out rows where time_diff is NaT (first row of each visit_id)
    df = df.dropna(subset=['time_diff'])

    average_time_per_step = df.groupby('process_step')['time_diff'].mean()

    #convert date_time to seconds
    average_time_per_step = average_time_per_step.dt.total_seconds()

    start_confirm_df = df[df['process_step'].isin(['start', 'confirm'])]

    start_confirm_df.loc[:, 'time_diff'] = start_confirm_df.groupby('visit_id')['date_time'].diff()

    #keep only "confirm" rows
    start_to_confirm_time = start_confirm_df[start_confirm_df['process_step'] == 'confirm']

    avg_time_start_to_confirm = start_to_confirm_time['time_diff'].mean().total_seconds()  / 60

    print("Average time per process step (in seconds):")
    print(average_time_per_step.round(2))

    print(f"\nAverage time from 'start' to 'confirm': {avg_time_start_to_confirm} min / {avg_time_start_to_confirm * 60} seconds")

calculate_avg_time(df_1)


Average time per process step (in seconds):
process_step
confirm    126.16
start      147.23
step_1      39.04
step_2      42.17
step_3      94.64
Name: time_diff, dtype: float64

Average time from 'start' to 'confirm': 4.664324766666667 min / 279.859486 seconds
