In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned data sets. Forward slashes are used because backslash-separated
# relative paths only resolve on Windows, while "/" is accepted on every platform.
df_1 = pd.read_csv("../../data/clean/all_clients.csv")
df_2 = pd.read_csv("../../data/clean/control_filter.csv")
df_3 = pd.read_csv("../../data/clean/test_filter.csv")

In [None]:
def valid_session(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows that belong to each client's most recent visit.

    For every ``client_id`` the row with the latest ``date_time`` is located,
    and every row sharing that row's ``visit_id`` is kept.

    Args:
        df: Frame with ``client_id``, ``visitor_id``, ``visit_id`` and
            ``date_time`` columns. The frame is not modified.

    Returns:
        A new frame whose ``date_time`` column is converted to datetime,
        restricted to the selected visits and sorted by ``client_id``,
        ``visitor_id`` and ``date_time``.
    """
    # Convert on a copy so the caller's frame keeps its original column.
    sessions = df.assign(date_time=pd.to_datetime(df['date_time']))

    # Work with row positions rather than index labels: label-based lookup
    # returns extra rows when the index contains duplicate labels.
    stamps = sessions['date_time'].reset_index(drop=True)
    clients = sessions['client_id'].reset_index(drop=True)

    # Rows without a timestamp cannot be "the latest"; excluding them up front
    # also keeps idxmax away from groups that contain only missing values.
    has_stamp = stamps.notna()
    latest_pos = stamps[has_stamp].groupby(clients[has_stamp]).idxmax()
    latest_visits = sessions['visit_id'].iloc[latest_pos.to_numpy(dtype=int)]

    # Keep every step of the selected visits, in chronological order per session.
    recent = sessions[sessions['visit_id'].isin(latest_visits)]
    return recent.sort_values(by=['client_id', 'visitor_id', 'date_time'])


# Rebind df_1 to the output of valid_session: the rows of each client's most recent visit.
df_1 = valid_session(df_1)


# Numeric position assigned to each process step name.
step_mapping = {
    'start': 0,
    'step_1': 1,
    'step_2': 2,
    'step_3': 3,
    'confirm': 4,
}


def step_map(df: pd.DataFrame) -> pd.DataFrame:
    """Add a numeric ``process_step_num`` column and sort by client and step.

    Args:
        df: Frame with ``client_id`` and ``process_step`` columns. The frame
            is not modified.

    Returns:
        A new frame with ``process_step_num`` (the ``step_mapping`` value of
        ``process_step``; names missing from the mapping become NaN) sorted
        ascending by ``client_id`` and ``process_step_num``.
    """
    # assign() builds a new frame, so the caller's frame gains no extra column.
    numbered = df.assign(process_step_num=df["process_step"].map(step_mapping))
    return numbered.sort_values(by=["client_id", "process_step_num"], ascending=True)


In [None]:
# Function to calculate total average time per step and from start to confirm (overall)
def calculate_total_avg_times(df: pd.DataFrame) -> "tuple[pd.DataFrame, float]":
    """Average time per process step and average time from 'start' to 'confirm'.

    A session is identified by the pair (``client_id``, ``visitor_id``). Within
    a session, the time difference between a row and the previous row is
    attributed to the row's own ``process_step``.

    Args:
        df: Frame with ``client_id``, ``visitor_id``, ``process_step`` and
            ``date_time`` columns. The frame is not modified.

    Returns:
        A tuple ``(total_time_per_step, avg_start_to_confirm)``:

        * ``total_time_per_step`` has one row per ``process_step`` with the mean
          ``time_diff`` and the same value as ``total_seconds`` rounded to two
          decimals. The first row of every session has no predecessor and is
          left out of these means.
        * ``avg_start_to_confirm`` is the mean number of seconds between the
          earliest 'start' and the earliest 'confirm' of each session that has
          both; NaN when no session has both.
    """
    session_keys = ['client_id', 'visitor_id']

    # Work on a converted, chronologically sorted copy: the caller's frame stays
    # untouched and diff() always compares a row with its true predecessor.
    steps = (
        df.assign(date_time=pd.to_datetime(df['date_time']))
        .sort_values(by=session_keys + ['date_time'])
    )
    steps['time_diff'] = steps.groupby(session_keys)['date_time'].diff()

    # Per-step averages only use rows that have a predecessor in their session.
    timed = steps.dropna(subset=['time_diff'])
    total_time_per_step = timed.groupby('process_step')['time_diff'].mean().reset_index()
    total_time_per_step['total_seconds'] = (
        total_time_per_step['time_diff'].dt.total_seconds().round(2)
    )

    # Start-to-confirm must be taken from the full frame: the row dropped above
    # is the first row of each session, which is normally its 'start' row.
    first_start = (
        steps.loc[steps['process_step'] == 'start']
        .groupby(session_keys)['date_time']
        .min()
    )
    first_confirm = (
        steps.loc[steps['process_step'] == 'confirm']
        .groupby(session_keys)['date_time']
        .min()
    )

    # Subtraction aligns on the session keys; sessions missing either endpoint
    # yield NaT and are skipped by mean().
    start_to_confirm_seconds = (first_confirm - first_start).dt.total_seconds()
    avg_start_to_confirm = start_to_confirm_seconds.mean()

    # Return both results: total_time_per_step and avg_start_to_confirm
    return total_time_per_step, avg_start_to_confirm

# Example usage:
# df_1 is expected to hold the session rows prepared earlier in the notebook
total_time_per_step, avg_start_to_confirm = calculate_total_avg_times(df_1)

# Display results
print("Total average time per step:")
print(total_time_per_step)
# ':.2f' without the space flag, so no doubled blank is printed before the number.
print(f"\nTotal average time from 'start' to 'confirm': {avg_start_to_confirm:.2f} seconds")


In [None]:
import pandas as pd

def calculate_total_avg_times(df):
    """Total seconds per process step and average 'start'-to-'confirm' time.

    Rows are grouped by ``visitor_id`` and ordered by ``date_time``. The time
    difference between a row and the previous row of the same visitor is
    attributed to the row's own ``process_step``. The first row of a visitor
    and every 'start' row count as 0 seconds.

    Args:
        df: Frame with ``visitor_id``, ``process_step`` and ``date_time``
            columns. The frame is not modified.

    Returns:
        A tuple ``(total_time_per_step, avg_start_to_confirm)``:

        * ``total_time_per_step`` has one row per ``process_step`` with the sum
          of ``total_seconds``.
        * ``avg_start_to_confirm`` is the mean number of seconds between the
          earliest 'start' and the earliest 'confirm' of each visitor that has
          both; NaN when no visitor has both.
    """
    # Convert and sort on a copy so the caller's frame is left untouched.
    steps = (
        df.assign(date_time=pd.to_datetime(df['date_time']))
        .sort_values(by=['visitor_id', 'date_time'])
    )

    # Seconds elapsed since the visitor's previous row (0 for the first row).
    elapsed = steps.groupby('visitor_id')['date_time'].diff().fillna(pd.Timedelta(seconds=0))
    steps['total_seconds'] = elapsed.dt.total_seconds()

    # Set the time for the 'start' step to 0 since it's the beginning
    steps.loc[steps['process_step'] == 'start', 'total_seconds'] = 0

    # Calculate total time spent on each step (grouping by process_step)
    total_time_per_step = steps.groupby('process_step')['total_seconds'].sum().reset_index()

    # One 'start' and one 'confirm' timestamp per visitor. Merging the raw rows
    # would pair every 'start' with every 'confirm' of a visitor and let
    # visitors with repeated steps dominate (and distort) the average.
    first_start = (
        steps.loc[steps['process_step'] == 'start']
        .groupby('visitor_id')['date_time']
        .min()
    )
    first_confirm = (
        steps.loc[steps['process_step'] == 'confirm']
        .groupby('visitor_id')['date_time']
        .min()
    )

    # Subtraction aligns on visitor_id; visitors missing either step drop out.
    start_to_confirm = (first_confirm - first_start).dropna().dt.total_seconds()
    avg_start_to_confirm = start_to_confirm.mean()

    return total_time_per_step, avg_start_to_confirm




# Call the function to calculate total time per step and average time between start and confirm.
# NOTE: this notebook defines calculate_total_avg_times twice. When the cells run top to bottom,
# this call uses the later definition (per-step sums, rows grouped by visitor_id only), and the
# two result names overwrite the values assigned by the earlier call.
total_time_per_step, avg_start_to_confirm = calculate_total_avg_times(df_1)

# Display the results
print("Total Time per Step:\n", total_time_per_step)
print("Average Time from 'Start' to 'Confirm' (in seconds):", avg_start_to_confirm)
