In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Folder holding the cleaned CSV exports. The location is still specific to
# one machine; it is declared once here so that it only has to be changed in
# a single place when the notebook is run somewhere else.
CLEAN_DATA_DIR = Path('/Users/camil/Documents/iron_hack_bootcamp/week_5/2_weeks_project/two_week_project/data/clean')

# NOTE(review): the preview of df_1 shows 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, which look like index columns written by earlier exports —
# confirm whether they should be dropped upstream.
df_1 = pd.read_csv(CLEAN_DATA_DIR / 'all_clients.csv')     # all clients (per file name)
df_2 = pd.read_csv(CLEAN_DATA_DIR / 'test_filter.csv')     # test group (per file name)
df_3 = pd.read_csv(CLEAN_DATA_DIR / 'control_filter.csv')  # control group (per file name)

In [3]:
# Parse 'date_time' as datetimes (harmless if it already is one) and order
# the events by client, then chronologically within each client.
df_1 = (
    df_1
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .sort_values(by=['client_id', 'date_time'])
)

# Preview the first rows to confirm the ordering
df_1.head(60)


Unnamed: 0.1,Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
70803,70803,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test
70802,70802,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,Test
70801,70801,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,Test
70800,70800,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,Test
70799,70799,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test
135918,135918,647,66758770_53988066587,40369564_40101682850_311847,start,2017-04-12 15:41:28,Test
135917,135917,647,66758770_53988066587,40369564_40101682850_311847,step_1,2017-04-12 15:41:35,Test
135916,135916,647,66758770_53988066587,40369564_40101682850_311847,step_2,2017-04-12 15:41:53,Test
135915,135915,647,66758770_53988066587,40369564_40101682850_311847,step_3,2017-04-12 15:45:02,Test
135912,135912,647,66758770_53988066587,40369564_40101682850_311847,confirm,2017-04-12 15:47:45,Test


In [4]:
# List the distinct step labels, in order of first appearance
pd.unique(df_1['process_step'])

array(['start', 'step_1', 'step_2', 'step_3', 'confirm'], dtype=object)

In [5]:
def calculate_process_errors(df, step_column):
    """
    Measure how often clients repeat a step or move backwards in the process.

    Rows are ordered by 'client_id' then 'date_time' and every client's
    events are walked in that order, keeping track of a "reference step"
    (the last step seen for that client):

    * a 'start' row resets the reference step to the start of the process
      and is not counted as a transition;
    * a 'confirm' row is ignored entirely;
    * any other row that has a reference step before it counts as one
      transition. It is an error when it repeats the reference step (the
      row is flagged with the repeated step's name) or when it is earlier
      than the reference step (the row is flagged with the name of the step
      the client came back from).

    Parameters:
        df (pd.DataFrame): The dataframe containing client process steps.
            Must contain 'client_id', 'date_time' and `step_column`.
            It is left unmodified.
        step_column (str): The column name for the process steps.
    Returns:
        float: The error rate (errors / transitions), 0.0 when there is no
            transition at all.
        pd.DataFrame: A sorted copy of `df` with two extra columns,
            'process_step_num' (numeric rank of the step) and 'error_step'
            (NaN, or the step name recorded for a flagged row).

    NOTE: step labels missing from the mapping are not validated; they get a
    NaN 'process_step_num', still count as a transition, and never compare
    as repeated or backward.
    """
    # Map the process steps to numeric values for easier comparison
    step_mapping = {
        'start': 0,
        'step_1': 1,
        'step_2': 2,
        'step_3': 3,
        'confirm': 4
    }
    start_num = step_mapping['start']
    confirm_num = step_mapping['confirm']

    # sort_values returns a new dataframe, so the columns added below never
    # leak into the caller's dataframe.
    df_result = df.sort_values(by=['client_id', 'date_time'], ascending=[True, True])
    df_result["process_step_num"] = df_result[step_column].map(step_mapping)

    # Plain arrays are much cheaper to read row by row than repeated .loc calls
    step_nums = df_result["process_step_num"].to_numpy()
    step_names = df_result[step_column].to_numpy()

    # One slot per row, addressed by position rather than by index label, so
    # duplicated or named indexes cannot make a flag land on the wrong rows.
    error_labels = np.full(len(df_result), np.nan, dtype=object)

    errors = 0
    transitions = 0

    # Group the row positions by client; rows with a missing client_id are
    # left out, as with a regular groupby on the column.
    row_positions = pd.Series(np.arange(len(df_result)))
    client_keys = df_result['client_id'].to_numpy()

    for _, client_positions in row_positions.groupby(client_keys, sort=False):
        last_valid_step_num = None
        last_valid_step_name = None

        for pos in client_positions.to_numpy():
            current_step_num = step_nums[pos]
            current_step_name = step_names[pos]

            # A 'start' resets the reference step; it is never a transition
            if current_step_num == start_num:
                last_valid_step_num = start_num
                last_valid_step_name = current_step_name
                continue

            # A 'confirm' neither counts as a transition nor moves the reference
            if current_step_num == confirm_num:
                continue

            if last_valid_step_num is not None:
                transitions += 1

                if current_step_num == last_valid_step_num:
                    # Same step twice in a row: record the repeated step
                    error_labels[pos] = current_step_name
                    errors += 1
                elif current_step_num < last_valid_step_num:
                    # Backward move: record the step the client came back from
                    error_labels[pos] = last_valid_step_name
                    errors += 1

            # The current row becomes the reference for the next one
            last_valid_step_num = current_step_num
            last_valid_step_name = current_step_name

    df_result['error_step'] = error_labels

    # Calculate error rate
    error_rate = errors / transitions if transitions > 0 else 0.0

    return error_rate, df_result

In [6]:
# Run the error analysis on the control-group events (df_3)
error_rate_control, df_result_control = calculate_process_errors(
    df=df_3,
    step_column='process_step',
)


In [7]:
# Express the control-group error rate as a percentage with two decimals
control_error_pct = round(error_rate_control * 100, 2)
print("errors rate for control group: ", control_error_pct, '%')

errors rate for control group:  10.42 %


In [8]:
# Number of flagged rows per recorded step in the control group
control_error_counts = df_result_control["error_step"].value_counts()
control_error_counts

error_step
step_3    4454
step_2    2369
step_1    1327
Name: count, dtype: int64

In [9]:
# Run the error analysis on the test-group events (df_2)
error_rate_test, df_result_test = calculate_process_errors(
    df=df_2,
    step_column='process_step',
)

In [10]:
# Number of flagged rows per recorded step in the test group
test_error_counts = df_result_test["error_step"].value_counts()
test_error_counts

error_step
step_2    3840
step_3    3615
step_1    1701
Name: count, dtype: int64

In [11]:
# Report the test-group error rate as a percentage. The label previously
# said "control group" although the value printed is error_rate_test.
print("errors rate for test group: ", round(error_rate_test * 100, 2), '%')

errors rate for control group:  9.58 %


In [12]:
# Destination folder for the flagged datasets. The location is still specific
# to one machine; it is declared once here so that it only has to be changed
# in a single place when the notebook is run somewhere else.
ERRORS_OUTPUT_DIR = Path('/Users/camil/Documents/iron_hack_bootcamp/week_5/2_weeks_project/two_week_project/data/clean')

# index=False keeps the dataframe index out of the exported files
df_result_control.to_csv(ERRORS_OUTPUT_DIR / 'errors_rate_control.csv', index=False)
df_result_test.to_csv(ERRORS_OUTPUT_DIR / 'errors_rate_test.csv', index=False)