In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

In [3]:
merged = pd.read_csv('../data/clean/web_data_merged_filtered.csv')
merged = merged.drop(columns = 'Unnamed: 0')
merged

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9056452,306992881_89423906595,1000165_4190026492_760066,start,2017-06-04 01:07:29
1,9056452,306992881_89423906595,1000165_4190026492_760066,step_1,2017-06-04 01:07:32
2,9056452,306992881_89423906595,1000165_4190026492_760066,step_2,2017-06-04 01:07:56
3,9056452,306992881_89423906595,1000165_4190026492_760066,step_3,2017-06-04 01:09:13
4,9056452,306992881_89423906595,1000165_4190026492_760066,confirm,2017-06-04 01:09:50
...,...,...,...,...,...
549204,7149380,483112224_46340533900,999992932_41666455053_671149,start,2017-06-06 15:46:03
549205,7149380,483112224_46340533900,999992932_41666455053_671149,step_1,2017-06-06 15:46:24
549206,7149380,483112224_46340533900,999992932_41666455053_671149,step_2,2017-06-06 15:47:32
549207,7149380,483112224_46340533900,999992932_41666455053_671149,step_3,2017-06-06 16:01:46


In [4]:
time_spent = pd.read_csv('../data/clean/time_spent.csv')
time_spent = time_spent.drop(columns = 'Unnamed: 0')
time_spent

Unnamed: 0,client_id,visitor_id,visit_id,from_step,to_step,time_spent,is_error
0,169,201385055_71273495308,749567106_99161211863_557568,start,step_1,0 days 00:00:09,False
1,169,201385055_71273495308,749567106_99161211863_557568,step_1,step_2,0 days 00:00:46,False
2,169,201385055_71273495308,749567106_99161211863_557568,step_2,step_3,0 days 00:01:34,False
3,169,201385055_71273495308,749567106_99161211863_557568,step_3,confirm,0 days 00:01:04,False
4,546,475037402_89828530214,731811517_9330176838_94847,start,step_1,0 days 00:00:10,False
...,...,...,...,...,...,...,...
413379,9999839,948322592_28990736246,715530422_68620416793_515645,step_3,confirm,0 days 00:00:35,False
413380,9999875,738878760_1556639849,931268933_219402947_599432,start,step_1,0 days 00:00:07,False
413381,9999875,738878760_1556639849,931268933_219402947_599432,step_1,step_2,0 days 00:01:39,False
413382,9999875,738878760_1556639849,931268933_219402947_599432,step_2,step_3,0 days 00:03:11,False


In [5]:
control = pd.read_csv('../data/clean/client_id_control.csv')
control.head(5)

Unnamed: 0,client_id
0,1028
1,1104
2,1186
3,1195
4,1197


In [6]:
merged_df = control.merge(merged, on = 'client_id', how = 'left')
merged_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,1028,,,,
1,1104,194240915_18158000533,543158812_46395476577_767725,start,2017-06-12 07:49:18
2,1104,194240915_18158000533,643221571_99977972121_69283,start,2017-06-20 22:31:33
3,1186,446844663_31615102958,507052512_11309370126_442139,start,2017-04-08 15:59:16
4,1186,446844663_31615102958,795373564_99931517312_810896,start,2017-04-08 18:05:02
...,...,...,...,...,...
105188,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,2017-04-05 15:41:39
105189,9997470,91394485_75296404278,655572400_94971272893_411965,start,2017-04-07 16:11:03
105190,9997470,395791369_55562604618,904791598_9725982898_416914,start,2017-04-20 20:04:38
105191,9998346,,,,


In [7]:
# Basic inspecting of the dataframe
def inspect_dataframe(merged_df):
    """
    Function to perform basic inspection on a DataFrame: 
    shape, column names, data types, and missing values.
    
    """

    print('Check the shape (rows, columns):')
    print(merged_df.shape)

    print('\nColumn names:')
    print(merged_df.columns)


    print('\nData types:')
    print(merged_df.dtypes)


    print('\nMissing values:')
    print(merged_df.isnull().sum())

# clean column names
def clean_column_names(merged_df):
    """
    Function to clean the column names of a DataFrame:
    - Convert to lowercase
    - Replace spaces with underscores
    - Remove or replace special characters with underscores

    """

    def clean_name(name):
        name = name.lower()
        name = name.replace(" ", "_")
        name = re.sub(r'[^a-z0-9_]', '_', name)
        return name
    
    merged_df.columns = [clean_name(col) for col in merged_df.columns]    
    return merged_df

# check unique and empty values
def check_unique_and_empty(merged_df):
    """
    Function to print the unique and empty values for each column in a DataFrame.
    
    """
    result = []
    
    for column in merged_df.columns:
        unique_values = merged_df[column].nunique()
        empty_values = merged_df[column].isna().sum()
        
        empty_rows = merged_df[column][merged_df[column].isna()].index.tolist()
        
        result.append({
            'Column': column,
            'Unique Values': unique_values,
            'Empty Values': empty_values,
            'Empty Row Indices': empty_rows
        })
    
    merged_df = pd.DataFrame(result)
    
    merged_df.set_index('Column', inplace=True)
    
print(inspect_dataframe(merged_df)) 
print(clean_column_names(merged_df))
print(check_unique_and_empty(merged_df))
display(merged_df)

Check the shape (rows, columns):
(105193, 5)

Column names:
Index(['client_id', 'visitor_id', 'visit_id', 'process_step', 'date_time'], dtype='object')

Data types:
client_id        int64
visitor_id      object
visit_id        object
process_step    object
date_time       object
dtype: object

Missing values:
client_id          0
visitor_id      1780
visit_id        1780
process_step    1780
date_time       1780
dtype: int64
None
        client_id             visitor_id                      visit_id  \
0            1028                    NaN                           NaN   
1            1104  194240915_18158000533  543158812_46395476577_767725   
2            1104  194240915_18158000533   643221571_99977972121_69283   
3            1186  446844663_31615102958  507052512_11309370126_442139   
4            1186  446844663_31615102958  795373564_99931517312_810896   
...           ...                    ...                           ...   
105188    9997391   494669706_3354361161   84654

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,1028,,,,
1,1104,194240915_18158000533,543158812_46395476577_767725,start,2017-06-12 07:49:18
2,1104,194240915_18158000533,643221571_99977972121_69283,start,2017-06-20 22:31:33
3,1186,446844663_31615102958,507052512_11309370126_442139,start,2017-04-08 15:59:16
4,1186,446844663_31615102958,795373564_99931517312_810896,start,2017-04-08 18:05:02
...,...,...,...,...,...
105188,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,2017-04-05 15:41:39
105189,9997470,91394485_75296404278,655572400_94971272893_411965,start,2017-04-07 16:11:03
105190,9997470,395791369_55562604618,904791598_9725982898_416914,start,2017-04-20 20:04:38
105191,9998346,,,,


In [8]:
merged_df.dtypes

client_id        int64
visitor_id      object
visit_id        object
process_step    object
date_time       object
dtype: object

In [9]:
# Changing the date_time column values from objects to 
merged_df['date_time'] = pd.to_datetime(merged_df['date_time'])
merged_df.dtypes

client_id                int64
visitor_id              object
visit_id                object
process_step            object
date_time       datetime64[ns]
dtype: object

In [10]:
merged_df = merged_df.sort_values(by=["client_id", "visitor_id", "visit_id", "date_time"])
merged_df = merged_df.dropna()
merged_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
1,1104,194240915_18158000533,543158812_46395476577_767725,start,2017-06-12 07:49:18
2,1104,194240915_18158000533,643221571_99977972121_69283,start,2017-06-20 22:31:33
3,1186,446844663_31615102958,507052512_11309370126_442139,start,2017-04-08 15:59:16
4,1186,446844663_31615102958,795373564_99931517312_810896,start,2017-04-08 18:05:02
5,1186,446844663_31615102958,795373564_99931517312_810896,step_1,2017-04-08 18:05:13
...,...,...,...,...,...
105187,9997391,494669706_3354361161,84654768_90613632047_633963,step_2,2017-04-05 15:41:34
105188,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,2017-04-05 15:41:39
105190,9997470,395791369_55562604618,904791598_9725982898_416914,start,2017-04-20 20:04:38
105189,9997470,91394485_75296404278,655572400_94971272893_411965,start,2017-04-07 16:11:03


In [11]:
# Identifying and dropping the visit_ids that don't have 'start' in the process_step column
visits_with_start = merged_df[merged_df['process_step'] == 'start']['visit_id'].unique()
filtered_df_with_start = merged_df[merged_df['visit_id'].isin(visits_with_start)]
display(filtered_df_with_start)

# Identifying and dropping the client_ids that don't have 'start' in the process_step column
clients_with_start = filtered_df_with_start[filtered_df_with_start['process_step'] == 'start']['client_id'].unique()
merged_df = filtered_df_with_start[filtered_df_with_start['client_id'].isin(clients_with_start)]
display(merged_df)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
1,1104,194240915_18158000533,543158812_46395476577_767725,start,2017-06-12 07:49:18
2,1104,194240915_18158000533,643221571_99977972121_69283,start,2017-06-20 22:31:33
3,1186,446844663_31615102958,507052512_11309370126_442139,start,2017-04-08 15:59:16
4,1186,446844663_31615102958,795373564_99931517312_810896,start,2017-04-08 18:05:02
5,1186,446844663_31615102958,795373564_99931517312_810896,step_1,2017-04-08 18:05:13
...,...,...,...,...,...
105187,9997391,494669706_3354361161,84654768_90613632047_633963,step_2,2017-04-05 15:41:34
105188,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,2017-04-05 15:41:39
105190,9997470,395791369_55562604618,904791598_9725982898_416914,start,2017-04-20 20:04:38
105189,9997470,91394485_75296404278,655572400_94971272893_411965,start,2017-04-07 16:11:03


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
1,1104,194240915_18158000533,543158812_46395476577_767725,start,2017-06-12 07:49:18
2,1104,194240915_18158000533,643221571_99977972121_69283,start,2017-06-20 22:31:33
3,1186,446844663_31615102958,507052512_11309370126_442139,start,2017-04-08 15:59:16
4,1186,446844663_31615102958,795373564_99931517312_810896,start,2017-04-08 18:05:02
5,1186,446844663_31615102958,795373564_99931517312_810896,step_1,2017-04-08 18:05:13
...,...,...,...,...,...
105187,9997391,494669706_3354361161,84654768_90613632047_633963,step_2,2017-04-05 15:41:34
105188,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,2017-04-05 15:41:39
105190,9997470,395791369_55562604618,904791598_9725982898_416914,start,2017-04-20 20:04:38
105189,9997470,91394485_75296404278,655572400_94971272893_411965,start,2017-04-07 16:11:03


### For our project, we need to know how much time a client spends on each step and if they ever reached the last step.

In [12]:
time_spent_df = control.merge(time_spent, on = 'client_id', how = 'left')
time_spent_df = time_spent_df.dropna()
time_spent_df

Unnamed: 0,client_id,visitor_id,visit_id,from_step,to_step,time_spent,is_error
2,1186,446844663_31615102958,795373564_99931517312_810896,start,step_1,0 days 00:00:11,False
3,1186,446844663_31615102958,795373564_99931517312_810896,step_1,step_2,0 days 00:00:11,False
4,1195,766842522_69992551638,393817425_39015278493_996341,start,step_1,0 days 00:00:33,False
5,1195,766842522_69992551638,393817425_39015278493_996341,step_1,step_2,0 days 00:01:38,False
6,1195,766842522_69992551638,393817425_39015278493_996341,step_2,step_3,0 days 00:00:31,False
...,...,...,...,...,...,...,...
80228,9997391,494669706_3354361161,84654768_90613632047_633963,step_2,step_3,0 days 00:00:56,False
80229,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,step_2,0 days 00:01:08,True
80230,9997391,494669706_3354361161,84654768_90613632047_633963,step_2,step_3,0 days 00:00:14,False
80231,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,step_2,0 days 00:00:17,True


In [13]:
time_spent_df['time_spent'] = pd.to_timedelta(time_spent_df['time_spent'], errors='coerce')
time_spent_df['is_error'] = time_spent_df['is_error'].astype('bool')

In [14]:
time_spent_df.dtypes

client_id               int64
visitor_id             object
visit_id               object
from_step              object
to_step                object
time_spent    timedelta64[ns]
is_error                 bool
dtype: object

In [15]:
errors_rate = time_spent_df['is_error'].value_counts().values[1] / len(time_spent_df) * 100
print(time_spent_df['is_error'].value_counts().values[1])
print(len(time_spent_df))
print(f'Error rate for the control group: {errors_rate:.4f} %')

4053
74848
Error rate for the control group: 5.4150 %


In [16]:
valid_steps_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
# Filter transitions to include only those that follow the valid steps order
valid_transitions = []

# Iterate over the time_spent_df and filter valid transitions
for _, row in time_spent_df.iterrows():
    from_step = row['from_step']
    to_step = row['to_step']
    
    # Check if the transition follows the valid steps order (i.e., from a step to the next one in order)
    if from_step is not None and to_step is not None:
        if valid_steps_order.index(to_step) == valid_steps_order.index(from_step) + 1:
            valid_transitions.append(row)

# Create a DataFrame with only valid transitions
valid_transitions_df = pd.DataFrame(valid_transitions)

# Calculate the average time spent on each transition
avg_time_spent = valid_transitions_df.groupby(['from_step', 'to_step'])['time_spent'].mean().round(0).reset_index()

# Rename the columns for clarity
avg_time_spent = avg_time_spent.rename(columns={'time_spent': 'avg_time_spent'})

# Display the resulting DataFrame
avg_time_spent


Unnamed: 0,from_step,to_step,avg_time_spent
0,start,step_1,0 days 00:01:04.276945972
1,step_1,step_2,0 days 00:00:37.193225006
2,step_2,step_3,0 days 00:01:33.193533460
3,step_3,confirm,0 days 00:02:14.721029065


In [17]:
# Filter rows where 'is_error' is True
errors_df = time_spent_df[time_spent_df['is_error'] == True]

# Calculate the average time spent on errors
avg_time_per_error = errors_df['time_spent'].mean()

# Display the results
print(f"Average time spent per error: {avg_time_per_error} seconds")

errors_df

Average time spent per error: 0 days 00:01:57.116456945 seconds


Unnamed: 0,client_id,visitor_id,visit_id,from_step,to_step,time_spent,is_error
11,1197,753759429_54481946928,71862471_21202285428_848395,step_3,step_2,0 days 00:00:11,True
56,9229,810786857_11889939941,794978950_70484965288_55630,step_2,step_1,0 days 00:00:04,True
57,9229,810786857_11889939941,794978950_70484965288_55630,step_1,start,0 days 00:00:52,True
89,13009,360435735_44466390795,377986493_6391607481_598681,step_3,step_2,0 days 00:00:50,True
92,13009,360435735_44466390795,377986493_6391607481_598681,confirm,step_3,0 days 00:00:00,True
...,...,...,...,...,...,...,...
80099,9981420,307752069_78230860567,979430919_71167162970_145409,step_1,start,0 days 00:06:01,True
80108,9981755,816308086_85375401328,853111235_60870063623_712583,step_3,step_2,0 days 00:00:24,True
80181,9992871,777307149_19636412452,427140349_70169218942_291297,step_3,step_2,0 days 00:00:17,True
80229,9997391,494669706_3354361161,84654768_90613632047_633963,step_3,step_2,0 days 00:01:08,True


In [18]:
errors = pd.DataFrame({'to_step': 'error', "Percentage": [errors_rate], "avg_time_spent": [avg_time_per_error]})
errors.to_csv('../data/intermediate_steps/errors_control.csv')
errors

Unnamed: 0,to_step,Percentage,avg_time_spent
0,error,5.414974,0 days 00:01:57.116456945


Some clients didn't ever complete the process. Therefore, we need to ensure they are identified and see how many clients reached which step.

In [19]:
# Identify clients who didn't complete the whole process (didn't reach "confirm")
completed_clients = time_spent_df[time_spent_df['to_step'] == 'confirm']['visit_id'].unique()

# Identify clients who didn't reach "confirm"
incomplete_clients = time_spent_df[~time_spent_df['visit_id'].isin(completed_clients)]['visit_id'].unique()

# Show which clients are incomplete
print(f"Clients who didn't complete the process: {incomplete_clients}")

# Check the last step they reached
last_steps = time_spent_df.groupby('visit_id')['to_step'].last()

# Show the last step for each client (whether complete or incomplete)
print(f"Last step for each client: \n{last_steps}")

Clients who didn't complete the process: ['795373564_99931517312_810896' '53540785_76819002881_890663'
 '216367358_55024634523_791141' ... '369643458_97718322961_329420'
 '984781905_40925324211_645419' '84654768_90613632047_633963']
Last step for each client: 
visit_id
100037962_47432393712_705583     step_1
10006594_66157970412_679648      step_3
10007589_47780784567_391490     confirm
100096068_8301717872_987164      step_1
100195539_90558871716_822894     step_1
                                 ...   
999720028_44146654962_446209     step_1
999859408_41720215615_938916    confirm
999890184_77121766521_746360     step_2
999936732_27995195904_324897     step_1
999985675_64610694964_443659    confirm
Name: to_step, Length: 20962, dtype: object


In [20]:
# Get the total number of unique visits
total_visits = merged_df['visit_id'].nunique()

# Identify the unique steps, excluding 'start' because it didn't make sense to keep it
steps = ['start','step_1', 'step_2', 'step_3', 'confirm'] 

# Initialize a dictionary to store the percentage of clients reaching each step
step_percentages = {}

# Iterate through each step and calculate the percentage
for step in steps:
    visits_reached_step = merged_df[merged_df['process_step'] == step]['visit_id'].nunique()
    # Calculate the percentage of visits who reached this step
    step_percentages[step] = (visits_reached_step / total_visits) * 100

# Print the results
for step, percentage in step_percentages.items():
    print(f"Percentage of visits who reached {step}: {percentage:.2f}%")

completion_rate_step = pd.DataFrame(list(step_percentages.items()), columns=['Step', 'Percentage'])
completion_rate_step.to_csv('../data/intermediate_steps/completion_rate_step_control.csv')
display(completion_rate_step)

Percentage of visits who reached start: 100.00%
Percentage of visits who reached step_1: 73.51%
Percentage of visits who reached step_2: 61.38%
Percentage of visits who reached step_3: 54.80%
Percentage of visits who reached confirm: 46.19%


Unnamed: 0,Step,Percentage
0,start,100.0
1,step_1,73.509307
2,step_2,61.380447
3,step_3,54.79721
4,confirm,46.187822


## Exporting the tables 

In [21]:
merged_df.to_csv('../data/clean/merged_control.csv')

In [22]:
time_spent_df.to_csv('../data/clean/time_spent_control.csv')

In [23]:
avg_time_spent.to_csv('../data/intermediate_steps/average_time_spent_control.csv')