In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

The web data comes in two files. We load and inspect both parts, then stack them into a single DataFrame with `pd.concat()`, since they share the same columns.

In [None]:
# Read the two halves of the raw web-data export, first part then second part.
big_df_pt1, big_df_pt2 = (
    pd.read_csv(raw_path)
    for raw_path in (
        '../data/raw/df_final_web_data_pt_1.txt',
        '../data/raw/df_final_web_data_pt_2.txt',
    )
)
big_df_pt1

In [None]:
# Display the second part of the raw data
big_df_pt2

In [None]:
# Stack the two parts row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each part keeps the
# default labels it got from read_csv, so labels are repeated across the two parts and
# label-based lookups or reported row indices become ambiguous.
merged_df = pd.concat([big_df_pt1, big_df_pt2], axis=0, ignore_index=True)
merged_df

In [None]:
# Basic inspecting of the dataframe
def inspect_dataframe(merged_df):
    """
    Print a quick overview of a DataFrame.

    Four labelled sections are written to stdout, in this order: the shape,
    the column index, the dtype of every column, and the number of missing
    values per column. Nothing is returned.
    """
    # (heading, content) pairs, printed one after the other
    sections = (
        ('Check the shape (rows, columns):', merged_df.shape),
        ('\nColumn names:', merged_df.columns),
        ('\nData types:', merged_df.dtypes),
        ('\nMissing values:', merged_df.isnull().sum()),
    )

    for heading, content in sections:
        print(heading)
        print(content)

# clean column names
def clean_column_names(merged_df):
    """
    Normalise the column labels of a DataFrame in place and return it.

    Each label is lower-cased, its spaces become underscores, and every
    remaining character other than a-z, 0-9 or underscore is replaced by an
    underscore. The frame that was passed in is modified and the same object
    is returned.
    """
    disallowed = re.compile(r'[^a-z0-9_]')

    merged_df.columns = [
        disallowed.sub('_', label.lower().replace(" ", "_"))
        for label in merged_df.columns
    ]
    return merged_df

# check unique and empty values
def check_unique_and_empty(merged_df):
    """
    Summarise the unique and missing values of every column in a DataFrame.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input column, indexed by 'Column', with the columns:
        - 'Unique Values': number of distinct non-null values
        - 'Empty Values': number of missing values
        - 'Empty Row Indices': list of index labels of the rows where the value is missing
    """
    summary_columns = ['Column', 'Unique Values', 'Empty Values', 'Empty Row Indices']
    records = []

    for column in merged_df.columns:
        values = merged_df[column]
        is_missing = values.isna()

        records.append({
            'Column': column,
            'Unique Values': values.nunique(),
            'Empty Values': is_missing.sum(),
            'Empty Row Indices': values[is_missing].index.tolist()
        })

    # Naming the columns explicitly keeps set_index working for a frame without columns.
    summary = pd.DataFrame(records, columns=summary_columns)

    # Hand the summary back to the caller instead of discarding it.
    return summary.set_index('Column')
    
# inspect_dataframe prints its own report and returns None, so it is called directly;
# wrapping it in print() only appends a stray "None" line to the output.
inspect_dataframe(merged_df)

# Normalise the column labels. clean_column_names returns the frame it was given,
# so re-binding the name keeps merged_df pointing at the cleaned frame.
merged_df = clean_column_names(merged_df)

# Show the result of the unique/missing-value check, then the frame itself,
# using the notebook's rich display rather than plain-text print().
display(check_unique_and_empty(merged_df))
display(merged_df)

In [None]:
# Show the dtype of every column
merged_df.dtypes

In [None]:
# Convert the date_time column to pandas datetime values with pd.to_datetime
# (the files were read without any date parsing), then show the dtypes again.
merged_df['date_time'] = pd.to_datetime(merged_df['date_time'])
merged_df.dtypes

In [None]:
# Order the events by client, then by visit, then by timestamp
merged_df = merged_df.sort_values(
    ["client_id", "visit_id", "date_time"],
)
merged_df

In [None]:
# Keep a 'start' event only when the next event of the same visit is 'step_1'.
#
# The follow-up step of every row is held in a standalone Series instead of being written
# into merged_df as a temporary column, so merged_df keeps its own columns and nothing has
# to be dropped afterwards.
next_step = merged_df.groupby('visit_id')['process_step'].shift(-1)

# A row is kept when it is not a 'start', or when it is a 'start' directly followed by 'step_1'.
# The last event of a visit has no follow-up (NaN), so a trailing 'start' is removed.
keep_row = (merged_df['process_step'] != 'start') | (next_step == 'step_1')

# Apply the filter and remove exact duplicate rows
filtered_df = merged_df[keep_row].drop_duplicates()

# Display the resulting DataFrame
filtered_df

### For our project, we need to know how much time a client spends on each step and whether they ever reach the last step.

In [None]:
# Process steps, listed in the order a visit is expected to go through them
valid_steps_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Collects one record per step transition
time_spent = list()


def step_order(step):
    """Return the zero-based position of `step` in valid_steps_order.

    Raises ValueError when `step` is not one of the listed steps.
    """
    position = valid_steps_order.index(step)
    return position

# Walk through every visit (one group per client / visitor / visit combination) in row order
# and record each step transition together with the time elapsed since the previously
# accepted event of that visit.
for (client_id, visitor_id, visit_id), group in filtered_df.groupby(['client_id', 'visitor_id', 'visit_id']):
    prev_time = None
    prev_step = None
    last_valid_step_order = -1  # placeholder; it is only compared once a previous event exists

    # Only the timestamp and the step name are used, so iterate over those two columns
    # instead of building a full row Series per event with iterrows().
    for current_time, current_step in zip(group['date_time'], group['process_step']):
        # step_order raises ValueError for a step that is not in valid_steps_order
        current_step_order = step_order(current_step)

        if prev_time is not None:
            repeats_step = current_step_order == last_valid_step_order
            skips_ahead = current_step_order > last_valid_step_order + 1
            if repeats_step or skips_ahead:
                # Ignore the event completely: the reference step and time are left as they
                # are, so the next recorded transition is still measured from them.
                continue

            # What remains is either the next step in order (a valid transition) or a move
            # back to an earlier step, which is recorded and flagged as an error.
            time_spent.append({
                'client_id': client_id,
                'visitor_id': visitor_id,
                'visit_id': visit_id,
                'from_step': prev_step,
                'to_step': current_step,
                'time_spent': current_time - prev_time,
                'is_error': current_step_order < last_valid_step_order
            })

        # The first event of a visit only sets the reference point: it has no previous step,
        # so no transition is recorded for it.
        last_valid_step_order = current_step_order
        prev_time = current_time
        prev_step = current_step

# Create a DataFrame with the results. Naming the columns explicitly keeps the frame
# well-formed even when no transition was recorded at all. Every record has both a
# from_step and a to_step, so no missing-value cleanup is needed.
time_spent_df = pd.DataFrame(
    time_spent,
    columns=['client_id', 'visitor_id', 'visit_id', 'from_step', 'to_step', 'time_spent', 'is_error'],
)

# Display the resulting DataFrame
time_spent_df

In [None]:
# Count how many recorded transitions have is_error set to True vs. False
errors_occured = time_spent_df['is_error'].value_counts()
errors_occured

In [None]:
# Keep only the transitions that move exactly one step forward in valid_steps_order.
# A boolean mask replaces the row-by-row iterrows loop and keeps the columns of
# time_spent_df even when no row qualifies.
step_position = {step: position for position, step in enumerate(valid_steps_order)}
from_position = time_spent_df['from_step'].map(step_position)
to_position = time_spent_df['to_step'].map(step_position)

# Create a DataFrame with only valid transitions
valid_transitions_df = time_spent_df[to_position == from_position + 1]

# Average time spent on each forward transition, rounded to whole seconds.
# time_spent holds time differences, and Series.round(0) works on decimal places of numeric
# data rather than on time units, so the rounding goes through the .dt accessor.
avg_time_spent = (
    valid_transitions_df
    .groupby(['from_step', 'to_step'])['time_spent']
    .mean()
    .dt.round('s')
    .reset_index()
    # Rename the column for clarity
    .rename(columns={'time_spent': 'avg_time_spent'})
)

# Display the resulting DataFrame
avg_time_spent

Some clients never completed the process, so we need to identify them and see how many clients reached each step.

In [None]:
# Visits with at least one transition into "confirm" count as completed.
# Note: the values collected below are visit_id values.
reached_confirm = time_spent_df['to_step'].eq('confirm')
completed_clients = time_spent_df.loc[reached_confirm, 'visit_id'].unique()

# Every other visit in time_spent_df never reached "confirm"
belongs_to_completed = time_spent_df['visit_id'].isin(completed_clients)
incomplete_clients = time_spent_df.loc[~belongs_to_completed, 'visit_id'].unique()

# Report the visits that did not complete
print(f"Clients who didn't complete the process: {incomplete_clients}")

# Last recorded to_step of each visit, complete or not
last_steps = time_spent_df.groupby('visit_id')['to_step'].last()

print(f"Last step for each client: \n{last_steps}")

In [None]:
# Number of distinct visits that appear in time_spent_df
total_visits = time_spent_df['visit_id'].nunique()

# Steps to report on; 'start' is left out
steps = ['step_1', 'step_2', 'step_3', 'confirm'] 

# For each step: share (in percent) of visits with at least one transition into it
step_percentages = {
    step: (
        time_spent_df.loc[time_spent_df['to_step'] == step, 'visit_id'].nunique()
        / total_visits
    ) * 100
    for step in steps
}

# Print the results
for step, percentage in step_percentages.items():
    print(f"Percentage of visits who reached {step}: {percentage:.2f}%")

## Exporting the tables 

In [None]:
# Write the filtered event data to the clean-data folder.
# to_csv is called with its defaults, so the row index is written as the first column.
filtered_df.to_csv('../data/clean/web_data_merged_filtered.csv')

In [None]:
# Write the step-transition table to the clean-data folder.
# to_csv is called with its defaults, so the row index is written as the first column.
time_spent_df.to_csv('../data/clean/time_spent.csv')