In [1]:
#Setup and Data Loading
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture # Python Implementation of MClust
import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
BASE_DIR = '../dataset'  # Go up one level, then into 'dataset'

df_events = pd.read_csv(f'{BASE_DIR}/events.csv')
items_part1 = pd.read_csv(f'{BASE_DIR}/item_properties_part1.csv')
items_part2 = pd.read_csv(f'{BASE_DIR}/item_properties_part2.csv')

df_items = pd.concat([items_part1, items_part2], ignore_index=True)

print("Data loaded successfully.")

Data loaded successfully.


In [3]:
#Data Cleaning and Outlier Removal

# 2.1 Convert timestamp and handle missing transactions
df_events['timestamp'] = pd.to_datetime(df_events['timestamp'], unit='ms')
# Fill missing transaction IDs with 0 to preserve non-transactional events (views/carts)
df_events['transactionid'] = df_events['transactionid'].fillna(0) 

# 2.2 Remove high-activity outliers (Bots/Scrapers)
# Find event counts per visitor
visitor_counts = df_events['visitorid'].value_counts()

# Identify the 99.9th percentile threshold for event counts
threshold = visitor_counts.quantile(0.999) 
bot_visitor_ids = visitor_counts[visitor_counts > threshold].index

# Filter the events dataframe
df_events = df_events[~df_events['visitorid'].isin(bot_visitor_ids)]
print(f"Removed {len(bot_visitor_ids)} bot-like visitors based on 99.9th percentile event count.")
print(f"Remaining events: {len(df_events)}")

Removed 1352 bot-like visitors based on 99.9th percentile event count.
Remaining events: 2536174


In [4]:
#Advanced Sessionization

# 3.1 Session Creation using a 30-minute (1800 second) inactivity timeout
df_events = df_events.sort_values(['visitorid', 'timestamp']).reset_index(drop=True)

# Calculate time difference between consecutive events for the same visitor
df_events['time_diff'] = df_events.groupby('visitorid')['timestamp'].diff().dt.total_seconds().fillna(0)

# Mark the start of a new session if time difference > 1800 seconds
df_events['new_session'] = (df_events['time_diff'] > 1800).astype(int)

# Create a session_id unique within each visitor
df_events['session_id'] = df_events.groupby('visitorid')['new_session'].cumsum()

# Create a globally unique session identifier
df_events['global_session_id'] = df_events['visitorid'].astype(str) + '_' + df_events['session_id'].astype(str)

print(f"Total unique sessions created: {df_events['global_session_id'].nunique()}")

Total unique sessions created: 1722864


In [5]:
# Temporal Merging (If Item Properties are Time-Dependent)

# NOTE: This step is only necessary if df_items contains time-sensitive properties 
# (e.g., price changes, stock status) that were used in the original notebook.
# We include it here for completeness as an example of good feature engineering.

# 1. Prepare item properties: convert timestamp and sort
df_items['timestamp'] = pd.to_datetime(df_items['timestamp'], unit='ms')
df_items = df_items.sort_values('timestamp')

# 2. Merge: Join each event with the most recent item property status *before* that event.
# This requires both dataframes to be sorted on the 'on' column ('timestamp').
df_events = pd.merge_asof(
    df_events.sort_values('timestamp'),
    df_items[['itemid', 'timestamp', 'property', 'value']],
    on='timestamp', 
    by='itemid', 
    direction='backward',
    suffixes=('', '_itemprop')
)

print("Temporal merge with item properties completed.")

Temporal merge with item properties completed.


In [6]:
# Create Behavioral and Ratio Features

# 1. Aggregate event data to the session level
agg_features = df_events.groupby('global_session_id').agg(
    # Core Counts
    total_events=('event', 'count'),
    view_count=('event', lambda x: (x == 'view').sum()),
    addtocart_count=('event', lambda x: (x == 'addtocart').sum()),
    transaction_count=('event', lambda x: (x == 'transaction').sum()),
    # Unique Items Viewed (for variety measure)
    unique_items_viewed=('itemid', lambda x: x[x.index.isin(df_events[df_events['event'] == 'view'].index)].nunique()),
    # Temporal features
    session_duration_sec=('timestamp', lambda x: (x.max() - x.min()).total_seconds()),
    session_hour_of_day=('timestamp', lambda x: x.iloc[0].hour),
    # Target feature (for analysis later)
    is_buyer=('transactionid', lambda x: 1 if (x > 0).any() else 0)
).reset_index()

# 2. Calculate Key Behavioral Ratio Features

# View-to-Cart Ratio: Efficiency of viewing leading to cart addition
agg_features['view_to_cart_ratio'] = (
    agg_features['addtocart_count'] / agg_features['view_count'].replace(0, np.nan)
).fillna(0)

# Event Rate per Second: Intensity of user engagement
# Use a small constant (1e-6) instead of 0 for duration to prevent division by zero for 1-event sessions
agg_features['event_rate_per_sec'] = (
    agg_features['total_events'] / agg_features['session_duration_sec'].replace(0, 1e-6)
)

print("Aggregated and ratio features created.")
print(agg_features[['total_events', 'view_to_cart_ratio', 'event_rate_per_sec']].describe())

KeyboardInterrupt: 

In [None]:
#  MClust/GMM Clustering Implementation

# 1. Determine the optimal number of clusters (K)
# Use Bayesian Information Criterion (BIC) to select the best K and covariance type
N_COMPONENTS = range(2, 11)
bic_scores = []
cov_types = ['full', 'tied', 'diag', 'spherical']

for n_comp in N_COMPONENTS:
    for cov in cov_types:
        try:
            gmm = GaussianMixture(n_components=n_comp, covariance_type=cov, random_state=42)
            gmm.fit(X_scaled)
            bic_scores.append((gmm.bic(X_scaled), n_comp, cov))
        except ValueError:
            # Catch cases where GMM fails to converge for certain settings
            continue

# Find the configuration with the minimum BIC score
best_bic, best_k, best_cov = min(bic_scores, key=lambda x: x[0])
print(f"Optimal GMM Configuration: K={best_k}, Covariance Type='{best_cov}', BIC={best_bic:.2f}")


# 2. Run the final GMM model
final_gmm = GaussianMixture(n_components=best_k, covariance_type=best_cov, random_state=42)
final_gmm.fit(X_scaled)
agg_features['gmm_cluster'] = final_gmm.predict(X_scaled)

print("\nGMM Clustering complete. Clusters assigned to session features.")