In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import random

# Utility function for clean evaluation
def evaluate_model(y_true, y_pred, model_name, target_names):
    """Prints classification report and plots confusion matrix."""
    print(f"--- {model_name} Classification Report ---")
    print(classification_report(y_true, y_pred, target_names=target_names))

    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation='vertical')
    ax.set_title(f'{model_name} Confusion Matrix')
    plt.tight_layout()
    plt.savefig(f'{model_name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()
    print(f"Saved {model_name} confusion matrix as: {model_name.lower().replace(' ', '_')}_confusion_matrix.png")

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# --- Define the Application Class Labels ---
label_map = {
    0: "Weekday OOS Bouncers",
    1: "Weekday 'Quick-Look' Visitors",
    2: "Window Shoppers",
    3: "Determined Buyers",
    4: "Weekday OOS Bouncers (No Category)",
    5: "High-Intent Shoppers",
    6: "Weekend 'Quick-Look' Visitors"
}
target_names = list(label_map.values())
n_clusters = len(target_names) # 7

# --- 1. Load Data ---
# Note: Assuming this file contains the unscaled data and the 'cluster' column
try:
    df_features = pd.read_csv('../session_feature_matrix_with_k7_clusters.csv', index_col='session_id')
    df_features = df_features.rename(columns={'cluster': 'label'})
    print(f"Loaded feature matrix with shape: {df_features.shape}")
except FileNotFoundError:
    print("Error: 'session_feature_matrix_with_k7_clusters.csv' not found.")
    df_features = None

if df_features is not None:
    features_for_clustering = [
        'session_duration_sec', 'total_events', 'unique_items_viewed',
        'unique_categories_viewed', 'avg_item_availability',
        'session_hour_of_day', 'view_count', 'addtocart_count',
        'event_rate_per_sec', 'view_to_cart_ratio',
        'category_spread_ratio', 'is_weekend'
    ]

    # --- 2. Separate Features (X) and Target (y) ---
    X_raw = df_features[features_for_clustering].values
    y = df_features['label'].values
    print(f"Target 'y' classes: {sorted(list(np.unique(y)))}")

    # --- 3. Split Data FIRST (Crucial for proper scaling) ---
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=42
    )

    # --- 4. Scale Features (Fit only on X_train_raw) ---
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw) # Transform test set using TRAIN mean/std

    # --- 5. Balance the Training Data (X_train_bal, y_train_bal) ---
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_bal, y_train_bal = rus.fit_resample(X_train_scaled, y_train)


    print(f"\nOriginal train shape: {X_train_scaled.shape}")
    print(f"Balanced train shape: {X_train_bal.shape}")
    print(f"Test shape: {X_test_scaled.shape}")
    print(f"Classes in balanced train set: {sorted(list(set(y_train_bal)))}")

    # Rename scaled variables for model input
    X_train_bal = X_train_bal
    X_test = X_test_scaled

    print("\nData preparation complete. Ready for K-Means/GMM.")

Loaded feature matrix with shape: (1722864, 17)
Target 'y' classes: [0, 1, 2, 3, 4, 5, 6]

Original train shape: (1378291, 12)
Balanced train shape: (161350, 12)
Test shape: (344573, 12)
Classes in balanced train set: [0, 1, 2, 3, 4, 5, 6]

Data preparation complete. Ready for K-Means/GMM.


In [18]:
# --- K-Means: 1. Fit Clustering on Balanced Training Data ---
kmeans_eval = MiniBatchKMeans(
    n_clusters=n_clusters,
    random_state=42,
    n_init='auto',
    batch_size=2048
)

# Fit on the balanced training data, as per your constraint
kmeans_eval.fit(X_train_bal)
train_cluster_labels = kmeans_eval.labels_

# --- K-Means: 2. Transductive Mapping (Majority Vote) ---
cluster_to_class_map_km = {}
# Ensure we use the numerical labels from the training data for mapping
possible_classes = sorted(list(set(y_train_bal)))

print("\n--- K-Means Transductive Mapping ---")
for k in range(n_clusters):
    # Find indices of all training points in this cluster
    train_indices_in_cluster = np.where(train_cluster_labels == k)[0]

    if len(train_indices_in_cluster) > 0:
        cluster_true_labels = y_train_bal[train_indices_in_cluster]

        # Determine the majority class (numerical label)
        # mode returns an object, we take the first element of the array of modes
       # Determine the majority class (numerical label) using robust bincount method
        counts = np.bincount(cluster_true_labels)
        # argmax returns the index of the highest count, which is the majority class label
        majority_class_num = np.argmax(counts)
        cluster_to_class_map_km[k] = majority_class_num
        print(f"Cluster {k} -> Class {majority_class_num} ({label_map[majority_class_num]})")
    else:
        # Handle "no labels" case by choosing randomly
        random_class_num = random.choice(possible_classes)
        cluster_to_class_map_km[k] = random_class_num
        print(f"Cluster {k} -> NO LABELS. Chosen randomly: {random_class_num} ({label_map[random_class_num]})")

# --- K-Means: 3. Predict and Evaluate on Test Data ---
# Predict the cluster ID for the test data
test_cluster_labels = kmeans_eval.predict(X_test)

# Map the cluster ID to the final application class label (numerical)
y_pred_km = np.array([cluster_to_class_map_km[cluster_id] for cluster_id in test_cluster_labels])

# Evaluate using the defined target_names
evaluate_model(y_test, y_pred_km, "K-Means Transductive", target_names)


--- K-Means Transductive Mapping ---
Cluster 0 -> Class 5 (High-Intent Shoppers)
Cluster 1 -> Class 2 (Window Shoppers)
Cluster 2 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 3 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 4 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 5 -> Class 6 (Weekend 'Quick-Look' Visitors)
Cluster 6 -> Class 0 (Weekday OOS Bouncers)
--- K-Means Transductive Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                    precision    recall  f1-score   support

              Weekday OOS Bouncers       1.00      1.00      1.00     40517
     Weekday 'Quick-Look' Visitors       0.00      0.00      0.00     41972
                   Window Shoppers       0.89      1.00      0.94     66331
                 Determined Buyers       0.00      0.00      0.00     57043
Weekday OOS Bouncers (No Category)       0.75      1.00      0.86      5762
              High-Intent Shoppers       0.45      1.00      0.62     72754
     Weekend 'Quick-Look' Visitors       0.96      0.97      0.96     60194

                          accuracy                           0.71    344573
                         macro avg       0.58      0.71      0.63    344573
                      weighted avg       0.56      0.71      0.61    344573

Saved K-Means Transductive confusion matrix as: k-means_transductive_confusion_matrix.png


In [24]:
try:
    # This file has the scaled features AND the original cluster label
    df_features = pd.read_csv('df_user_features_with_clusters.csv', index_col='session_id')
    df_features = df_features.rename(columns={'cluster_gmm': 'label'})
    print(f"Loaded feature matrix with shape: {df_features.shape}")
except FileNotFoundError:
    print("Error: 'df_user_features_with_clusters.csv' not found.")
    df_features = None # Handle error appropriately

# --- 2. Define Features (X) and MULTI-CLASS Target (y) ---
if df_features is not None:
    features_for_clustering = [
        'session_duration_sec', 'total_events', 'unique_items_viewed',
        'unique_categories_viewed', 'avg_item_availability',
        'session_hour_of_day', 'view_count', 'addtocart_count',
        'event_rate_per_sec', 'view_to_cart_ratio',
        'category_spread_ratio', 'is_weekend'
    ]

    # We are using the pre-scaled data from the file
    X = df_features[features_for_clustering].values

    # *** THIS IS THE KEY CHANGE ***
    # The target 'y' is now the multi-class 'label' (0-6)
    y = df_features['label'].values
    print(f"Target 'y' classes: {sorted(list(np.unique(y)))}")

    # --- 3. Split Data FIRST (Crucial for proper scaling) ---
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=42
    )

    # --- 4. Scale Features (Fit only on X_train_raw) ---
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw) # Transform test set using TRAIN mean/std

    # --- 5. Balance the Training Data (X_train_bal, y_train_bal) ---
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_bal, y_train_bal = rus.fit_resample(X_train_scaled, y_train)

    # Rename scaled variables for model input
    X_train_bal = X_train_bal
    X_test = X_test_scaled
# --- GMM: 1. Fit Clustering on Balanced Training Data ---
gmm_eval = GaussianMixture(
    n_components=n_clusters,
    random_state=42,
    covariance_type='spherical',
    n_init=10
)

# Fit on the balanced training data, as per your constraint
gmm_eval.fit(X_train_bal)
train_cluster_labels = gmm_eval.predict(X_train_bal)

# --- GMM: 2. Transductive Mapping (Majority Vote) ---
cluster_to_class_map_gmm = {}
possible_classes = sorted(list(set(y_train_bal)))

print("\n--- GMM Transductive Mapping ---")
for k in range(n_clusters):
    # Find indices of all training points in this cluster
    train_indices_in_cluster = np.where(train_cluster_labels == k)[0]

    if len(train_indices_in_cluster) > 0:
        cluster_true_labels = y_train_bal[train_indices_in_cluster]

        # Determine the majority class (numerical label)
        # Determine the majority class (numerical label) using robust bincount method
        counts = np.bincount(cluster_true_labels)
        # argmax returns the index of the highest count, which is the majority class label
        majority_class_num = np.argmax(counts)
        cluster_to_class_map_gmm[k] = majority_class_num
        print(f"Cluster {k} -> Class {majority_class_num} ({label_map[majority_class_num]})")
    else:
        # Handle "no labels" case by choosing randomly
        random_class_num = random.choice(possible_classes)
        cluster_to_class_map_gmm[k] = random_class_num
        print(f"Cluster {k} -> NO LABELS. Chosen randomly: {random_class_num} ({label_map[random_class_num]})")

# --- GMM: 3. Predict and Evaluate on Test Data ---
# Predict the cluster ID for the test data
test_cluster_labels = gmm_eval.predict(X_test)

# Map the cluster ID to the final application class label (numerical)
y_pred_gmm = np.array([cluster_to_class_map_gmm[cluster_id] for cluster_id in test_cluster_labels])

# Evaluate using the defined target_names
evaluate_model(y_test, y_pred_gmm, "GMM Transductive", target_names)

Error: 'df_user_features_with_clusters.csv' not found.

--- GMM Transductive Mapping ---
Cluster 0 -> Class 3 (Determined Buyers)
Cluster 1 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 2 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 3 -> Class 6 (Weekend 'Quick-Look' Visitors)
Cluster 4 -> Class 2 (Window Shoppers)
Cluster 5 -> Class 0 (Weekday OOS Bouncers)
Cluster 6 -> Class 5 (High-Intent Shoppers)
--- GMM Transductive Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                    precision    recall  f1-score   support

              Weekday OOS Bouncers       1.00      0.94      0.97     40517
     Weekday 'Quick-Look' Visitors       0.00      0.00      0.00     41972
                   Window Shoppers       0.61      0.99      0.76     66331
                 Determined Buyers       1.00      0.97      0.99     57043
Weekday OOS Bouncers (No Category)       0.35      1.00      0.52      5762
              High-Intent Shoppers       1.00      0.99      0.99     72754
     Weekend 'Quick-Look' Visitors       0.88      0.82      0.85     60194

                          accuracy                           0.83    344573
                         macro avg       0.69      0.82      0.72    344573
                      weighted avg       0.77      0.83      0.79    344573

Saved GMM Transductive confusion matrix as: gmm_transductive_confusion_matrix.png
