In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import random

# Utility function for clean evaluation
def evaluate_model(y_true, y_pred, model_name, target_names):
    """Prints classification report and plots confusion matrix."""
    print(f"--- {model_name} Classification Report ---")
    print(classification_report(y_true, y_pred, target_names=target_names))

    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(ax=ax, cmap=plt.cm.Blues, xticks_rotation='vertical')
    ax.set_title(f'{model_name} Confusion Matrix')
    plt.tight_layout()
    plt.savefig(f'{model_name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()
    print(f"Saved {model_name} confusion matrix as: {model_name.lower().replace(' ', '_')}_confusion_matrix.png")

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# --- Define the Application Class Labels ---
label_map = {
    0: "Weekday OOS Bouncers",
    1: "Weekday 'Quick-Look' Visitors",
    2: "Window Shoppers",
    3: "Determined Buyers",
    4: "Weekday OOS Bouncers (No Category)",
    5: "High-Intent Shoppers",
    6: "Weekend 'Quick-Look' Visitors"
}
target_names = list(label_map.values())
n_clusters = len(target_names) # 7

# --- 1. Load Data ---
# Note: Assuming this file contains the unscaled data and the 'cluster' column
try:
    df_features = pd.read_csv('../session_feature_matrix_with_k7_clusters.csv', index_col='session_id')
    df_features = df_features.rename(columns={'cluster': 'label'})
    print(f"Loaded feature matrix with shape: {df_features.shape}")
except FileNotFoundError:
    print("Error: 'session_feature_matrix_with_k7_clusters.csv' not found.")
    df_features = None

if df_features is not None:
    features_for_clustering = [
        'session_duration_sec', 'total_events', 'unique_items_viewed',
        'unique_categories_viewed', 'avg_item_availability',
        'session_hour_of_day', 'view_count', 'addtocart_count',
        'event_rate_per_sec', 'view_to_cart_ratio',
        'category_spread_ratio', 'is_weekend'
    ]

    # --- 2. Separate Features (X) and Target (y) ---
    X_raw = df_features[features_for_clustering].values
    y = df_features['label'].values
    print(f"Target 'y' classes: {sorted(list(np.unique(y)))}")

    # --- 3. Split Data FIRST (Crucial for proper scaling) ---
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=42
    )

    # --- 4. Scale Features (Fit only on X_train_raw) ---
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw) # Transform test set using TRAIN mean/std

    # --- 5. Balance the Training Data (X_train_bal, y_train_bal) ---
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_bal, y_train_bal = rus.fit_resample(X_train_scaled, y_train)


    print(f"\nOriginal train shape: {X_train_scaled.shape}")
    print(f"Balanced train shape: {X_train_bal.shape}")
    print(f"Test shape: {X_test_scaled.shape}")
    print(f"Classes in balanced train set: {sorted(list(set(y_train_bal)))}")

    # Rename scaled variables for model input
    X_train_bal = X_train_bal
    X_test = X_test_scaled

    print("\nData preparation complete. Ready for K-Means/GMM.")

Loaded feature matrix with shape: (1722864, 17)
Target 'y' classes: [0, 1, 2, 3, 4, 5, 6]

Original train shape: (1378291, 12)
Balanced train shape: (161350, 12)
Test shape: (344573, 12)
Classes in balanced train set: [0, 1, 2, 3, 4, 5, 6]

Data preparation complete. Ready for K-Means/GMM.


In [18]:
# --- K-Means: 1. Fit Clustering on Balanced Training Data ---
kmeans_eval = MiniBatchKMeans(
    n_clusters=n_clusters,
    random_state=42,
    n_init='auto',
    batch_size=2048
)

# Fit on the balanced training data, as per your constraint
kmeans_eval.fit(X_train_bal)
train_cluster_labels = kmeans_eval.labels_

# --- K-Means: 2. Transductive Mapping (Majority Vote) ---
cluster_to_class_map_km = {}
# Ensure we use the numerical labels from the training data for mapping
possible_classes = sorted(list(set(y_train_bal)))

print("\n--- K-Means Transductive Mapping ---")
for k in range(n_clusters):
    # Find indices of all training points in this cluster
    train_indices_in_cluster = np.where(train_cluster_labels == k)[0]

    if len(train_indices_in_cluster) > 0:
        cluster_true_labels = y_train_bal[train_indices_in_cluster]

        # Determine the majority class (numerical label)
        # mode returns an object, we take the first element of the array of modes
       # Determine the majority class (numerical label) using robust bincount method
        counts = np.bincount(cluster_true_labels)
        # argmax returns the index of the highest count, which is the majority class label
        majority_class_num = np.argmax(counts)
        cluster_to_class_map_km[k] = majority_class_num
        print(f"Cluster {k} -> Class {majority_class_num} ({label_map[majority_class_num]})")
    else:
        # Handle "no labels" case by choosing randomly
        random_class_num = random.choice(possible_classes)
        cluster_to_class_map_km[k] = random_class_num
        print(f"Cluster {k} -> NO LABELS. Chosen randomly: {random_class_num} ({label_map[random_class_num]})")

# --- K-Means: 3. Predict and Evaluate on Test Data ---
# Predict the cluster ID for the test data
test_cluster_labels = kmeans_eval.predict(X_test)

# Map the cluster ID to the final application class label (numerical)
y_pred_km = np.array([cluster_to_class_map_km[cluster_id] for cluster_id in test_cluster_labels])

# Evaluate using the defined target_names
evaluate_model(y_test, y_pred_km, "K-Means Transductive", target_names)


--- K-Means Transductive Mapping ---
Cluster 0 -> Class 5 (High-Intent Shoppers)
Cluster 1 -> Class 2 (Window Shoppers)
Cluster 2 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 3 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 4 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 5 -> Class 6 (Weekend 'Quick-Look' Visitors)
Cluster 6 -> Class 0 (Weekday OOS Bouncers)
--- K-Means Transductive Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                    precision    recall  f1-score   support

              Weekday OOS Bouncers       1.00      1.00      1.00     40517
     Weekday 'Quick-Look' Visitors       0.00      0.00      0.00     41972
                   Window Shoppers       0.89      1.00      0.94     66331
                 Determined Buyers       0.00      0.00      0.00     57043
Weekday OOS Bouncers (No Category)       0.75      1.00      0.86      5762
              High-Intent Shoppers       0.45      1.00      0.62     72754
     Weekend 'Quick-Look' Visitors       0.96      0.97      0.96     60194

                          accuracy                           0.71    344573
                         macro avg       0.58      0.71      0.63    344573
                      weighted avg       0.56      0.71      0.61    344573

Saved K-Means Transductive confusion matrix as: k-means_transductive_confusion_matrix.png


In [24]:
try:
    # This file has the scaled features AND the original cluster label
    df_features = pd.read_csv('df_user_features_with_clusters.csv', index_col='session_id')
    df_features = df_features.rename(columns={'cluster_gmm': 'label'})
    print(f"Loaded feature matrix with shape: {df_features.shape}")
except FileNotFoundError:
    print("Error: 'df_user_features_with_clusters.csv' not found.")
    df_features = None # Handle error appropriately

# --- 2. Define Features (X) and MULTI-CLASS Target (y) ---
if df_features is not None:
    features_for_clustering = [
        'session_duration_sec', 'total_events', 'unique_items_viewed',
        'unique_categories_viewed', 'avg_item_availability',
        'session_hour_of_day', 'view_count', 'addtocart_count',
        'event_rate_per_sec', 'view_to_cart_ratio',
        'category_spread_ratio', 'is_weekend'
    ]

    # We are using the pre-scaled data from the file
    X = df_features[features_for_clustering].values

    # *** THIS IS THE KEY CHANGE ***
    # The target 'y' is now the multi-class 'label' (0-6)
    y = df_features['label'].values
    print(f"Target 'y' classes: {sorted(list(np.unique(y)))}")

    # --- 3. Split Data FIRST (Crucial for proper scaling) ---
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_raw, y, test_size=0.2, stratify=y, random_state=42
    )

    # --- 4. Scale Features (Fit only on X_train_raw) ---
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw) # Transform test set using TRAIN mean/std

    # --- 5. Balance the Training Data (X_train_bal, y_train_bal) ---
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_bal, y_train_bal = rus.fit_resample(X_train_scaled, y_train)

    # Rename scaled variables for model input
    X_train_bal = X_train_bal
    X_test = X_test_scaled
# --- GMM: 1. Fit Clustering on Balanced Training Data ---
gmm_eval = GaussianMixture(
    n_components=n_clusters,
    random_state=42,
    covariance_type='spherical',
    n_init=10
)

# Fit on the balanced training data, as per your constraint
gmm_eval.fit(X_train_bal)
train_cluster_labels = gmm_eval.predict(X_train_bal)

# --- GMM: 2. Transductive Mapping (Majority Vote) ---
cluster_to_class_map_gmm = {}
possible_classes = sorted(list(set(y_train_bal)))

print("\n--- GMM Transductive Mapping ---")
for k in range(n_clusters):
    # Find indices of all training points in this cluster
    train_indices_in_cluster = np.where(train_cluster_labels == k)[0]

    if len(train_indices_in_cluster) > 0:
        cluster_true_labels = y_train_bal[train_indices_in_cluster]

        # Determine the majority class (numerical label)
        # Determine the majority class (numerical label) using robust bincount method
        counts = np.bincount(cluster_true_labels)
        # argmax returns the index of the highest count, which is the majority class label
        majority_class_num = np.argmax(counts)
        cluster_to_class_map_gmm[k] = majority_class_num
        print(f"Cluster {k} -> Class {majority_class_num} ({label_map[majority_class_num]})")
    else:
        # Handle "no labels" case by choosing randomly
        random_class_num = random.choice(possible_classes)
        cluster_to_class_map_gmm[k] = random_class_num
        print(f"Cluster {k} -> NO LABELS. Chosen randomly: {random_class_num} ({label_map[random_class_num]})")

# --- GMM: 3. Predict and Evaluate on Test Data ---
# Predict the cluster ID for the test data
test_cluster_labels = gmm_eval.predict(X_test)

# Map the cluster ID to the final application class label (numerical)
y_pred_gmm = np.array([cluster_to_class_map_gmm[cluster_id] for cluster_id in test_cluster_labels])

# Evaluate using the defined target_names
evaluate_model(y_test, y_pred_gmm, "GMM Transductive", target_names)

Error: 'df_user_features_with_clusters.csv' not found.

--- GMM Transductive Mapping ---
Cluster 0 -> Class 3 (Determined Buyers)
Cluster 1 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 2 -> Class 4 (Weekday OOS Bouncers (No Category))
Cluster 3 -> Class 6 (Weekend 'Quick-Look' Visitors)
Cluster 4 -> Class 2 (Window Shoppers)
Cluster 5 -> Class 0 (Weekday OOS Bouncers)
Cluster 6 -> Class 5 (High-Intent Shoppers)
--- GMM Transductive Classification Report ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                    precision    recall  f1-score   support

              Weekday OOS Bouncers       1.00      0.94      0.97     40517
     Weekday 'Quick-Look' Visitors       0.00      0.00      0.00     41972
                   Window Shoppers       0.61      0.99      0.76     66331
                 Determined Buyers       1.00      0.97      0.99     57043
Weekday OOS Bouncers (No Category)       0.35      1.00      0.52      5762
              High-Intent Shoppers       1.00      0.99      0.99     72754
     Weekend 'Quick-Look' Visitors       0.88      0.82      0.85     60194

                          accuracy                           0.83    344573
                         macro avg       0.69      0.82      0.72    344573
                      weighted avg       0.77      0.83      0.79    344573

Saved GMM Transductive confusion matrix as: gmm_transductive_confusion_matrix.png


In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- SIMULATING THE CLASSIFICATION REPORTS FROM THE NOTEBOOK OUTPUT ---
# This data is extracted directly from the output of cells [18] and [24].
km_report = {
    'Weekday OOS Bouncers': {'precision': 1.00, 'recall': 1.00, 'f1-score': 1.00, 'support': 40517},
    "Weekday 'Quick-Look' Visitors": {'precision': 0.00, 'recall': 0.00, 'f1-score': 0.00, 'support': 41972},
    'Window Shoppers': {'precision': 0.89, 'recall': 1.00, 'f1-score': 0.94, 'support': 66331},
    'Determined Buyers': {'precision': 0.00, 'recall': 0.00, 'f1-score': 0.00, 'support': 57043},
    'Weekday OOS Bouncers (No Category)': {'precision': 0.75, 'recall': 1.00, 'f1-score': 0.86, 'support': 5762},
    'High-Intent Shoppers': {'precision': 0.45, 'recall': 1.00, 'f1-score': 0.62, 'support': 72754},
    "Weekend 'Quick-Look' Visitors": {'precision': 0.96, 'recall': 0.97, 'f1-score': 0.96, 'support': 60194},
    'accuracy': 0.71,
    'macro avg': {'precision': 0.58, 'recall': 0.71, 'f1-score': 0.63, 'support': 344573},
    'weighted avg': {'precision': 0.56, 'recall': 0.71, 'f1-score': 0.61, 'support': 344573}
}

gmm_report = {
    'Weekday OOS Bouncers': {'precision': 1.00, 'recall': 0.94, 'f1-score': 0.97, 'support': 40517},
    "Weekday 'Quick-Look' Visitors": {'precision': 0.00, 'recall': 0.00, 'f1-score': 0.00, 'support': 41972},
    'Window Shoppers': {'precision': 0.61, 'recall': 0.99, 'f1-score': 0.76, 'support': 66331},
    'Determined Buyers': {'precision': 1.00, 'recall': 0.97, 'f1-score': 0.99, 'support': 57043},
    'Weekday OOS Bouncers (No Category)': {'precision': 0.35, 'recall': 1.00, 'f1-score': 0.52, 'support': 5762},
    'High-Intent Shoppers': {'precision': 1.00, 'recall': 0.99, 'f1-score': 0.99, 'support': 72754},
    "Weekend 'Quick-Look' Visitors": {'precision': 0.88, 'recall': 0.82, 'f1-score': 0.85, 'support': 60194},
    'accuracy': 0.83,
    'macro avg': {'precision': 0.69, 'recall': 0.82, 'f1-score': 0.72, 'support': 344573},
    'weighted avg': {'precision': 0.77, 'recall': 0.83, 'f1-score': 0.79, 'support': 344573}
}

# Get class names
target_names = list(km_report.keys())[:-3]

# 1. Convert reports to DataFrames
km_df = pd.DataFrame(km_report).transpose()
gmm_df = pd.DataFrame(gmm_report).transpose()

# 2. Filter for only the classification classes
km_class_df = km_df.loc[target_names].copy()
gmm_class_df = gmm_df.loc[target_names].copy()

# 3. Filter for non-zero recall classes in AT LEAST ONE model
km_recall = km_class_df['recall']
gmm_recall = gmm_class_df['recall']
non_zero_recall_mask = (km_recall > 0.00) | (gmm_recall > 0.00)

km_filtered = km_class_df[non_zero_recall_mask][['precision', 'recall', 'f1-score', 'support']]
gmm_filtered = gmm_class_df[non_zero_recall_mask][['precision', 'recall', 'f1-score', 'support']]

# 4. Create the side-by-side comparison DataFrame
comparison_df = pd.DataFrame({
    'K-Means F1-score': km_filtered['f1-score'],
    'GMM F1-score': gmm_filtered['f1-score'],
    'K-Means Recall': km_filtered['recall'],
    'GMM Recall': gmm_filtered['recall'],
    'Support': km_filtered['support'].astype(int)
})

# Add overall accuracy for a full picture
overall_accuracy = pd.DataFrame({
    'Metric': ['Overall Accuracy'],
    'K-Means': [km_report['accuracy']],
    'GMM': [gmm_report['accuracy']]
}).set_index('Metric')

print("--- Side-by-Side Comparison of Classification Performance (Non-Zero Recall Classes) ---")
print("\nF1-score/Recall Comparison:")
print(comparison_df.to_markdown(floatfmt=".2f"))

print("\nOverall Accuracy Comparison:")
print(overall_accuracy.to_markdown(floatfmt=".2f"))

# --- Draw a Bar Chart for F1-score comparison ---
plot_df = comparison_df[['K-Means F1-score', 'GMM F1-score']].reset_index()
plot_df = plot_df.rename(columns={'index': 'Class'})
plot_melted = plot_df.melt('Class', var_name='Model', value_name='F1-score')

plt.figure(figsize=(12, 7))
sns.barplot(x='Class', y='F1-score', hue='Model', data=plot_melted, palette=['skyblue', 'salmon'])
plt.title('F1-Score Comparison for Non-Zero Recall Classes: K-Means vs. GMM')
plt.ylabel('F1-score')
plt.xlabel('Customer Segment')
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 1.05)
plt.legend(title='Model')
plt.tight_layout()
plt.savefig('f1_score_comparison_km_gmm.png')
plt.close()
print("\nSaved F1-score comparison plot as: f1_score_comparison_km_gmm.png")

--- Side-by-Side Comparison of Classification Performance (Non-Zero Recall Classes) ---

F1-score/Recall Comparison:
|                                    |   K-Means F1-score |   GMM F1-score |   K-Means Recall |   GMM Recall |   Support |
|:-----------------------------------|-------------------:|---------------:|-----------------:|-------------:|----------:|
| Weekday OOS Bouncers               |               1.00 |           0.97 |             1.00 |         0.94 |  40517.00 |
| Window Shoppers                    |               0.94 |           0.76 |             1.00 |         0.99 |  66331.00 |
| Determined Buyers                  |               0.00 |           0.99 |             0.00 |         0.97 |  57043.00 |
| Weekday OOS Bouncers (No Category) |               0.86 |           0.52 |             1.00 |         1.00 |   5762.00 |
| High-Intent Shoppers               |               0.62 |           0.99 |             1.00 |         0.99 |  72754.00 |
| Weekend 'Quick-Look'

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- SIMULATING THE CLASSIFICATION REPORTS FROM THE NOTEBOOK OUTPUT ---
# This data is necessary to reconstruct the comparison DataFrame for plotting.
km_report = {
    'Weekday OOS Bouncers': {'precision': 1.00, 'recall': 1.00, 'f1-score': 1.00, 'support': 40517},
    "Weekday 'Quick-Look' Visitors": {'precision': 0.00, 'recall': 0.00, 'f1-score': 0.00, 'support': 41972},
    'Window Shoppers': {'precision': 0.89, 'recall': 1.00, 'f1-score': 0.94, 'support': 66331},
    'Determined Buyers': {'precision': 0.00, 'recall': 0.00, 'f1-score': 0.00, 'support': 57043},
    'Weekday OOS Bouncers (No Category)': {'precision': 0.75, 'recall': 1.00, 'f1-score': 0.86, 'support': 5762},
    'High-Intent Shoppers': {'precision': 0.45, 'recall': 1.00, 'f1-score': 0.62, 'support': 72754},
    "Weekend 'Quick-Look' Visitors": {'precision': 0.96, 'recall': 0.97, 'f1-score': 0.96, 'support': 60194},
}

gmm_report = {
    'Weekday OOS Bouncers': {'precision': 1.00, 'recall': 0.94, 'f1-score': 0.97, 'support': 40517},
    "Weekday 'Quick-Look' Visitors": {'precision': 0.00, 'recall': 0.00, 'f1-score': 0.00, 'support': 41972},
    'Window Shoppers': {'precision': 0.61, 'recall': 0.99, 'f1-score': 0.76, 'support': 66331},
    'Determined Buyers': {'precision': 1.00, 'recall': 0.97, 'f1-score': 0.99, 'support': 57043},
    'Weekday OOS Bouncers (No Category)': {'precision': 0.35, 'recall': 1.00, 'f1-score': 0.52, 'support': 5762},
    'High-Intent Shoppers': {'precision': 1.00, 'recall': 0.99, 'f1-score': 0.99, 'support': 72754},
    "Weekend 'Quick-Look' Visitors": {'precision': 0.88, 'recall': 0.82, 'f1-score': 0.85, 'support': 60194},
}

target_names = list(km_report.keys())

# 1. Convert reports to DataFrames and combine F1-scores
km_class_df = pd.DataFrame(km_report).transpose().loc[target_names].copy()
gmm_class_df = pd.DataFrame(gmm_report).transpose().loc[target_names].copy()

# 2. Filter for non-zero recall classes in AT LEAST ONE model
non_zero_recall_mask = (km_class_df['recall'] > 0.00) | (gmm_class_df['recall'] > 0.00)

comparison_df = pd.DataFrame({
    'K-Means F1-score': km_class_df[non_zero_recall_mask]['f1-score'],
    'GMM F1-score': gmm_class_df[non_zero_recall_mask]['f1-score'],
})

# 3. Prepare data for plotting F1-scores
plot_df = comparison_df[['K-Means F1-score', 'GMM F1-score']].reset_index()
plot_df = plot_df.rename(columns={'index': 'Class'})

# Melt the DataFrame to long format, suitable for grouped bar chart plotting
plot_melted = plot_df.melt('Class', var_name='Model', value_name='F1-score')

# --- Plotting Code ---
plt.figure(figsize=(12, 7))
sns.barplot(x='Class', y='F1-score', hue='Model', data=plot_melted, palette=['skyblue', 'salmon'])
plt.title('F1-Score Comparison for Non-Zero Recall Classes: K-Means vs. GMM')
plt.ylabel('F1-score')
plt.xlabel('Customer Segment')
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 1.05)
plt.legend(title='Model')
plt.tight_layout()
plt.savefig('f1_score_comparison_km_gmm.png')
plt.close()

In [27]:
!pip install mlxtend



### Associative Rule Classification Unit

Association Rule Classification unit provides finer classification to the model. Association rules find regularities between flow parameters with different measures of interestingness for applications from transductive classifier output. The Apriori Algorithm is used for association rule learning. The derived rules are traced back to the main dataset and identified flows. Moreover, the rule association also helps predict IPs and ports used for servicing an application in the future. The rules heuristics applied to flow data causes accurate classification and thus making classification method finer due to association rule mining techniques.

In [4]:
# New Code Cell (e.g., cell 28)
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

print("--- Step 1: Data Discretization and Conversion for Apriori ---")

# Combine scaled features (X_train_bal) with the class label (y_train_bal)
# Reconstruct a DataFrame for easier handling
feature_names = [
    'session_duration_sec', 'total_events', 'unique_items_viewed',
    'unique_categories_viewed', 'avg_item_availability',
    'session_hour_of_day', 'view_count', 'addtocart_count',
    'event_rate_per_sec', 'view_to_cart_ratio',
    'category_spread_ratio', 'is_weekend'
]
df_train_bal = pd.DataFrame(X_train_bal, columns=feature_names)
df_train_bal['target_class'] = y_train_bal

# --- Discretization (Binning) ---
# This is a crucial step for continuous data in ARM.
# We'll use a simple quantile-based binning (qcut) for most features.

discretized_features = {}
for col in features_for_clustering:
    # Skip 'is_weekend' as it's already binary (0 or 1)
    if col == 'is_weekend':
        df_train_bal[f'{col}_Binned'] = df_train_bal[col].astype(bool)
        continue

    # Use 3 bins for continuous features: Low, Medium, High
    try:
        # Create bins based on quantiles (equal frequency)
        df_train_bal[f'{col}_Binned'] = pd.qcut(
            df_train_bal[col], q=3, labels=['Low', 'Medium', 'High'], duplicates='drop'
        ).astype(str)
    except ValueError as e:
        # Handle cases where a feature might have too few unique values for 3 quantiles
        print(f"Warning: Could not qcut {col}. Using simple cut instead. Error: {e}")
        df_train_bal[f'{col}_Binned'] = pd.cut(
            df_train_bal[col], bins=3, labels=['Low', 'Medium', 'High'], include_lowest=True
        ).astype(str)

    discretized_features[col] = f'{col}_Binned'

# Convert the numerical 'target_class' back to its descriptive name for clear rules
class_names = [label_map[i] for i in df_train_bal['target_class']]
df_train_bal['Target_Class_Name'] = class_names

# Select only the binned columns and the target
binned_cols = list(discretized_features.values()) + ['Target_Class_Name']
df_apriori = df_train_bal[binned_cols]

# --- One-Hot Encoding (for Apriori input) ---
# Create an itemset for each row. Prefix all feature columns for clarity.
df_one_hot = pd.get_dummies(df_apriori, columns=df_apriori.columns, prefix=df_apriori.columns, dtype=bool)

# Drop redundant/unwanted columns (like the original binned columns' name)
df_one_hot.columns = [c.replace('_Binned_', '_') for c in df_one_hot.columns]
df_one_hot.columns = [c.replace('_Target_Class_Name_', 'CLASS_') for c in df_one_hot.columns]

print(f"One-Hot Encoded DataFrame shape for Apriori: {df_one_hot.shape}")

--- Step 1: Data Discretization and Conversion for Apriori ---
One-Hot Encoded DataFrame shape for Apriori: (161350, 40)


In [None]:
# New Code Cell (e.g., cell 29)

# --- Apriori Algorithm ---
# Find frequent itemsets (min_support is a hyperparameter to tune)
min_support = 0.05
frequent_itemsets = apriori(df_one_hot, min_support=min_support, use_colnames=True)
print(f"\nFound {len(frequent_itemsets)} frequent itemsets with min_support={min_support}")

# --- Association Rule Generation ---
# Generate rules (min_confidence is a hyperparameter to tune)
# Lift is a key measure of interestingness (Lift > 1 suggests a positive association)
min_confidence = 0.8
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# Filter rules to only include those that predict a CLASS (i.e., classification rules)
classification_rules = rules[rules['consequents'].apply(lambda x: any('CLASS_' in item for item in x))]

# Filter for the most interesting rules (high lift, high confidence)
classification_rules_sorted = classification_rules.sort_values(
    by=['lift', 'confidence'], ascending=False
).reset_index(drop=True)

# Select and display the top 10 most interesting rules
print(f"\nGenerated {len(classification_rules)} classification rules with min_confidence={min_confidence}")
print("\n--- Top 10 Associative Classification Rules (Sorted by Lift) ---")

# Formatting output for better readability
def format_rule(row):
    antecedent = ', '.join(list(row['antecedents']))
    consequent = ', '.join(list(row['consequents']))
    return (
        f"Rule: IF ({antecedent}) THEN ({consequent}) | "
        f"Conf: {row['confidence']:.2f}, Lift: {row['lift']:.2f}, "
        f"Support: {row['support']:.3f}"
    )

for i, row in classification_rules_sorted.head(10).iterrows():
    print(format_rule(row))

In [None]:
# New Code Cell (e.g., cell 30)
# --- Conceptual Associative Rule Classifier (Max Confidence Heuristic) ---

# This code is highly simplified and requires careful mapping of test data
# to the one-hot format. This cell is for demonstration only.

print("\n--- Associative Rule Classification Unit Heuristic (Max Confidence) ---")

# Step 1: Prepare a simplified set of rules for the classifier.
# We'll use only the antecedents, the predicted class, and confidence.
rule_list = []
for index, row in classification_rules_sorted.iterrows():
    # Only keep single-class consequents for simplicity in this example
    if len(row['consequents']) == 1:
        rule_list.append({
            'antecedents': set(row['antecedents']),
            'consequent': list(row['consequents'])[0].replace('CLASS_', ''),
            'confidence': row['confidence']
        })

# Step 2: Implement the classifier function (Requires *full* test data prep first)
def predict_with_rules(test_instance_itemset, rules):
    """
    Simulates prediction using the max confidence rule heuristic.
    """
    matching_rules = []
    
    # 1. Match: Find all rules whose antecedents are a subset of the instance's itemset
    for rule in rules:
        if rule['antecedents'].issubset(test_instance_itemset):
            matching_rules.append(rule)
            
    if not matching_rules:
        return None # No rule matched (fallback to a default class in a real system)
        
    # 2. Conflict Resolution: Max Confidence
    best_rule = max(matching_rules, key=lambda x: x['confidence'])
    
    return best_rule['consequent']

print("Associative Rule Classifier logic defined. The actual prediction on X_test would require:")
print("1. Re-running the discretization on the **test set** X_test_raw with the **training set bins**.")
print("2. Converting the test set to the same one-hot itemset format.")
print("3. Iterating through the test set and applying the `predict_with_rules` function.")
print("\nThis creates the 'finer classification' by applying specific IF-THEN conditions derived from the data.")