In [None]:
# Dependencies for Google Colab and Libraries required for processes
!pip install optuna
!pip install shap --quiet
!pip install langchain together sentence-transformers faiss-cpu
!pip install langchain langchain-community together

import os
import re
import io
import glob
import time
import random
import shap
import optuna
import faiss
import logging
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from google.colab import files
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from collections import defaultdict
from matplotlib_venn import venn2
from matplotlib_venn import venn3
from functools import reduce
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from langchain_community.llms import Together
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document

In [None]:
# === GLOBAL RANDOM SEEDS FOR LATER USE ===
# One base seed is applied to both NumPy's and Python's global RNGs.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# === Generate 10 randomized but reproducible seeds ===
# Drawn from Python's `random` module immediately after seeding it, so the
# same 10 distinct integers from 1..99999 come out whenever this cell is run
# as written.
random_seeds = random.sample(range(1, 100000), 10)

In [None]:
# === UPLOAD & FILTER ===
# Opens the Colab upload dialog; whatever `files.upload()` returns is kept in
# `uploaded`.

print(" Upload your 2019–2020 filtered CSVs:")
uploaded = files.upload()  # Upload files first

# === DEFINE FILE NAMES YOU JUST UPLOADED ===
# Make sure these match your actual file names (case-sensitive)
# Keys are the dataset names the loading cell dispatches on; values are the
# paths that cell hands to pd.read_csv.
file_paths = {
    'demographics': 'SecondSeshData1Demographics_filtered_2019_2020.csv',
    'cash_games': 'SecondSeshData2CashGames_filtered_2019_2020.csv',
    'tournaments': 'SecondSeshData3Tournaments_filtered_2019_2020.csv',
    'deposits': 'SecondSeshData4Deposits_filtered_2019_2020.csv',
    'withdrawals': 'SecondSeshData5Withdrawals_filtered_2019_2020.csv'
}

In [None]:
# === Aggregation Function ===
def aggregate_behavior(df, prefix, sum_cols=None, mean_cols=None, count_col=None, date_col=None):
    """Collapse event-level rows into one row of behavioural features per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level data containing a ``UserID`` column. The frame is not
        modified; derived columns are added to an internal copy.
    prefix : str
        Prepended to every generated feature name.
    sum_cols, mean_cols : list of str, optional
        Columns to total / average per user. Names that are not in ``df`` are
        skipped. A column listed in both receives both statistics.
    count_col : str, optional
        Column that is summed and counted per user. When a ``YearMonth``
        column is available, the user's largest monthly total of this column
        is added as ``{prefix}_{count_col}_max_month``. Ignored if the column
        is not in ``df``.
    date_col : str, optional
        Column parsed with ``pd.to_datetime(errors='coerce')`` and converted
        to a monthly period stored in ``YearMonth``.

    Returns
    -------
    pandas.DataFrame
        ``UserID`` plus one ``{prefix}_{column}_{stat}`` column per requested
        statistic. When nothing can be aggregated, an empty frame with only a
        ``UserID`` column.
    """
    sum_cols = [] if sum_cols is None else list(sum_cols)
    mean_cols = [] if mean_cols is None else list(mean_cols)

    print(f"Running aggregation for: {prefix}")
    print(" Original columns in dataset:", df.columns.tolist())

    # Work on a copy so the caller's frame does not silently gain a parsed
    # date column and an extra 'YearMonth' column.
    df = df.copy()
    if date_col:
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        df['YearMonth'] = df[date_col].dt.to_period('M')

    has_count_col = bool(count_col) and count_col in df.columns

    if has_count_col and 'YearMonth' in df.columns:
        # Per-user peak month: total the column per (user, month), then take
        # each user's maximum monthly total.
        monthly_totals = df.groupby(['UserID', 'YearMonth'])[count_col].sum().reset_index()
        max_monthly = (
            monthly_totals.groupby('UserID')[count_col].max().reset_index()
            .rename(columns={count_col: f'{prefix}_{count_col}_max_month'})
        )
    else:
        max_monthly = None

    agg_dict = {}

    def _request(col, stat):
        # Accumulate statistics per column instead of assigning, so asking for
        # both 'sum' and 'mean' of one column keeps both, and a column that is
        # both summed and counted does not request 'sum' twice.
        stats = agg_dict.setdefault(col, [])
        if stat not in stats:
            stats.append(stat)

    for col in sum_cols:
        if col in df.columns:
            print(f" Including SUM for '{col}'")
            _request(col, 'sum')
    for col in mean_cols:
        if col in df.columns:
            print(f" Including MEAN for '{col}'")
            _request(col, 'mean')
    if has_count_col:
        print(f" Including SUM and COUNT for '{count_col}'")
        _request(count_col, 'sum')
        _request(count_col, 'count')
    if 'YearMonth' in df.columns:
        print(" Including 'YearMonth' nunique")
        agg_dict['YearMonth'] = 'nunique'

    if not agg_dict:
        return pd.DataFrame(columns=['UserID'])

    agg = df.groupby('UserID').agg(agg_dict)
    # Flatten (column, stat) pairs into '{prefix}_{column}_{stat}' names.
    agg.columns = [
        f"{prefix}_{col[0]}_{col[1]}" if isinstance(col, tuple) else f"{prefix}_{col}"
        for col in agg.columns
    ]
    agg = agg.reset_index()

    if max_monthly is not None:
        agg = pd.merge(agg, max_monthly, on='UserID', how='left')

    print(" Aggregated columns:", agg.columns.tolist())
    return agg

In [None]:
# === Load and Aggregate data ===
# For every configured CSV: read it, keep an untouched copy of the raw frame
# under its own name, and build the per-user aggregate table.
aggregated_dfs = []
aggregated_by_key = {}  # dataset key -> aggregate table, for reporting below
demographics = None

for key, path in file_paths.items():
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # Carry on with the remaining files, but report the gap: a missing
        # source later shows up as missing feature columns.
        print(f" File not found for '{key}': {path} (skipped)")
        continue

    if key == 'demographics':
        # Demographics are kept as-is; no behavioural aggregation.
        demographics = df
        continue

    if key == 'cash_games':
        cash_games_df = df.copy()  # raw copy saved before aggregation
        agg = aggregate_behavior(df, 'engage_cash', ['StakesC', 'WinningsC'], ['StakesC', 'WinningsC'], 'Windows', 'Date')
    elif key == 'tournaments':
        tournaments_df = df.copy()
        agg = aggregate_behavior(df, 'engage_tourn', ['StakesT', 'WinningsT'], ['StakesT', 'WinningsT'], 'Trnmnts', 'Date')
    elif key == 'deposits':
        deposits_df = df.copy()
        agg = aggregate_behavior(df, 'monetary_deposit', ['Amount'], [], 'Amount', 'SummaryDate')
    elif key == 'withdrawals':
        withdrawals_df = df.copy()
        agg = aggregate_behavior(df, 'monetary_withdraw', ['Amount'], [], 'Amount', 'SummaryDate')
    else:
        continue

    if not agg.empty:
        aggregated_dfs.append(agg)
        aggregated_by_key[key] = agg

# Report the tournament aggregate specifically. Reading the loop variable
# `agg` here would show whichever dataset happened to be processed last and
# would fail outright when no file was loaded.
if 'tournaments' in aggregated_by_key:
    print("Aggregated tournament columns:", aggregated_by_key['tournaments'].columns.tolist())
else:
    print("Aggregated tournament columns: (tournament data not loaded)")

In [None]:
# === Merge and Label data ===
if not aggregated_dfs:
    summary_df = pd.DataFrame(columns=['UserID'])
else:
    # Outer-join every per-user aggregate on UserID; a user absent from one
    # source gets 0 for that source's features.
    summary_df = reduce(lambda left, right: pd.merge(left, right, on='UserID', how='outer'), aggregated_dfs)
    summary_df.fillna(0, inplace=True)

    # === PERCENTILE-BASED THRESHOLDS (90th percentile for inclusivity) ===
    label_cols = [
        'engage_cash_Windows_sum',
        'engage_cash_Windows_max_month',
        'engage_tourn_Trnmnts_sum',
        'engage_tourn_Trnmnts_max_month',
        'engage_tourn_StakesT_mean',
        'monetary_deposit_Amount_sum',
        'monetary_deposit_Amount_count',
    ]
    p90 = {col: summary_df[col].quantile(0.90) for col in label_cols}

    cash_threshold = p90['engage_cash_Windows_sum']
    cash_burst_threshold = p90['engage_cash_Windows_max_month']
    tourn_count_threshold = p90['engage_tourn_Trnmnts_sum']
    tourn_burst_threshold = p90['engage_tourn_Trnmnts_max_month']
    tourn_stake_threshold = p90['engage_tourn_StakesT_mean']
    deposit_sum_threshold = p90['monetary_deposit_Amount_sum']
    deposit_count_threshold = p90['monetary_deposit_Amount_count']

    # One boolean mask per feature: strictly above its own 90th percentile.
    above_p90 = {col: summary_df[col].gt(p90[col]) for col in label_cols}

    # === COMPONENT FLAGS ===
    summary_df['addicted_cash'] = (
        above_p90['engage_cash_Windows_sum'] | above_p90['engage_cash_Windows_max_month']
    )

    summary_df['addicted_tourn'] = (
        above_p90['engage_tourn_Trnmnts_sum']
        | above_p90['engage_tourn_Trnmnts_max_month']
        | above_p90['engage_tourn_StakesT_mean']
    )

    # === FINAL LABEL: IS ADDICTED ===
    # Positive when any cash, tournament or deposit criterion fires.
    summary_df['is_addicted'] = (
        summary_df['addicted_cash']
        | summary_df['addicted_tourn']
        | above_p90['monetary_deposit_Amount_sum']
        | above_p90['monetary_deposit_Amount_count']
    ).astype(int)

In [None]:
# IN-BETWEEN STEP
# The 7 manually selected heuristic features that the labelling rule thresholds.
feature_cols = [
    'engage_cash_Windows_sum',
    'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts_sum',
    'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean',
    'monetary_deposit_Amount_sum',
    'monetary_deposit_Amount_count'
]

# Mean of each feature within each label group (features as rows, one column
# per label value). Columns are renamed by label value rather than by
# position, so the names stay correct even if only one label is present.
grouped_means = (
    summary_df.groupby('is_addicted')[feature_cols]
    .mean()
    .T
    .rename(columns={0: 'Not Addicted', 1: 'Addicted'})
    .rename_axis(columns=None)
)

# Show the table (optional)
display(grouped_means)

In [None]:
# === Train XGBoost Model ===
# === Baseline run: the train/test split is pinned to random_state=42; the model is given no explicit seed ===
if summary_df.empty or 'is_addicted' not in summary_df.columns:
    print(" Summary dataframe is empty or missing target label.")
else:
    y = summary_df['is_addicted']
    X = summary_df.drop(columns=['UserID', 'is_addicted'])

    if len(X) == 0:
        print(" Not enough usable training data.")
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Deliberately small model: few shallow trees, half the rows and half
        # the columns per tree.
        model = XGBClassifier(
            n_estimators=25,
            max_depth=3,
            subsample=0.5,
            colsample_bytree=0.5,
            eval_metric='logloss',
            use_label_encoder=False,
            verbosity=0
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        print("\n\U0001F4CA Classification Report:\n", classification_report(y_test, y_pred))
        print("\U0001F4C8 AUC Score:", round(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]), 4))

        # Feature importance, smallest to largest
        plt.figure(figsize=(8, 5))
        pd.Series(model.feature_importances_, index=X.columns).sort_values().plot.barh()
        plt.title("Feature Importance (XGBoost)")
        plt.tight_layout()
        plt.show()

In [None]:
# === ML: Filter Features and Remove Label Leakage ===
# === Single-seed run: the split and the model both use the global SEED ===
leakage_cols = [
    'engage_cash_Windows', 'engage_cash_Windows_sum', 'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts', 'engage_tourn_Trnmnts_sum', 'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean', 'monetary_deposit_Amount',
    'monetary_deposit_Amount_sum', 'monetary_deposit_Amount_count'
]

# Everything that must not be fed to the model: the columns the label was
# derived from, the identifier, and the label/flag columns themselves.
non_feature_cols = set(leakage_cols) | {
    'UserID', 'is_addicted', 'addicted_cash', 'addicted_tourn', 'addiction_type'
}
feature_cols = [col for col in summary_df.columns if col not in non_feature_cols]

y = summary_df['is_addicted']
X = summary_df[feature_cols]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

model = XGBClassifier(random_state=SEED, eval_metric='logloss', use_label_encoder=False)
model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("AUC Score:", roc_auc_score(y_test, y_prob))

# Feature importance, smallest to largest
plt.figure(figsize=(8, 5))
pd.Series(model.feature_importances_, index=X.columns).sort_values().plot.barh()
plt.title("Feature Importance (XGBoost)")
plt.tight_layout()
plt.show()

In [None]:
# === OPTUNA TUNING ===
# === USING 1 SEED HERE ===
def objective(trial):
    """Return the mean 5-fold cross-validated ROC-AUC for one Optuna trial.

    The trial proposes the XGBoost hyper-parameters; the model is scored on
    the notebook-level ``X`` / ``y``.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }
    model = XGBClassifier(**params, random_state=SEED)
    return cross_val_score(model, X, y, scoring='roc_auc', cv=5).mean()

# Seed the sampler as well: seeding only the model leaves the sequence of
# proposed hyper-parameters (and therefore the best trial) different on
# every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED)
)
study.optimize(objective, n_trials=30)

# NOTE: this search cross-validates on the full X, y, so tuning is not
# separated from the final evaluation. The cell further below tunes on the
# training split only.

In [None]:
# Explain the XGBoost model refit with the best Optuna hyper-parameters using SHAP.
# === USING 1 SEED HERE ===

# Refit with tuned params or your current best model
# (`study` comes from the Optuna cell above; the fit uses the training split only)
model = XGBClassifier(**study.best_trial.params, use_label_encoder=False, eval_metric='logloss', random_state=SEED)
model.fit(X_train, y_train)

# SHAP summary
# NOTE: the full feature matrix X (train and test rows together) is both the
# data handed to the explainer at construction and the data being explained.
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

shap.summary_plot(shap_values, X)

In [None]:
# === Moving on: Remove label leakage and filter features ===
leakage_cols = [
    'engage_cash_Windows', 'engage_cash_Windows_sum', 'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts', 'engage_tourn_Trnmnts_sum', 'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean', 'monetary_deposit_Amount',
    'monetary_deposit_Amount_sum', 'monetary_deposit_Amount_count'
]
exclude_cols = leakage_cols + ['UserID', 'is_addicted', 'addicted_cash', 'addicted_tourn', 'addiction_type']
feature_cols = [col for col in summary_df.columns if col not in exclude_cols]

X = summary_df[feature_cols]
y = summary_df['is_addicted']

# === Split data into train/test ===
# Stratified so both splits keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

# === Train untuned model ===
untuned_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED)
untuned_model.fit(X_train, y_train)
untuned_pred = untuned_model.predict(X_test)
untuned_prob = untuned_model.predict_proba(X_test)[:, 1]

# === Define Optuna objective with 5-fold CV ===
def objective(trial):
    """Return the mean 5-fold CV ROC-AUC on the training split for one trial.

    Only ``X_train`` / ``y_train`` are used, so the held-out test split plays
    no part in hyper-parameter selection.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }
    model = XGBClassifier(**params, random_state=SEED)
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5).mean()

# === Run Optuna tuning ===
# The sampler is seeded too; otherwise the proposed hyper-parameters, and
# hence the "tuned" model, change from run to run even with a seeded model.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED)
)
study.optimize(objective, n_trials=30)

# === Train best-tuned model ===
# Copy before adding the fixed settings so the study's own record of the
# best trial is left untouched.
best_params = dict(study.best_trial.params)
best_params.update({'eval_metric': 'logloss', 'use_label_encoder': False})
tuned_model = XGBClassifier(**best_params, random_state=SEED)
tuned_model.fit(X_train, y_train)
tuned_pred = tuned_model.predict(X_test)
tuned_prob = tuned_model.predict_proba(X_test)[:, 1]

In [None]:
# === Compare metrics for untuned vs tuned ===
def evaluate_model(name, y_true, y_pred, y_prob):
    """Print accuracy, precision, recall, F1 and ROC-AUC for one model.

    ``y_pred`` holds the hard class predictions used by the first four
    metrics; ``y_prob`` holds the scores used for the AUC. Nothing is
    returned.
    """
    print(f"\n {name} Model Performance:")
    # (padded label, metric function, which estimate it consumes); each metric
    # is evaluated only when its line is printed.
    metric_table = (
        ("Accuracy ", accuracy_score, y_pred),
        ("Precision", precision_score, y_pred),
        ("Recall   ", recall_score, y_pred),
        ("F1 Score ", f1_score, y_pred),
        ("AUC      ", roc_auc_score, y_prob),
    )
    for label, metric, estimate in metric_table:
        print(f"  {label}: {metric(y_true, estimate):.4f}")

# Both models are scored on the same held-out test split.
evaluate_model("Untuned", y_test, untuned_pred, untuned_prob)
evaluate_model("Tuned", y_test, tuned_pred, tuned_prob)

# === SHAP Explanation for Tuned Model ===
# The explainer is built with the training split and applied to the test split.
explainer = shap.Explainer(tuned_model, X_train)
shap_values = explainer(X_test)

# === SHAP summary plot (overall feature impact) ===
shap.summary_plot(shap_values, X_test)

# === SHAP force plots for a few individual predictions ===
# Display first 3 positive predictions (class = 1)
# `positive_indices` are positions within the test split, so the same `i`
# selects matching rows from tuned_pred, shap_values and X_test (via .iloc).
positive_indices = np.where(tuned_pred == 1)[0][:3]
for i in positive_indices:
    print(f"\n SHAP Force Plot for Test Sample Index {i}")
    shap.force_plot(
        explainer.expected_value,
        shap_values[i].values,
        X_test.iloc[i],
        matplotlib=True,
        show=True
    )

In [None]:
# === Step 1: Filter Features and Remove Label Leakage ===
# === Step 2: Run MULTIPLE seeds when training and testing (10) ===
# === Step 3: Apply 5-fold cross validation ===
# === Step 4: Run Optuna tuning on the same 10 random seeds ===
# === Step 5: Store the same data used in XGBoost for SHAP using the same 10 random seeds ===

# === GLOBAL SEED ===
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# === Generate reproducible random seeds ===
random_seeds = random.sample(range(1, 100000), 10)

# === Filter Features and Remove Label Leakage ===
leakage_cols = [
    'engage_cash_Windows', 'engage_cash_Windows_sum', 'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts', 'engage_tourn_Trnmnts_sum', 'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean', 'monetary_deposit_Amount',
    'monetary_deposit_Amount_sum', 'monetary_deposit_Amount_count'
]
exclude_cols = leakage_cols + ['UserID', 'is_addicted', 'addicted_cash', 'addicted_tourn', 'addiction_type']
feature_cols = [col for col in summary_df.columns if col not in exclude_cols]

X = summary_df[feature_cols]
y = summary_df['is_addicted']

# Setup containers
top_n = 10
feature_freq = Counter()            # how often each feature lands in a seed's top-N
feature_importances_per_seed = []   # one importance Series per seed
seed_scores = []                    # one metrics dict per seed
shap_model_store = {}  # Store everything SHAP needs

for seed in random_seeds:
    print(f" Processing Seed {seed}...")

    # Split data (stratified, different split per seed)
    X_tune, X_holdout, y_tune, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # Define Optuna objective
    def objective(trial):
        """Return mean 5-fold CV ROC-AUC on this seed's tuning split."""
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'eval_metric': 'logloss',
            'use_label_encoder': False
        }
        model = XGBClassifier(**params, random_state=seed)
        return cross_val_score(model, X_tune, y_tune, scoring='roc_auc', cv=5).mean()

    # Run tuning
    # The sampler gets the same seed as the split and the model; without it
    # the search path, and so every per-seed result, differs between runs.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective, n_trials=30)

    # Train final model
    # Copy before adding the fixed settings so the study's record is untouched.
    best_params = dict(study.best_trial.params)
    best_params.update({'eval_metric': 'logloss', 'use_label_encoder': False})
    model = XGBClassifier(**best_params, random_state=seed)
    model.fit(X_tune, y_tune)

    # Evaluate on the split that took no part in tuning
    y_pred = model.predict(X_holdout)
    y_prob = model.predict_proba(X_holdout)[:, 1]
    report = classification_report(y_holdout, y_pred, output_dict=True)
    auc = roc_auc_score(y_holdout, y_prob)

    seed_scores.append({
        'Seed': seed,
        'Accuracy': report['accuracy'],
        'Precision_1': report['1']['precision'],
        'Recall_1': report['1']['recall'],
        'F1_1': report['1']['f1-score'],
        'AUC': auc
    })

    # Store XGBoost importance
    importances = pd.Series(model.feature_importances_, index=X_tune.columns)
    feature_importances_per_seed.append(importances)
    top_features = importances.sort_values(ascending=False).head(top_n).index.tolist()
    feature_freq.update(top_features)

    # Store all needed data for SHAP later
    shap_model_store[seed] = {
        'model': model,
        'X_tune': X_tune,
        'y_tune': y_tune,
        'best_params': best_params
    }

# One row per seed, one column per feature. Built once after the loop rather
# than being rebuilt from scratch on every iteration.
importances_df = pd.DataFrame(feature_importances_per_seed)

In [None]:
print(" All XGBoost models trained and stored for SHAP.")

# === Results ===
# One row per seed, built from the metric dicts collected in the training loop.
results_df = pd.DataFrame(seed_scores)
metric_cols = ['Accuracy', 'Precision_1', 'Recall_1', 'F1_1', 'AUC']

print(" Evaluation Metrics per Seed:")
print(results_df)

print("\n Average Metrics Across 10 Seeds:")
print(results_df[metric_cols].mean().round(4))

print("\n Standard Deviation Across Seeds:")
print(results_df[metric_cols].std().round(4))

# Plot accuracy and AUC side by side for every seed
results_df.set_index('Seed')[['Accuracy', 'AUC']].plot.bar(
    figsize=(8, 5), title='XGBoost Performance Across Seeds'
)
plt.tight_layout()
plt.show()

In [None]:
# === SHAP Stability Analysis Using Stored XGBoost Models ===

shap_importance_list = []   # one mean-|SHAP| Series per seed
shap_rank_freq = Counter()  # how often each feature is in a seed's top-N
all_shap_values = []
all_X_tune = []

# === Turn off SHAP & XGBoost verbosity ===
# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
logging.getLogger("shap").setLevel(logging.ERROR)

for seed in random_seeds:
    stored = shap_model_store[seed]
    model = stored['model']
    X_tune = stored['X_tune']

    # SHAP explanation (quietly), on the same rows the model was trained on
    explainer = shap.Explainer(model, X_tune)
    shap_values = explainer(X_tune)

    all_shap_values.append(shap_values)
    all_X_tune.append(X_tune)

    # Global importance for this seed: mean absolute SHAP value per feature
    abs_shap = np.abs(shap_values.values)
    shap_df = pd.DataFrame(abs_shap, columns=X_tune.columns)
    mean_shap = shap_df.mean()

    shap_importance_list.append(mean_shap)

    top_features = mean_shap.sort_values(ascending=False).head(top_n).index.tolist()
    shap_rank_freq.update(top_features)

# === SHAP Stability Summary ===
# Rows = seeds, columns = features; the statistics below are across seeds.
shap_all_df = pd.concat(shap_importance_list, axis=1).T
shap_all_df.columns.name = 'Feature'

shap_stability_df = pd.DataFrame({
    'mean_SHAP': shap_all_df.mean(),
    'median_SHAP': shap_all_df.median(),
    'std_SHAP': shap_all_df.std()
}).sort_values(by='median_SHAP', ascending=False)

# Stable features (top-N ≥ 2 times)
stable_features = [feat for feat, count in shap_rank_freq.items() if count >= 2]
# Filter with a mask instead of .loc[list]: .loc would reorder the rows to the
# Counter's insertion order, so the head(10) below would no longer be the ten
# highest medians.
stable_shap = shap_stability_df.loc[shap_stability_df.index.isin(stable_features)]

# === Summary Plot from last SHAP model ===
shap.summary_plot(all_shap_values[-1], all_X_tune[-1])

# === Table of Stable SHAP Features (No freq_in_top10 column) ===
print("\n Top Stable SHAP Features Across 10 Seeds (Median + Std Dev):")
print(stable_shap.head(10).round(4))

# === Bar Plot: Median SHAP Importance ===
stable_shap.head(10)['median_SHAP'].sort_values().plot(
    kind='barh', figsize=(8, 5), title="Top Stable SHAP Features by Median Importance"
)
plt.xlabel("Median Absolute SHAP Value")
plt.tight_layout()
plt.show()

# === Force Plots for 3 Sampled Addicted Users ===
last_shap = all_shap_values[-1]
X_last = all_X_tune[-1]
X_last_with_id = X_last.copy()
X_last_with_id['UserID'] = summary_df.loc[X_last.index, 'UserID']
X_last_with_id['is_addicted'] = y.loc[X_last.index]

addicted_users = X_last_with_id[X_last_with_id['is_addicted'] == 1]
# Never ask for more rows than exist.
selected_users = addicted_users.sample(n=min(3, len(addicted_users)), random_state=SEED)

print("\n🔍 SHAP Force Plots for 3 Sampled Addicted Users:")

for idx, row in selected_users.iterrows():
    user_id = row['UserID']
    # `idx` is a summary_df index label, whereas the SHAP arrays are ordered by
    # row position within X_last. Translate the label to its position first;
    # using the label directly picks the wrong row or runs off the end.
    # (Assumes X_last's index labels are unique, so get_loc returns an int.)
    pos = X_last.index.get_loc(idx)
    print(f"\n Force Plot for UserID: {user_id}")
    shap.force_plot(
        base_value=last_shap.base_values[pos],
        shap_values=last_shap.values[pos],
        features=X_last.iloc[pos],
        matplotlib=True,
        show=True
    )

In [None]:
# === COMPARISON OF MEDIAN SHAP VALUES vs. MEDIAN XGBOOST FEATURE IMPORTANCE VALUES (with variance)
# Both spreads are measured across the 10 seed models:
#   std_SHAP       = std, over seeds, of each feature's mean |SHAP| value
#   std_importance = std, over seeds, of each feature's XGBoost importance
# VERY USEFUL TO HAVE WHEN ANSWERING THE QUESTION: "Why did you pick these features?"

# === Step 1: Combine median and std for both SHAP and XGBoost ===
# importances_df and shap_all_df each hold one row per seed, one column per feature.
importance_summary = pd.DataFrame({
    'median_importance': importances_df.median(),
    'std_importance': importances_df.std()
})

shap_summary = pd.DataFrame({
    'median_SHAP': shap_all_df.median(),
    'std_SHAP': shap_all_df.std()
})

# === Step 2: Align on features present in both summaries ===
# NOTE: no stability filter is applied here; every shared feature is ranked.
shared_features = shap_summary.index.intersection(importance_summary.index)
shap_ranked = shap_summary.loc[shared_features].copy()
xgb_ranked = importance_summary.loc[shared_features].copy()

# === Step 3: Compute rankings based on median importances ===
# Rank 1 = largest median.
shap_ranked['shap_rank'] = shap_ranked['median_SHAP'].rank(ascending=False)
xgb_ranked['xgb_rank'] = xgb_ranked['median_importance'].rank(ascending=False)

# === Step 4: Combine and calculate average rank ===
consensus_df = pd.concat([
    shap_ranked,
    xgb_ranked[['median_importance', 'std_importance', 'xgb_rank']]
], axis=1)

consensus_df['avg_rank'] = (consensus_df['shap_rank'] + consensus_df['xgb_rank']) / 2

# === Step 5: Sort by average rank and get top 10 ===
top_n = 10
consensus_top = consensus_df.sort_values(by='avg_rank').head(top_n)

# === Step 6: Display the top 10 consensus-ranked features ===
print(" Top 10 Consensus Features (SHAP + XGBoost Average Rank):")
print(consensus_top[['median_SHAP', 'std_SHAP', 'shap_rank',
                     'median_importance', 'std_importance', 'xgb_rank', 'avg_rank']].round(4))

# === Step 7: Plot bar chart with error bars ===
# Two bars per feature (SHAP left, XGBoost right) sharing one y-axis; error
# bars show the across-seed standard deviation.
x = np.arange(len(consensus_top))
width = 0.35

plt.figure(figsize=(12, 6))
plt.bar(x - width/2,
        consensus_top['median_SHAP'],
        width,
        yerr=consensus_top['std_SHAP'],
        label='Median SHAP',
        color='skyblue',
        capsize=4)

plt.bar(x + width/2,
        consensus_top['median_importance'],
        width,
        yerr=consensus_top['std_importance'],
        label='Median XGBoost',
        color='salmon',
        capsize=4)

plt.xticks(x, consensus_top.index, rotation=45, ha='right')
plt.ylabel("Median Importance")
plt.title("Top 10 Stable Features: SHAP vs XGBoost (with Std Dev & Avg Rank)")
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# === COMPARISON OF OLD LOGIC (looking at all features to determine addiction) vs. NEW LOGIC (looking at top 3 features based on highest median SHAP values)
# === OLD LOGIC ===
# Positive when ANY of the seven heuristic features is strictly above its own
# 90th percentile.
old_logic_feats = [
    'engage_cash_Windows_sum',
    'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts_sum',
    'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean',
    'monetary_deposit_Amount_sum',
    'monetary_deposit_Amount_count',
]
old_logic_flags = [summary_df[feat] > summary_df[feat].quantile(0.90) for feat in old_logic_feats]
summary_df['old_is_addicted'] = np.logical_or.reduce(old_logic_flags).astype(int)

# === NEW LOGIC ===
# Positive when ANY of the three selected behavioural totals is strictly above
# its own 90th percentile. This is the same rule the behavioural-risk-score
# cell applies. It replaces the earlier placeholder, which copied the old
# labels and flipped a random 28% of positives to 0, so every number reported
# below was produced by random sampling rather than by the labelling rule.
new_logic_feats = [
    'engage_cash_Windows_sum',
    'engage_tourn_Trnmnts_sum',
    'monetary_deposit_Amount_sum',
]
new_logic_flags = [summary_df[feat] > summary_df[feat].quantile(0.90) for feat in new_logic_feats]
summary_df['new_is_addicted'] = np.logical_or.reduce(new_logic_flags).astype(int)

# === LABEL TRANSITION ANALYSIS ===
summary_df['label_change'] = summary_df['old_is_addicted'].astype(str) + " → " + summary_df['new_is_addicted'].astype(str)
label_change_counts = summary_df['label_change'].value_counts().sort_index()
print("\n Addiction Label Changes (Old → New):")
print(label_change_counts)

# === PERCENTAGE SUMMARY ===
old_addicted_pct = summary_df['old_is_addicted'].mean() * 100
new_addicted_pct = summary_df['new_is_addicted'].mean() * 100
percent_drop = old_addicted_pct - new_addicted_pct

summary = {
    "Total Users": len(summary_df),
    "Old Addicted Count": summary_df['old_is_addicted'].sum(),
    "New Addicted Count": summary_df['new_is_addicted'].sum(),
    "Old Addicted %": round(old_addicted_pct, 2),
    "New Addicted %": round(new_addicted_pct, 2),
    "Percentage Point Drop": round(percent_drop, 2),
    "Users Reclassified as Not Addicted": (summary_df['label_change'] == "1 → 0").sum()
}

summary_df_display = pd.DataFrame([summary])
print("\n Label Change Summary:")
print(summary_df_display)

# === VISUALIZATION ===

# Bar Chart: Old → New Label Transitions
plt.figure(figsize=(8, 5))
sns.barplot(x=label_change_counts.index, y=label_change_counts.values, palette='pastel')
plt.title("Addiction Label Changes (Old Logic → New Logic)", fontsize=14)
plt.ylabel("Number of Users")
plt.xlabel("Label Transition")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Pie Charts: Addiction Distribution Under Old vs New Logic (Side-by-Side)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Pie chart for Old Logic
# reindex([0, 1]) pins the wedge order to the two labels even if one label
# value happens to be absent.
old_label_counts = summary_df['old_is_addicted'].value_counts().reindex([0, 1], fill_value=0)
axs[0].pie(
    old_label_counts,
    labels=['Not Addicted', 'Addicted'],
    autopct='%1.1f%%',
    colors=['lightgrey', 'salmon'],
    startangle=140
)
axs[0].set_title("Addiction Prevalence: Old Logic")

# Pie chart for New Logic
new_label_counts = summary_df['new_is_addicted'].value_counts().reindex([0, 1], fill_value=0)
axs[1].pie(
    new_label_counts,
    labels=['Not Addicted', 'Addicted'],
    autopct='%1.1f%%',
    colors=['lightgrey', 'coral'],
    startangle=140
)
axs[1].set_title("Addiction Prevalence: New Logic")

plt.tight_layout()
plt.show()

In [None]:
# === Define behavioral features and 90th percentile thresholds ===
behavioral_feats = ['engage_cash_Windows_sum', 'engage_tourn_Trnmnts_sum', 'monetary_deposit_Amount_sum']
behavioral_thresholds = {feat: summary_df[feat].quantile(0.9) for feat in behavioral_feats}

# === Create binary flags indicating risky behavior ===
# A flag is True when the user's value is strictly above the feature's threshold.
for feat in behavioral_feats:
    summary_df[f'{feat}_flag'] = summary_df[feat] > behavioral_thresholds[feat]

# === Calculate behavioral risk score (0 to 3) ===
# Number of flags raised for the user.
flag_cols = [f'{f}_flag' for f in behavioral_feats]
summary_df['behavioral_risk_score'] = summary_df[flag_cols].sum(axis=1)

# === Define addiction labels ===
# NOTE: this overwrites the existing 'is_addicted' column in summary_df; any
# cell run after this one sees the three-feature definition.
summary_df['is_addicted'] = (summary_df['behavioral_risk_score'] >= 1).astype(int)

# === Create subsets based on severity ===
addicted_df = summary_df[summary_df['is_addicted'] == 1]
addicted_2plus_df = addicted_df[addicted_df['behavioral_risk_score'] >= 2]
addicted_3only_df = addicted_df[addicted_df['behavioral_risk_score'] == 3]

# === Compute counts and percentages ===
# Percentages are relative to all users, not to the addicted subset.
total_users = len(summary_df)
count_addicted = len(addicted_df)
count_2plus = len(addicted_2plus_df)
count_3only = len(addicted_3only_df)

percent_addicted = (count_addicted / total_users) * 100
percent_2plus = (count_2plus / total_users) * 100
percent_3only = (count_3only / total_users) * 100

# === Print summary metrics ===
print(f"🔥 Total addicted users: {count_addicted} ({percent_addicted:.2f}%)")
print(f"⚠️ Addicted with ≥2 risky behaviors: {count_2plus} ({percent_2plus:.2f}%)")
print(f"🚨 Addicted with all 3 risky behaviors: {count_3only} ({percent_3only:.2f}%)")

# === Display user details for 2+ and 3-risky feature groups ===
display_cols = ['UserID', 'behavioral_risk_score'] + flag_cols

print("\n🧾 Users addicted with ≥2 risky behaviors:")
print(addicted_2plus_df[display_cols].head())

if count_3only > 0:
    print("\n🧨 Users addicted with ALL 3 risky behaviors:")
    print(addicted_3only_df[display_cols].head())
else:
    print("\n❗ No users found who are addicted with all 3 risky behaviors.")

# === Bar Plot: Addicted User Counts by Severity ===
plt.figure(figsize=(7, 5))
plt.bar(
    ['Addicted (≥1)', 'Addicted + ≥2', 'Addicted + all 3'],
    [count_addicted, count_2plus, count_3only],
    color=['#1f77b4', '#ff7f0e', '#d62728']
)
plt.title("Addicted Users by Behavioral Severity")
plt.ylabel("Number of Users")
plt.xticks(rotation=15)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# === Venn Diagram: Addicted vs ≥2 Behavioral Features ===
# NOTE: 'addicted' is defined above as score >= 1, so every user with
# score >= 2 is also in set_addicted and the "≥2 only" region is empty by
# construction.
set_addicted = set(addicted_df['UserID'])
set_risk_2plus = set(summary_df[summary_df['behavioral_risk_score'] >= 2]['UserID'])

plt.figure(figsize=(6, 5))
# venn2 subsets are given as (left only, right only, both).
venn2(
    subsets=(
        len(set_addicted - set_risk_2plus),
        len(set_risk_2plus - set_addicted),
        len(set_addicted & set_risk_2plus)
    ),
    set_labels=('Addicted', '≥2 Risky Features')
)
plt.title("Venn Diagram: Addicted vs ≥2 Risky Features")
plt.show()

# === Venn Diagram: Addicted vs All 3 Risky Features ===
# Same containment applies: score == 3 implies score >= 1.
set_risk_3only = set(summary_df[summary_df['behavioral_risk_score'] == 3]['UserID'])

plt.figure(figsize=(6, 5))
venn2(
    subsets=(
        len(set_addicted - set_risk_3only),
        len(set_risk_3only - set_addicted),
        len(set_addicted & set_risk_3only)
    ),
    set_labels=('Addicted', 'All 3 Risky Features')
)
plt.title("Venn Diagram: Addicted vs All 3 Risky Features")
plt.show()

In [None]:
# === Behavioral features and their 90th-percentile cut-offs ===
behavioral_feats = ['engage_cash_Windows_sum', 'engage_tourn_Trnmnts_sum', 'monetary_deposit_Amount_sum']
thresholds = {}
for feat in behavioral_feats:
    thresholds[feat] = summary_df[feat].quantile(0.9)

# === One boolean flag per feature: True when the value is strictly above its cut-off ===
for feat, cutoff in thresholds.items():
    summary_df[f'{feat}_flag'] = summary_df[feat] > cutoff

# === Short aliases for the flag columns ===
flag_aliases = {
    'cash_flag': 'engage_cash_Windows_sum_flag',
    'tourn_flag': 'engage_tourn_Trnmnts_sum_flag',
    'dep_flag': 'monetary_deposit_Amount_sum_flag',
}
for alias, flag_col in flag_aliases.items():
    summary_df[alias] = summary_df[flag_col]

# === Addiction source: which combination of the three flags is set ===
cash = summary_df['cash_flag']
tourn = summary_df['tourn_flag']
dep = summary_df['dep_flag']

# The seven rules are mutually exclusive; rows matching none of them get 'None'.
source_rules = [
    ('Cash Only',            cash & ~tourn & ~dep),
    ('Tournament Only',      ~cash & tourn & ~dep),
    ('Deposit Only',         ~cash & ~tourn & dep),
    ('Cash & Tournament',    cash & tourn & ~dep),
    ('Cash & Deposit',       cash & ~tourn & dep),
    ('Tournament & Deposit', ~cash & tourn & dep),
    ('All Three',            cash & tourn & dep),
]
labels = [rule_name for rule_name, _ in source_rules]
conditions = [rule_mask for _, rule_mask in source_rules]

summary_df['addiction_source'] = np.select(conditions, labels, default='None')

# === Addicted subset and headline counts ===
addicted_only = summary_df.loc[summary_df['is_addicted'] == 1]
total_users = summary_df.shape[0]
total_addicted = addicted_only.shape[0]
percent_addicted = (total_addicted / total_users) * 100

# === Addicted users per addiction source (sources with no users show as 0) ===
group_counts = addicted_only['addiction_source'].value_counts().reindex(labels + ['None'], fill_value=0)

# === Users (addicted or not) with none of the three flags set ===
none = summary_df[~(cash | tourn | dep)].shape[0]

# === Venn diagram: overlap of the three flags among addicted users ===
set_cash = set(addicted_only.loc[addicted_only['cash_flag'], 'UserID'])
set_tourn = set(addicted_only.loc[addicted_only['tourn_flag'], 'UserID'])
set_deposit = set(addicted_only.loc[addicted_only['dep_flag'], 'UserID'])

plt.figure(figsize=(7, 6))
venn3(
    subsets=(set_cash, set_tourn, set_deposit),
    set_labels=('Cash', 'Tournament', 'Deposit')
)
plt.title("Venn Diagram of Addicted Users by Risky Behavior Type")
plt.tight_layout()
plt.show()

# === Printed breakdown ===
print(f"🧍 Total users: {total_users}")
print(f"🔥 Total 'addicted' users: {total_addicted} ({percent_addicted:.2f}%)\n")

breakdown_lines = [
    ("💰 Cash-only addicted users", 'Cash Only'),
    ("🏆 Tournament-only addicted users", 'Tournament Only'),
    ("💵 Deposit-only addicted users", 'Deposit Only'),
    ("💥 Cash & Tournament addicts", 'Cash & Tournament'),
    ("💳 Cash & Deposit addicts", 'Cash & Deposit'),
    ("🎯 Tournament & Deposit addicts", 'Tournament & Deposit'),
    ("⚡️ Addicted to all three behaviors", 'All Three'),
]
for caption, source_name in breakdown_lines:
    print(f"{caption}: {group_counts[source_name]}")
print(f"🟢 Not flagged by any of the 3 features: {none}")

# === 90th-percentile thresholds, repeated for reference ===
print("\n 90th Percentile Thresholds for Behavioral Risk:")
for feat in behavioral_feats:
    print(f"{feat}: {thresholds[feat]:.2f}")

In [None]:
# Features whose distributions are inspected
features = ['engage_cash_Windows_sum', 'engage_tourn_Trnmnts_sum', 'monetary_deposit_Amount_sum']

# One boxplot per feature, placed in the right-hand column (slots 2, 4, 6) of a 3x2 grid
fig = plt.figure(figsize=(15, 10))

for slot, feature in zip(range(2, 2 * len(features) + 1, 2), features):
    ax = fig.add_subplot(3, 2, slot)
    sns.boxplot(x=summary_df[feature], ax=ax)
    ax.set_title(f"Boxplot: {feature}")
    ax.set_xlabel('Value')

plt.tight_layout()
plt.show()

🔹 Old logic (seven manually selected features):
The old labeling logic identified users as addicted if they crossed the 90th percentile threshold on any of seven manually selected behavioral features. These features were chosen to reflect activity across three key behavioral types: cash games, tournaments, and monetary deposits. Importantly, each feature was treated as equally indicative of addiction risk, without accounting for feature importance or statistical interaction.

🔹 New logic (top three SHAP-ranked features):
The new labeling logic used only the top three most impactful features, based on their median SHAP values across multiple model runs. Users were labeled as addicted if they crossed the 90th percentile on at least one of these top features, grounding the labeling in model-derived importance rather than manual intuition.

Throughout the rest of this notebook, “old logic” refers to labels derived from the seven manually chosen behavioral features, and “new logic” refers to labels derived from the top three SHAP-ranked features.

In [None]:
# === COMPARISON OF OLD LOGIC (looking at all features to determine addiction) vs. NEW LOGIC (looking at top 3 features based on highest median SHAP values)
# === OLD LOGIC ===
# A user is "old-logic addicted" when strictly above the 90th percentile on ANY of these seven features.
old_logic_features = [
    'engage_cash_Windows_sum',
    'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts_sum',
    'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean',
    'monetary_deposit_Amount_sum',
    'monetary_deposit_Amount_count',
]
old_logic_flags = [
    summary_df[col] > summary_df[col].quantile(0.90)
    for col in old_logic_features
]
summary_df['old_is_addicted'] = np.logical_or.reduce(old_logic_flags).astype(int)

# === NEW LOGIC ===
# This used to be a placeholder that copied the old labels and randomly flipped 28% of the
# positives to 0, so every number and chart below was fabricated rather than measured.
# Use the label column the rest of the notebook treats as ground truth instead.
# NOTE(review): `is_addicted` is assumed to be the SHAP top-3 label built earlier in the
# notebook (later cells call it the "SHAP-driven ground truth") — confirm before publishing.
summary_df['new_is_addicted'] = summary_df['is_addicted'].astype(int)

# === LABEL TRANSITION ANALYSIS ===
summary_df['label_change'] = summary_df['old_is_addicted'].astype(str) + " → " + summary_df['new_is_addicted'].astype(str)
label_change_counts = summary_df['label_change'].value_counts().sort_index()
print("\n Addiction Label Changes (Old → New):")
print(label_change_counts)

# === PERCENTAGE SUMMARY ===
old_addicted_pct = summary_df['old_is_addicted'].mean() * 100
new_addicted_pct = summary_df['new_is_addicted'].mean() * 100
percent_drop = old_addicted_pct - new_addicted_pct

summary = {
    "Total Users": len(summary_df),
    "Old Addicted Count": summary_df['old_is_addicted'].sum(),
    "New Addicted Count": summary_df['new_is_addicted'].sum(),
    "Old Addicted %": round(old_addicted_pct, 2),
    "New Addicted %": round(new_addicted_pct, 2),
    "Percentage Point Drop": round(percent_drop, 2),
    "Users Reclassified as Not Addicted": (summary_df['label_change'] == "1 → 0").sum(),
    # With real labels the opposite transition can occur too, so report it as well.
    "Users Reclassified as Addicted": (summary_df['label_change'] == "0 → 1").sum()
}

summary_df_display = pd.DataFrame([summary])
print("\n Label Change Summary:")
print(summary_df_display)

# === VISUALIZATION ===

# Bar Chart: Old → New Label Transitions
plt.figure(figsize=(8, 5))
sns.barplot(x=label_change_counts.index, y=label_change_counts.values, palette='pastel')
plt.title("Addiction Label Changes (Old Logic → New Logic)", fontsize=14)
plt.ylabel("Number of Users")
plt.xlabel("Label Transition")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Pie Charts: Addiction Distribution Under Old vs New Logic (Side-by-Side)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Reindex to [0, 1] so the two wedge labels always line up with the counts,
# even if one class happens to be absent.
old_label_counts = summary_df['old_is_addicted'].value_counts().reindex([0, 1], fill_value=0)
axs[0].pie(
    old_label_counts,
    labels=['Not Addicted', 'Addicted'],
    autopct='%1.1f%%',
    colors=['lightgrey', 'salmon'],
    startangle=140
)
axs[0].set_title("Addiction Prevalence: Old Logic")

new_label_counts = summary_df['new_is_addicted'].value_counts().reindex([0, 1], fill_value=0)
axs[1].pie(
    new_label_counts,
    labels=['Not Addicted', 'Addicted'],
    autopct='%1.1f%%',
    colors=['lightgrey', 'coral'],
    startangle=140
)
axs[1].set_title("Addiction Prevalence: New Logic")

plt.tight_layout()
plt.show()

In [None]:
# === Merge demographics with main dataset ===
merged_df = pd.merge(summary_df, demographics, on='UserID', how='left')

# === Create age group bins ===
# Bins are right-inclusive: (0, 24], (24, 34], ... (64, 100]; ages outside (0, 100] become NaN.
age_labels = ['18–24', '25–34', '35–44', '45–54', '55–64', '65+']
merged_df['AgeGroup'] = pd.cut(
    merged_df['SystemAgeAsOfReg'],
    bins=[0, 24, 34, 44, 54, 64, 100],
    labels=age_labels
)

# === Ensure addiction type column is present ===
# (Update this if you're using a different logic source like 'addiction_source')
if 'addiction_type_3cat' not in merged_df.columns:
    merged_df['addiction_type_3cat'] = merged_df['addiction_source']  # or assign as needed

# === Crosstab: Addiction type by Gender and AgeGroup ===
# Rows are (addiction type, gender) pairs, columns are age groups.
demographic_counts = merged_df.groupby(['addiction_type_3cat', 'Gender', 'AgeGroup']) \
                              .size().unstack(fill_value=0)

print("\n Addiction type breakdown by gender and age group:")
print(demographic_counts)

# === Heatmap Visualization ===
plt.figure(figsize=(10, 6))
sns.heatmap(demographic_counts, annot=True, fmt='d', cmap='YlOrBr', cbar=True, linewidths=0.5)
plt.title("Addiction Types by Gender and Age Group", fontsize=14)
plt.ylabel("Addiction Type – Gender")  # each heatmap row is a (type, gender) pair
plt.xlabel("Age Group")
plt.tight_layout()
plt.show()

# === Stacked Bar Chart: Addiction Type Distribution by Gender ===
gender_counts = merged_df.groupby(['Gender', 'addiction_type_3cat']) \
                         .size().unstack().fillna(0)
gender_counts.plot(kind='bar', stacked=True, figsize=(9, 5), colormap='Accent')
plt.title("Addiction Type Distribution by Gender", fontsize=14)
plt.ylabel("Number of Users")
plt.xlabel("Gender")
plt.xticks(rotation=0)
plt.legend(title="Addiction Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# === Calculate Percentages ===
# The statistics printed below are described as shares "of addicted users", so they must be
# computed on addicted users only. Previously they were computed over every user in
# merged_df, including users who were not addicted.
addicted_demo = merged_df[merged_df['is_addicted'] == 1]

# Addicted users per AgeGroup; reindex so every age label exists even when a group is empty
age_totals = addicted_demo['AgeGroup'].value_counts().reindex(age_labels, fill_value=0)
age_percent = (age_totals / age_totals.sum()) * 100

# Addicted users per Gender
gender_totals = addicted_demo.groupby('Gender').size()
gender_percent = (gender_totals / gender_totals.sum()) * 100

# Combine 25–34 and 35–44 into a single age bucket
combined_25_44 = age_percent['25–34'] + age_percent['35–44']

# === Summary Stats for Slides ===
# NOTE(review): assumes Gender is coded as 'M' / 'F'; other codes silently print 0.0% — confirm.
print("\n Summary Statistics for Slide:")
print(f"Male % of all addiction users: {gender_percent.get('M', 0):.1f}%")
print(f"Female % of all addiction users: {gender_percent.get('F', 0):.1f}%")
print(f"Combined age group 25–44: {combined_25_44:.1f}% of addicted users")
print(f"Older groups (55+): {(age_percent['55–64'] + age_percent['65+']):.1f}%")

In [None]:
# Users per (age group, addiction type), pivoted so that addiction types become columns
age_type_sizes = merged_df.groupby(['AgeGroup', 'addiction_type_3cat']).size()
age_group_counts = age_type_sizes.unstack(fill_value=0)

# Convert each age-group row to percentages of that row's total
users_per_age_group = age_group_counts.sum(axis=1)
age_group_percents = age_group_counts.div(users_per_age_group, axis=0) * 100

ax = age_group_percents.plot(kind='bar', stacked=True, figsize=(10, 5), colormap='Set2')
ax.set_ylabel("% within Age Group")
ax.set_title("Distribution of Addiction Types within Each Age Group")
ax.legend(title="Addiction Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Converts structured features into descriptive text per user
def generate_user_profiles(df, selected_cols, label_col='is_addicted'):
    """Turn each row of `df` into a one-sentence text profile.

    Args:
        df: DataFrame with a 'UserID' column, the label column, and feature columns.
        selected_cols: Feature column names to describe; names absent from the row are skipped.
        label_col: Column whose value 1 means "addicted"; any other value means "not addicted".

    Returns:
        A list of dicts, one per row in row order, each with keys 'UserID' and 'text'.
    """
    def _describe(row):
        uid = row['UserID']
        clauses = []
        for col in selected_cols:
            if col in row:
                # Underscores in the column name are replaced by spaces for readability
                clauses.append(f"{col.replace('_', ' ')} is {row[col]}")
        status = 'addicted' if row[label_col] == 1 else 'not addicted'
        sentence = f"User {uid} profile: " + ', '.join(clauses) + f". The user is {status}."
        return {'UserID': uid, 'text': sentence}

    return [_describe(row) for _, row in df.iterrows()]

In [None]:
# Behavioral columns described in each user's text profile
columns_to_include = [
    'engage_cash_Windows_sum', 'engage_cash_Windows_max_month',
    'engage_tourn_Trnmnts_sum', 'engage_tourn_Trnmnts_max_month',
    'engage_tourn_StakesT_mean',
    'monetary_deposit_Amount_sum', 'monetary_deposit_Amount_count',
]

# One text profile per row of summary_df
user_profiles = generate_user_profiles(df=summary_df, selected_cols=columns_to_include)

In [None]:
# BUILDING RAG SYSTEM
# The API key is supplied through the TOGETHER_API_KEY environment variable.
# SECURITY: a literal key used to be hard-coded in this cell. Treat that key as leaked and
# revoke it. Provide the key through the environment (e.g. Colab secrets); if it is not set,
# prompt for it without echoing so it never lands in the notebook source or its outputs.
if not os.environ.get('TOGETHER_API_KEY'):
    from getpass import getpass  # local import: only needed for this interactive fallback
    os.environ['TOGETHER_API_KEY'] = getpass("Enter your Together API key: ")

# Load Together-hosted LLM (e.g. Mistral)
llm = Together(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    temperature=0.4,
    max_tokens=256
)

In [None]:
# SHAP explanations for the evaluation (holdout) set
shap_values = explainer(X_holdout)
model_features = list(X_holdout.columns)   # feature order matches the SHAP value columns
shap_values_array = shap_values.values     # shape: (n_samples, n_features)

# Holdout features plus the UserID looked up in summary_df by the holdout index
X_holdout_with_ids = X_holdout.copy()
X_holdout_with_ids['UserID'] = summary_df.loc[X_holdout.index, 'UserID'].values

# New SHAP column name -> model feature whose SHAP values it stores
shap_column_sources = {
    'shap_deposit': 'monetary_deposit_Amount_max_month',
    'shap_tourn': 'engage_tourn_Trnmnts_count',
    'shap_cash': 'engage_cash_Windows_count',
}

summary_shap_df = X_holdout_with_ids.copy()
for shap_col, source_feature in shap_column_sources.items():
    feature_pos = model_features.index(source_feature)
    summary_shap_df[shap_col] = shap_values_array[:, feature_pos]

# Model predictions for the same holdout rows
xgb_preds = model.predict(X_holdout)
summary_shap_df['xgb_pred'] = xgb_preds

rag_user_ids = [entry['UserID'] for entry in user_profiles]

# Reproducible 50-user sample that feeds the RAG document store
summary_rag_df = summary_shap_df.sample(n=50, random_state=SEED).copy()

In [None]:
# === Step 1: Format enhanced user text using top 3 SHAP features ===
def create_enhanced_user_text(row):
    """Render one user's features, model prediction and SHAP contributions as text.

    `row` must support key lookup for 'UserID', the three feature columns,
    'xgb_pred', and the three 'shap_*' columns. Returns a multi-line string
    with no trailing newline.
    """
    verdict = 'addicted' if row['xgb_pred'] == 1 else 'not addicted'
    lines = [
        f"User {row['UserID']} has the following behavioral profile:",
        f"- Maximum monthly deposit: ${row['monetary_deposit_Amount_max_month']:.2f}",
        f"- Tournament count: {row['engage_tourn_Trnmnts_count']}",
        f"- Cash session windows: {row['engage_cash_Windows_count']}",
        "",  # blank line between the profile and the model output
        f"The XGBoost model predicts this user as {verdict}.",
        "SHAP feature contributions were:",
        f"- Deposit amount: {row['shap_deposit']:.3f}",
        f"- Tournaments: {row['shap_tourn']:.3f}",
        f"- Cash windows: {row['shap_cash']:.3f}",
    ]
    return "\n".join(lines)

# One {'UserID', 'text'} record per sampled user, in the order of summary_rag_df
user_profiles = [
    {"UserID": row['UserID'], "text": create_enhanced_user_text(row)}
    for _, row in summary_rag_df.iterrows()
]

In [None]:
# === Wrap each profile in a LangChain Document, keeping the UserID as metadata ===
docs = []
for profile in user_profiles:
    docs.append(
        Document(page_content=profile['text'], metadata={'UserID': profile['UserID']})
    )

# === MiniLM embeddings stored in a FAISS index ===
embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
vectorstore = FAISS.from_documents(docs, embedding_model)

# === Retriever (top 3 documents) feeding a "stuff" RetrievalQA chain ===
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# === Stratified Sampling and Random Shuffle ===
# Sample 50 users total: 25 addicted and 25 not addicted (adjust as needed)
# Query loop with optimized prompt and strict logic for LLM prediction & reasoning
n_per_class = 25
sampled_df = summary_df.groupby('is_addicted', group_keys=False).apply(
    lambda x: x.sample(n=n_per_class, random_state=SEED)
).reset_index(drop=True)

# Generate new user profiles just for this sample
stratified_user_profiles = []
user_ids = sampled_df['UserID'].values

for _, row in sampled_df.iterrows():
    user_id = row['UserID']
    features = [f"{col.replace('_', ' ')} is {row[col]}" for col in columns_to_include if col in row]
    # The profile must NOT contain the ground-truth label. It used to end with
    # "The user is addicted/not addicted.", which put the answer in the prompt
    # (label leakage) and made the evaluation meaningless.
    text = f"User {user_id} profile: " + ', '.join(features) + "."
    stratified_user_profiles.append({'UserID': user_id, 'text': text})

# Shuffle the stratified list (in-place) so the classes are interleaved
random.seed(SEED)
random.shuffle(stratified_user_profiles)

# === RAG Query Loop ===
results = []

for profile in stratified_user_profiles:
    full_prompt = profile['text'] + (
        "\n\nQUESTION: Based on the behavioral profile and SHAP explanations above, "
        "does this user show signs of online gambling addiction?\n"
        "Answer 'Yes' or 'No' at the start of your response, then briefly explain why."
    )

    # Best effort: a failed call is recorded as an "Error: ..." response, which starts
    # with neither "Yes" nor "No".
    try:
        response = rag_chain.run(full_prompt)
    except Exception as e:
        response = f"Error: {str(e)}"

    # The true label is looked up separately and never shown to the LLM
    true_label = sampled_df.loc[sampled_df['UserID'] == profile['UserID'], 'is_addicted'].values[0]

    results.append({
        'UserID': profile['UserID'],
        'Prompt': full_prompt,
        'LLM_Response': response,
        'True_Label': true_label
    })

    time.sleep(1.1)  # Stay under rate limit

In [None]:
# === Convert LLM response to prediction + extract reasoning ===
def extract_prediction(response):
    """Map an LLM response to a binary prediction.

    Returns 1 if the response begins with the word "yes", 0 if it begins with the
    word "no" (case-insensitive; leading whitespace, quotes and markdown emphasis
    characters are ignored), and NaN for anything else, including non-string input.

    The match is on a whole word. A plain prefix test would misread responses such
    as "Not enough information" or "Nothing suggests ..." as a "no" verdict.
    """
    if not isinstance(response, str):
        return np.nan
    verdict = re.match(r"""[\s"'*_`]*(yes|no)\b""", response, flags=re.IGNORECASE)
    if verdict is None:
        return np.nan
    return 1 if verdict.group(1).lower() == "yes" else 0

def extract_reasoning(response):
    """Return the explanation that follows a leading Yes/No verdict.

    The verdict must be the first word of the response (case-insensitive; leading
    whitespace, quotes and markdown emphasis characters are ignored). Everything
    after the verdict and its trailing punctuation is returned, stripped, with any
    line breaks kept. Returns "" for non-string input or when the response does not
    begin with a verdict.

    Anchoring at the start stops the pattern from matching "no" inside words such
    as "know", and DOTALL keeps multi-line explanations intact.
    """
    if not isinstance(response, str):
        return ""
    match = re.match(
        r"""[\s"'*_`]*(?:yes|no)\b[\s"'*_`,:;.!-]*(.*)""",
        response,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return match.group(1).strip() if match else ""

# One row per queried user; add the parsed prediction and explanation columns
rag_results_df = pd.DataFrame(results)
llm_responses = rag_results_df['LLM_Response']
rag_results_df['LLM_Pred'] = llm_responses.apply(extract_prediction)
rag_results_df['LLM_Reasoning'] = llm_responses.apply(extract_reasoning)

# Show the first line of the first ten raw responses
for first_line in (text.split("\n")[0] for text in llm_responses.head(10)):
    print("👉", repr(first_line))

In [None]:
# === Filter for valid predictions ===
valid_preds = rag_results_df.dropna(subset=['LLM_Pred'])

# === Log Skipped Users ===
skipped_preds = rag_results_df[rag_results_df['LLM_Pred'].isna()].copy()

print(f"\n Skipped {len(skipped_preds)} users due to ambiguous or invalid LLM responses.")
print("🔍 Here are the first 5 skipped responses:\n")

for _, row in skipped_preds.head(5).iterrows():
    print(f"UserID: {row['UserID']}")
    print(f"LLM Response:\n{row['LLM_Response']}\n{'-'*60}")

# === Classification Report + Metrics ===
# LLM_Pred is float because the column held NaN before filtering; cast both sides to int
# so the report is keyed by the integer classes 0 and 1.
y_true_valid = valid_preds['True_Label'].astype(int)
y_pred_valid = valid_preds['LLM_Pred'].astype(int)

print(" Classification Report for LLM vs SHAP-Driven Ground Truth:")
if valid_preds.empty:
    # Nothing to score, e.g. every LLM call failed or every response was ambiguous
    auc_score = np.nan
    print(" No valid LLM predictions to evaluate.")
else:
    # labels=[0, 1] keeps both classes in the report even if one is absent after filtering
    print(classification_report(y_true_valid, y_pred_valid, labels=[0, 1], zero_division=0))

    if y_true_valid.nunique() < 2:
        # roc_auc_score raises ValueError when only one class is present in y_true
        auc_score = np.nan
        print(" AUC Score: undefined (only one class present among valid predictions)")
    else:
        auc_score = roc_auc_score(y_true_valid, y_pred_valid)
        print(f" AUC Score: {auc_score:.3f}")

In [None]:
# Number of independent runs
N_RUNS = 5
SEED_BASE = 42

# For collecting metrics (one entry per run that produced at least one valid prediction)
all_reports = []
all_aucs = []

for i in range(N_RUNS):
    # Per-run seed kept in its own variable: assigning to the notebook-wide SEED here would
    # silently change the seed used by every other cell that is (re-)run afterwards.
    run_seed = SEED_BASE + i

    # === Stratified Sampling ===
    sampled_df = summary_df.groupby('is_addicted', group_keys=False).apply(
        lambda x: x.sample(n=n_per_class, random_state=run_seed)
    ).reset_index(drop=True)

    # === Generate User Profiles ===
    stratified_user_profiles = []
    for _, row in sampled_df.iterrows():
        user_id = row['UserID']
        features = [f"{col.replace('_', ' ')} is {row[col]}" for col in columns_to_include if col in row]
        # The ground-truth label is deliberately left out of the prompt text: including
        # "The user is addicted/not addicted." hands the LLM the answer (label leakage).
        text = f"User {user_id} profile: " + ', '.join(features) + "."
        stratified_user_profiles.append({'UserID': user_id, 'text': text})

    random.seed(run_seed)
    random.shuffle(stratified_user_profiles)

    # === RAG Query Loop ===
    results = []
    for profile in stratified_user_profiles:
        full_prompt = profile['text'] + (
            "\n\nQUESTION: Based on the behavioral profile and SHAP explanations above, "
            "does this user show signs of online gambling addiction?\n"
            "Answer 'Yes' or 'No' at the start of your response, then briefly explain why."
        )

        # Best effort: a failed call is recorded as an "Error: ..." response
        try:
            response = rag_chain.run(full_prompt)
        except Exception as e:
            response = f"Error: {str(e)}"

        true_label = sampled_df.loc[sampled_df['UserID'] == profile['UserID'], 'is_addicted'].values[0]

        results.append({
            'UserID': profile['UserID'],
            'Prompt': full_prompt,
            'LLM_Response': response,
            'True_Label': true_label
        })

        time.sleep(1.1)  # Stay under rate limit

    # === Prediction + Reasoning Extraction ===
    rag_results_df = pd.DataFrame(results)
    rag_results_df['LLM_Pred'] = rag_results_df['LLM_Response'].apply(extract_prediction)
    rag_results_df['LLM_Reasoning'] = rag_results_df['LLM_Response'].apply(extract_reasoning)

    valid_preds = rag_results_df.dropna(subset=['LLM_Pred'])
    if valid_preds.empty:
        # Every response in this run was ambiguous or an error: nothing to score
        print(f"Run {i + 1}: no valid LLM predictions, skipping metrics.")
        continue

    # === Classification Metrics ===
    # Cast to int (LLM_Pred is float because the column held NaN) and pin labels=[0, 1] so the
    # report always has the '0' and '1' keys that the aggregation below reads.
    y_true = valid_preds['True_Label'].astype(int)
    y_pred = valid_preds['LLM_Pred'].astype(int)
    report = classification_report(y_true, y_pred, labels=[0, 1], output_dict=True, zero_division=0)
    # roc_auc_score raises ValueError when y_true has a single class; record NaN instead
    auc = roc_auc_score(y_true, y_pred) if y_true.nunique() == 2 else np.nan

    all_reports.append(report)
    all_aucs.append(auc)

# === Aggregate and Print Averages ===
def avg_metric(metric_name, label):
    """Mean of `metric_name` for class/average `label` across all collected run reports."""
    return np.mean([r[label][metric_name] for r in all_reports])

print("\n📊 Averaged Classification Results ({} runs):".format(N_RUNS))
for label in ['0', '1', 'macro avg', 'weighted avg']:
    print(f"\nLabel {label}:")
    for metric in ['precision', 'recall', 'f1-score']:
        mean = avg_metric(metric, label)
        std = np.std([r[label][metric] for r in all_reports])
        print(f"  {metric}: {mean:.3f} ± {std:.3f}")

# nan-aware aggregation: runs whose AUC was undefined are left out of the average
print(f"\n Average AUC: {np.nanmean(all_aucs):.3f} ± {np.nanstd(all_aucs):.3f}")

In [None]:
print("\n Structured LLM Predictions with Reasoning:")

# One three-line entry per valid prediction, followed by a divider line
divider = '-' * 60
for _, pred_row in valid_preds.iterrows():
    verdict = 'Yes' if pred_row['LLM_Pred'] == 1 else 'No'
    entry = "\n".join([
        f"UserID: {pred_row['UserID']}",
        f"Prediction: {verdict}",
        f"Explanation: {pred_row['LLM_Reasoning']}",
        divider,
    ])
    print(entry)

In [None]:
# === Step 9: Confusion Matrix ===
# labels=[0, 1] forces a 2x2 matrix in a fixed order. Without it, a class that is missing from
# both columns shrinks the matrix and the hard-coded tick labels below no longer match it.
cm = confusion_matrix(valid_preds['True_Label'], valid_preds['LLM_Pred'], labels=[0, 1])

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted Not Addicted', 'Predicted Addicted'],
            yticklabels=['Actual Not Addicted', 'Actual Addicted'])
plt.title(f' Confusion Matrix: Valid LLM Predictions vs SHAP Labels ({len(valid_preds)} of {len(rag_results_df)} Users)')
plt.xlabel('LLM Prediction')
plt.ylabel('Ground Truth Label')
plt.tight_layout()
plt.show()

# === Step 10: Visual Summary of Predictions and Reasoning ===
summary_table = valid_preds[['UserID', 'LLM_Pred', 'True_Label', 'LLM_Reasoning']].copy()
summary_table['LLM_Pred'] = summary_table['LLM_Pred'].map({1: 'Addicted', 0: 'Not Addicted'})
summary_table['True_Label'] = summary_table['True_Label'].map({1: 'Addicted', 0: 'Not Addicted'})

# Optional: truncate reasoning to make it readable in console
summary_table['LLM_Reasoning'] = summary_table['LLM_Reasoning'].str.slice(0, 200)

print("\n Class Distribution in Valid Predictions:")
print(valid_preds['True_Label'].value_counts().rename({0: 'Not Addicted', 1: 'Addicted'}))

print(f"\n LLM Classification & Explanation Summary (First 10 of {len(valid_preds)} Users):\n")
print(summary_table.head(10).to_string(index=False))