In [1]:
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import torch
import joblib

In [2]:
# Memory cleanup
gc.collect()

# Load data
casino_data = pd.read_csv("casino_players_data.csv")

# **Dataset Analysis**
print("Basic dataset information:")
print(casino_data.info())
print("Target variable distribution (Churn):")
print(casino_data["Churn"].value_counts(normalize=True))
print("Feature statistics:")
print(casino_data.describe())

Basic dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 21 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Player_ID              100000 non-null  int64  
 1   Num_Sessions           100000 non-null  int64  
 2   Avg_Session_Time       100000 non-null  int64  
 3   Avg_Bet_Amount         100000 non-null  int64  
 4   Num_Wins               100000 non-null  int64  
 5   Num_Losses             100000 non-null  int64  
 6   Total_Winnings         100000 non-null  float64
 7   Total_Losses           100000 non-null  float64
 8   Net_Profit             100000 non-null  float64
 9   Favorite_Game          100000 non-null  object 
 10  Days_Since_Last_Play   100000 non-null  int64  
 11  Player_Type            100000 non-null  object 
 12  Active_Days_Per_Month  100000 non-null  int64  
 13  Used_Bonuses           100000 non-null  int64  
 14  Total_Depo

In [3]:
# Convert categorical columns to numerical values
encoder = LabelEncoder()
for col in ["Favorite_Game", "Player_Type"]:
    if col in casino_data.columns:
        casino_data[col] = encoder.fit_transform(casino_data[col])

In [4]:
# Remove highly correlated features
corr_matrix = casino_data.corr()
high_corr_features = set()
threshold_corr = 0.9  # Correlation threshold
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold_corr:
            colname = corr_matrix.columns[i]
            high_corr_features.add(colname)

casino_data = casino_data.drop(columns=high_corr_features)


In [5]:
# Prepare data
X = casino_data.drop(columns=[col for col in ["Churn", "Player_ID"] if col in casino_data.columns])
y = casino_data["Churn"]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Detect GPU availability for model training
gpu_available = torch.cuda.is_available()
if gpu_available:
    print("GPU available - enabling GPU support for models!")
else:
    print("GPU not available - training on CPU.")

# **Train initial RandomForest model for feature selection**
rf_initial = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
rf_initial.fit(X_train, y_train)


# **Select important features based on RandomForest model**
def select_important_features(model, X, threshold=0.01):
    feature_importances = model.feature_importances_
    important_features = [X.columns[i] for i in range(len(feature_importances)) if feature_importances[i] > threshold]
    return X[important_features]


X_train = select_important_features(rf_initial, X_train)
X_test = X_test[X_train.columns]  # Remove the same features from the test set

GPU not available - training on CPU.


In [10]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=150, max_depth=10, min_samples_split=5, class_weight='balanced',
                                           random_state=42, n_jobs=-1),
    "XGBoost": xgb.XGBClassifier(n_estimators=150, max_depth=6, reg_lambda=1.5, reg_alpha=0.5,
                                 eval_metric='logloss', verbosity=1),
    "LightGBM": lgb.LGBMClassifier(n_estimators=150, max_depth=6, lambda_l1=1.0, lambda_l2=1.0, verbose=1),
    "CatBoost": cb.CatBoostClassifier(n_estimators=150, depth=6, l2_leaf_reg=1.5, verbose=1,
                                      task_type='GPU' if gpu_available else 'CPU'),
    "LogisticRegression": LogisticRegression(class_weight='balanced', max_iter=500)
}

In [11]:
metrics_results = []
for model_name, model in models.items():
    print(f"Training model {model_name}...")
    model.fit(X_train, y_train)
    print(f"Model {model_name} training completed.")

    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba >= 0.8).astype(int)

    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)

    metrics_results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": report['1']['precision'],
        "Recall": report['1']['recall'],
        "F1-Score": report['1']['f1-score']
    })

    joblib.dump(model, f"{model_name}.pkl")

# Create comparison table
metrics_df = pd.DataFrame(metrics_results)
print(metrics_df)

Training model RandomForest...
Model RandomForest training completed.
Training model XGBoost...
Model XGBoost training completed.
Training model LightGBM...
[LightGBM] [Info] Number of positive: 48818, number of negative: 31182
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 607
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.610225 -> initscore=0.448258
[LightGBM] [Info] Start training from score 0.448258
Model LightGBM training completed.
Training model CatBoost...
0:	learn: 0.6641608	total: 8.09ms	remaining: 1.2s
1:	learn: 0.6367535	total: 16.4ms	remaining: 1.21s
2:	learn: 0.6108978	total: 24.8ms	remaining: 1.21s
3:	learn: 0.5865213	total: 33.2ms	remaining: 1.21s
4:	learn: 0.5634171	tota

In [12]:
# Save processed data to disk
casino_data.to_csv("casino_players_data_processed.csv", index=False)
print("Data has been saved to disk.")

Data has been saved to disk.
