# STEP 1: Import

In [2]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings('ignore')

print("=" * 80)
print("PREDICTION WITH PRE-TRAINED OPTIMIZED MODELS")
print("=" * 80)

PREDICTION WITH PRE-TRAINED OPTIMIZED MODELS


# STEP 2: Load data

In [3]:
print("\n[2] Loading Data...")

# Directory that holds train.csv and test.csv.
# Set the TASK2_DATA_DIR environment variable to run on another machine;
# when it is unset the original location is used, so existing runs are unchanged.
DATA_DIR = os.environ.get(
    'TASK2_DATA_DIR',
    'C:\\Users\\Kanyavan\\Documents\\Year3_semester1\\ML\\CPE342-Hackathon\\task2',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))


[2] Loading Data...


# STEP 3: Split Features, Target and IDs

In [4]:
# Identifier columns are not predictive inputs; 'segment' is the label to predict.
ID_COLUMNS = ['id', 'player_id']
TARGET_COLUMN = 'segment'

# Training side: features without ids/label, and the label as integers.
X_train = train.drop(columns=ID_COLUMNS + [TARGET_COLUMN])
y_train = train[TARGET_COLUMN].astype(int)

# Test side: features without ids; the 'id' column is kept for the submission file.
X_test = test.drop(columns=ID_COLUMNS)
test_ids = test['id']

for split_name, frame in (("Train", train), ("Test", test)):
    print(f"{split_name} shape: {frame.shape}")

Train shape: (101658, 47)
Test shape: (25889, 46)


# STEP 4: Feature Engineering

In [4]:
print("\n[4] Feature Engineering...")


[4] Feature Engineering...


In [16]:
def engineer_features(df):
    """Return a copy of ``df`` with derived player features appended.

    Nine columns are added (spending, engagement, social and competitive
    signals) and the ``random_metric_1..3`` columns are removed when present.
    The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns read below (e.g. ``total_spending_thb``,
        ``login_streak``, ``team_play_percentage``).

    Returns
    -------
    pandas.DataFrame
        New frame: original columns (minus the random metrics) followed by
        the engineered columns.
    """

    def _product_or_zero(first, second, third):
        # Product of three raw columns; rows with a missing factor become 0.
        return (df[first] * df[second] * df[third]).fillna(0)

    enriched = df.assign(
        # Spending features (+1 in the denominator avoids dividing by zero hours)
        spending_per_hour=df['total_spending_thb'] / (df['total_playtime_hours'] + 1),
        spending_intensity=df['avg_monthly_spending'] * df['spending_frequency'],
        is_spender=(df['total_spending_thb'] > 0).astype(int),
        # Engagement features
        engagement_score=_product_or_zero('play_frequency', 'avg_session_duration', 'login_streak'),
        activity_consistency=df['login_streak'] / (df['days_since_last_login'] + 1),
        # Social features
        social_score=_product_or_zero('friend_count', 'team_play_percentage', 'chat_activity_score'),
        is_social_player=(df['team_play_percentage'] > 50).astype(int),
        # Competitive features
        competitive_score=_product_or_zero('ranked_participation_rate', 'tournament_entries', 'competitive_rank'),
        is_competitive=(df['ranked_participation_rate'] > 50).astype(int),
    )

    # Remove random metrics (ignored silently if a frame does not have them)
    noise_columns = ['random_metric_1', 'random_metric_2', 'random_metric_3']
    return enriched.drop(columns=noise_columns, errors='ignore')

# Apply the same feature engineering to both splits so their columns stay aligned.
X_train_eng, X_test_eng = (engineer_features(split) for split in (X_train, X_test))

engineered_feature_count = X_train_eng.shape[1]
print(f"Features after engineering: {engineered_feature_count}")

Features after engineering: 50


# STEP 5: Data Preprocessing

In [5]:
print("\n[5] Preprocessing...")


[5] Preprocessing...


In [18]:
# Partition the engineered training columns by dtype: numeric columns are
# imputed as numbers, object (string) columns are label-encoded.
numeric_frame = X_train_eng.select_dtypes(include=[np.number])
object_frame = X_train_eng.select_dtypes(include=['object'])

numerical_features = numeric_frame.columns.tolist()
categorical_features = object_frame.columns.tolist()

In [19]:
# Preprocess on copies: X_train_eng / X_test_eng stay exactly as feature
# engineering produced them, so this cell can be re-run without encoding
# already-encoded columns a second time.
X_train_prep = X_train_eng.copy()
X_test_prep = X_test_eng.copy()

# Label Encoding
# Each encoder is fitted on train + test values together so that transform()
# never meets an unseen label in the test split.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # astype(str) turns missing values into the literal string 'nan', so a
    # missing category is encoded as its own class rather than left as NaN.
    combined = pd.concat([X_train_prep[col], X_test_prep[col]], axis=0).astype(str)
    le.fit(combined)
    X_train_prep[col] = le.transform(X_train_prep[col].astype(str))
    X_test_prep[col] = le.transform(X_test_prep[col].astype(str))
    label_encoders[col] = le

# Imputation
# Numeric gaps (including NaN produced by the engineered ratio features) are
# filled with the training-set median.
num_imputer = SimpleImputer(strategy='median')
X_train_prep[numerical_features] = num_imputer.fit_transform(X_train_prep[numerical_features])
X_test_prep[numerical_features] = num_imputer.transform(X_test_prep[numerical_features])

if categorical_features:
    # After label encoding these columns hold integer codes with no NaN, so
    # this imputer has nothing to fill; it is kept to leave the pipeline steps
    # unchanged.
    # NOTE(review): confirm the training script treats missing categories the same way.
    cat_imputer = SimpleImputer(strategy='most_frequent')
    X_train_prep[categorical_features] = cat_imputer.fit_transform(X_train_prep[categorical_features])
    X_test_prep[categorical_features] = cat_imputer.transform(X_test_prep[categorical_features])

# Scaling
# The scaler is fitted here on the training split, not loaded from disk.
# NOTE(review): the pickled models presumably expect the scaling used when
# they were trained - confirm it matches this fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_prep)
X_test_scaled = scaler.transform(X_test_prep)

print("Preprocessing completed!")

Preprocessing completed!


# STEP 6: LOAD PRE-TRAINED OPTIMIZED MODELS

In [6]:
print("\n[6] Loading Pre-trained Optimized Models...")


[6] Loading Pre-trained Optimized Models...


In [21]:
def load_pretrained_model(pickle_path, display_name):
    """Unpickle one pre-trained model from ``pickle_path``.

    Prints a success line and returns the model, or prints a hint and returns
    ``None`` when the file does not exist. Only ``FileNotFoundError`` is
    handled; any other failure propagates.
    """
    try:
        # SECURITY: pickle.load executes arbitrary code from the file - only
        # load model files that you produced yourself.
        with open(pickle_path, 'rb') as handle:
            unpickled = pickle.load(handle)
    except FileNotFoundError:
        print(f"✗ {display_name} model not found. Run hyperparameter_tuning_optuna.py first!")
        return None
    print(f"✓ Loaded {display_name} model")
    return unpickled


# One variable per model; a missing file yields None instead of an error.
model_xgb = load_pretrained_model('model_xgboost_optimized.pkl', 'XGBoost')
model_lgbm = load_pretrained_model('model_lightgbm_optimized.pkl', 'LightGBM')
model_cat = load_pretrained_model('model_catboost_optimized.pkl', 'CatBoost')

✓ Loaded XGBoost model
✓ Loaded LightGBM model
✓ Loaded CatBoost model


In [22]:
# Check if at least one model is loaded.
# Map display name -> model, keeping only the models whose pickle was found
# (missing ones were set to None by the loading cell).
candidate_models = {
    'XGBoost': model_xgb,
    'LightGBM': model_lgbm,
    'CatBoost': model_cat,
}
loaded_models = {
    name: model for name, model in candidate_models.items() if model is not None
}

if not loaded_models:
    print("\n❌ ERROR: No optimized models found!")
    print("Please run 'hyperparameter_tuning_optuna.py' first to train and save models.")
    # Raise instead of calling exit(): inside a notebook exit() targets the
    # kernel rather than failing this cell, whereas an exception stops
    # "Run All" right here with a readable error.
    raise RuntimeError(
        "No optimized models found; run 'hyperparameter_tuning_optuna.py' first."
    )

print(f"\n✓ Successfully loaded {len(loaded_models)} model(s)")


✓ Successfully loaded 3 model(s)


# STEP 7: EVALUATE ON TRAINING DATA

In [23]:
# Banner number matches this section (STEP 7); it previously printed "[5]".
print("\n[7] Evaluating Models on Training Data...")

# Macro F1 of every loaded model on the training split.
# NOTE(review): if the models were fitted on this same data, these scores are
# optimistic and say little about performance on unseen players.
for name, model in loaded_models.items():
    # np.ravel flattens column-shaped (n, 1) predictions to 1-D, matching how
    # the ensemble cell treats each model's output.
    y_train_pred = np.ravel(model.predict(X_train_scaled))
    f1 = f1_score(y_train, y_train_pred, average='macro')
    print(f"{name:20s} | Training Macro F1: {f1:.4f}")


[5] Evaluating Models on Training Data...
XGBoost              | Training Macro F1: 0.9023
LightGBM             | Training Macro F1: 0.8719
CatBoost             | Training Macro F1: 0.8388


# STEP 8: CREATE ENSEMBLE

In [7]:
print("\n[8] Creating Ensemble...")


[8] Creating Ensemble...


In [26]:
def majority_vote(models, features):
    """Combine the models' predicted labels by per-sample majority (hard) vote.

    Parameters
    ----------
    models : dict
        Display name -> fitted model exposing ``predict``.
    features : array-like
        Preprocessed feature matrix passed to each model's ``predict``.

    Returns
    -------
    numpy.ndarray
        One label per sample: the most frequent label across the models.
        When several labels tie, ``scipy.stats.mode`` returns the smallest.
    """
    # np.ravel flattens column-shaped (n, 1) predictions so every model
    # contributes one 1-D row to the vote matrix.
    votes = np.vstack([np.ravel(model.predict(features)) for model in models.values()])
    return stats.mode(votes, axis=0, keepdims=False)[0]


if len(loaded_models) >= 2:
    # The original notes state the loaded models were trained on
    # SMOTE-balanced data, so no stacking classifier is refitted here; the
    # pre-trained models are combined by voting on their predicted labels.
    # This is hard (majority) voting - probabilities are not averaged.
    print("Creating Majority Voting Ensemble with loaded models...")

    # Ensemble prediction for the test set
    y_pred_ensemble = majority_vote(loaded_models, X_test_scaled)

    # Evaluate ensemble on training data
    y_train_pred_ensemble = majority_vote(loaded_models, X_train_scaled)
    ensemble_f1 = f1_score(y_train, y_train_pred_ensemble, average='macro')
    print(f"Ensemble (Voting)     | Training Macro F1: {ensemble_f1:.4f}")

    use_ensemble = True

else:
    print("Only one model loaded, using single model predictions...")
    use_ensemble = False

Creating Stacking Ensemble with loaded models...
Using Soft Voting Ensemble instead (better for pre-trained models)...
Ensemble (Voting)     | Training Macro F1: 0.8770


# STEP 9: MAKE FINAL PREDICTIONS

In [8]:
print("\n[9] Making Final Predictions...")


[9] Making Final Predictions...


In [28]:
# Choose the predictions that go into the submission file.
if use_ensemble:
    print("Using Ensemble predictions...")
    y_pred = y_pred_ensemble
    best_model_name = "Ensemble (Voting)"
else:
    # Use the single loaded model
    model_name = next(iter(loaded_models))
    print(f"Using {model_name} predictions...")
    # Flatten to 1-D: some models return column-shaped (n, 1) predictions,
    # which cannot be used as a single DataFrame column in the submission.
    y_pred = np.ravel(loaded_models[model_name].predict(X_test_scaled))
    best_model_name = model_name

Using Ensemble predictions...


# STEP 10: CREATE SUBMISSION

In [9]:
print("\n[10] Creating Submission File...")


[10] Creating Submission File...


In [30]:
# Two-column submission: the test-set id and the predicted segment as an integer.
predicted_segments = y_pred.astype(int)
submission = pd.DataFrame({'id': test_ids, 'segment': predicted_segments})

submission.to_csv('task2_submission_optimized.csv', index=False)

print("\n✓ Submission saved to 'task2_submission_optimized.csv'")
print(f"Predictions shape: {submission.shape}")

# Count of predictions per segment, ordered by segment label.
segment_counts = submission['segment'].value_counts().sort_index()
print("\nPredicted segment distribution:")
print(segment_counts)


✓ Submission saved to 'task2_submission_optimized.csv'
Predictions shape: (25889, 2)

Predicted segment distribution:
segment
0    10888
1     6308
2     4976
3     3717
Name: count, dtype: int64


## SUMMARY

In [31]:
# Closing banner naming the model used and the output file.
rule = "=" * 80
print("\n" + rule)
print("PREDICTION COMPLETED SUCCESSFULLY!")
print(rule)
print(f"Model used: {best_model_name}")
print("Submission file: task2_submission_optimized.csv")
print(rule)

# Display detailed classification report if ensemble
if use_ensemble:
    # NOTE(review): assumes labels 0..3 correspond to these names in this
    # order - confirm against the dataset description.
    segment_names = ['Casual', 'Grinder', 'Social', 'Whale']

    print("\nDetailed Ensemble Performance on Training Data:")
    report = classification_report(
        y_train,
        y_train_pred_ensemble,
        target_names=segment_names,
    )
    print(report)

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_train, y_train_pred_ensemble)
    print(cm)


PREDICTION COMPLETED SUCCESSFULLY!
Model used: Ensemble (Voting)
Submission file: task2_submission_optimized.csv

Detailed Ensemble Performance on Training Data:
              precision    recall  f1-score   support

      Casual       0.88      0.92      0.90     40064
     Grinder       0.87      0.86      0.87     25397
      Social       0.88      0.84      0.86     20549
       Whale       0.90      0.86      0.88     15648

    accuracy                           0.88    101658
   macro avg       0.88      0.87      0.88    101658
weighted avg       0.88      0.88      0.88    101658


Confusion Matrix:
[[36939  1819  1083   223]
 [ 2581 21815   488   513]
 [ 1971   610 17269   699]
 [  584   754   799 13511]]
