In [None]:
!pip install catboost
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
import numpy as np
import joblib

# === Configuration === #
# Integer codes that the string values of the 'category' column are mapped to.
LABEL_MAPPING = dict(Beginner=0, Advanced=1)

# Keyword arguments unpacked into CatBoostClassifier for the main model.
CATBOOST_PARAMS = dict(
    iterations=1000,
    depth=6,
    learning_rate=0.01,
    loss_function="Logloss",
    eval_metric="Accuracy",
    random_seed=42,
    early_stopping_rounds=30,
    l2_leaf_reg=3.0,
)

# === Load Data === #
# The data arrives already split into three CSV files; read them in the order
# training, validation, testing.
train_data, validation_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/training_data_with_final.csv',
        '/content/drive/MyDrive/validation_data_with_final.csv',
        '/content/drive/MyDrive/testing_data_with_final.csv',
    )
)

# === Feature Selection === #
# Every column except 'Name' (presumably an identifier - confirm) and the
# target 'category' is used as a model input. The list is derived from the
# training frame and reused for the validation and test frames.
features = [col for col in train_data.columns if col not in ['Name', 'category']]


def _encode_labels(frame, dataset_name):
    """Return ``frame['category']`` mapped to integer codes via LABEL_MAPPING.

    ``Series.map`` turns every value that is absent from the mapping
    (including missing entries) into NaN, which cannot serve as a class
    label. Fail here with the offending values instead of passing NaN on.

    Args:
        frame: DataFrame containing a 'category' column.
        dataset_name: Name of the split, used only in the error message.

    Raises:
        ValueError: if any 'category' value is not a key of LABEL_MAPPING.
    """
    encoded = frame['category'].map(LABEL_MAPPING)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'category'].astype(str).unique())
        raise ValueError(
            f"{dataset_name} data has 'category' values not covered by "
            f"LABEL_MAPPING: {unknown}"
        )
    return encoded


X_train = train_data[features]
y_train = _encode_labels(train_data, "Training")

X_val = validation_data[features]
y_val = _encode_labels(validation_data, "Validation")

X_test = test_data[features]
y_test = _encode_labels(test_data, "Test")

# === Handle Class Imbalance with SMOTE === #
# Only the training split is resampled; X_val / X_test are left as loaded.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# === Prepare CatBoost Pools === #
# One Pool per split: resampled training data, then validation, then test.
train_pool, val_pool, test_pool = (
    Pool(data=split_features, label=split_labels)
    for split_features, split_labels in (
        (X_train_balanced, y_train_balanced),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# === Initialize CatBoost Model === #
catboost_model = CatBoostClassifier(**CATBOOST_PARAMS)

# === Train Model === #
# The validation pool is handed to fit as eval_set; CATBOOST_PARAMS also sets
# early_stopping_rounds.
catboost_model.fit(X=train_pool, eval_set=val_pool, verbose=50)

# === Evaluate Performance === #
def evaluate_model(model, X, y, dataset_name="Dataset"):
    """Print accuracy, ROC-AUC, a classification report and a confusion matrix.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X: Feature matrix to score.
        y: True labels, expected to use the integer codes of LABEL_MAPPING.
        dataset_name: Name of the split, used in the printed headings.

    Returns:
        Tuple ``(accuracy, roc_auc)``.

    Note:
        ROC-AUC is undefined when ``y`` holds a single class; whatever
        ``roc_auc_score`` does in that case is passed through unchanged.
    """
    predictions = model.predict(X)
    # Column 1 of predict_proba is used as the score for the class coded 1.
    probabilities = model.predict_proba(X)[:, 1]

    accuracy = accuracy_score(y, predictions)
    roc_auc = roc_auc_score(y, probabilities)

    # Tie every class name to its code explicitly instead of relying on the
    # dict's insertion order matching the sorted label values. Passing
    # `labels` also keeps the report and the matrix at a fixed size when a
    # split (or the predictions for it) happens to contain only one class.
    class_names = sorted(LABEL_MAPPING, key=LABEL_MAPPING.get)
    class_codes = [LABEL_MAPPING[name] for name in class_names]

    print(f"\n{dataset_name} Accuracy: {accuracy:.2f}")
    print(f"{dataset_name} ROC-AUC: {roc_auc:.2f}")
    print(f"\nClassification Report ({dataset_name}):")
    print(
        classification_report(
            y, predictions, labels=class_codes, target_names=class_names
        )
    )
    print(f"\nConfusion Matrix ({dataset_name}):")
    print(confusion_matrix(y, predictions, labels=class_codes))
    return accuracy, roc_auc

# Training, validation and test metrics, reported in that order.
# The "Training" figures are computed on the SMOTE-resampled training data.
for _split_features, _split_labels, _split_name in (
    (X_train_balanced, y_train_balanced, "Training"),
    (X_val, y_val, "Validation"),
    (X_test, y_test, "Test"),
):
    evaluate_model(catboost_model, _split_features, _split_labels, _split_name)

# === Save the Model === #
# Persist the fitted classifier with joblib, then confirm on stdout.
joblib.dump(
    value=catboost_model,
    filename='catboost_model_with_smote_weights.joblib',
)
print("CatBoost model saved as 'catboost_model_with_smote_weights.joblib'")

# === Optional: Cross-Validation === #
# Pick the fold count: 5, or the size of the smallest class in the resampled
# training labels when that is below 5.
_class_sizes = y_train_balanced.value_counts()
min_samples_per_class = min(_class_sizes)
n_splits = min_samples_per_class if min_samples_per_class < 5 else 5

# Function for cross-validation
def cross_validate_model(X, y, params, n_splits=n_splits, resampler=None):
    """Estimate accuracy with stratified, shuffled k-fold cross-validation.

    A fresh ``CatBoostClassifier(**params)`` is fitted on the training part of
    every fold and scored on the held-out part; the mean of the per-fold
    accuracies is returned.

    Args:
        X: pandas DataFrame of features (rows are selected with ``.iloc``).
        y: pandas Series of labels aligned with ``X``.
        params: Keyword arguments for ``CatBoostClassifier``.
        n_splits: Number of folds. The default is the value the module-level
            ``n_splits`` had when this function was defined.
        resampler: Optional object with a ``fit_resample(X, y)`` method, such
            as an imblearn ``SMOTE`` instance. When given, it is applied to
            the training part of each fold only, so held-out rows are never
            synthetic and never influenced the oversampling. Pass the
            original, un-resampled data as ``X``/``y`` in that case. With the
            default ``None`` no resampling happens inside the folds.

    Returns:
        Mean accuracy over all folds.
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]

        if resampler is not None:
            # Oversample after the split: resampling the whole data set first
            # would let synthetic neighbours of training rows land in the
            # held-out fold and inflate the score.
            X_train_cv, y_train_cv = resampler.fit_resample(X_train_cv, y_train_cv)

        # Train CatBoost
        # NOTE(review): no eval_set is passed here, so an
        # `early_stopping_rounds` entry in `params` presumably has nothing to
        # monitor - confirm against the CatBoost documentation.
        model = CatBoostClassifier(**params)
        model.fit(X_train_cv, y_train_cv, verbose=0)

        # Validate model
        y_pred = model.predict(X_val_cv)
        cv_scores.append(accuracy_score(y_val_cv, y_pred))

    return np.mean(cv_scores)

# Define CatBoost parameters
# NOTE(review): this rebinds CATBOOST_PARAMS. The values differ from the ones
# the model above was constructed with (100 iterations instead of 1000,
# learning rate 0.1 instead of 0.01, no l2_leaf_reg entry), so the
# cross-validation score describes a different configuration.
CATBOOST_PARAMS = {
    "iterations": 100,
    "learning_rate": 0.1,
    "depth": 6,
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "random_seed": 42,
    "verbose": 0,
    "early_stopping_rounds": 30,
}

# Perform cross-validation
# NOTE(review): X_train_balanced already contains SMOTE-generated rows, so the
# held-out folds include synthetic samples; the reported accuracy is likely
# optimistic. Consider splitting X_train / y_train and oversampling only the
# training part of each fold.
cross_val_accuracy = cross_validate_model(
    X=X_train_balanced,
    y=y_train_balanced,
    params=CATBOOST_PARAMS,
)
print(f"Cross-Validation Mean Accuracy: {cross_val_accuracy:.2f}")
