In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

### Loading data 

In [3]:
X_test = pd.read_csv('x_test.csv')
y_test = pd.read_csv('y_test.csv').squeeze()

X_train = pd.read_csv('x_train_balanced.csv')
y_train = pd.read_csv('y_train_balanced.csv').squeeze()  # Ensure it becomes a Series


### Modelling

In [4]:



# Define the models and parameter grids for hyperparameter tuning
models_params = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 50, 100, 150, 200],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [None, 3, 5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 16, 20],
            'min_samples_leaf': [1, 2, 4, 8]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(),
        'params': {
            'n_estimators': [50, 100, 200, 300],
            'learning_rate': [0.01, 0.1, 1.0]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100, 200, 250, 300],
            'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5, 0.001],
            'max_depth': [3, 6, 8, 10, 12, 15, 20, 25]
        }
    }
}

# Perform Grid Search with Cross-Validation
best_estimators = {}
for model_name, mp in models_params.items():
    random_search = RandomizedSearchCV(mp['model'], mp['params'], n_iter=5, cv=3, scoring='accuracy', random_state=42)
    random_search.fit(X_train, y_train)
    best_estimators[model_name] = random_search.best_estimator_
    print(f"Best parameters for {model_name}: {random_search.best_params_}")

# Evaluate the models
metrics = {}
for model_name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    metrics[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    }

# Display the metrics
for model_name, m in metrics.items():
    print(f"\nMetrics for {model_name}:")
    for metric_name, value in m.items():
        print(f"{metric_name}: {value:.4f}")

Best parameters for RandomForest: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 15}
Best parameters for AdaBoost: {'n_estimators': 100, 'learning_rate': 0.1}
Best parameters for XGBoost: {'n_estimators': 250, 'max_depth': 8, 'learning_rate': 0.2}

Metrics for RandomForest:
Accuracy: 0.6400
Precision: 0.7482
Recall: 0.7376
F1 Score: 0.7429
AUC: 0.6239

Metrics for AdaBoost:
Accuracy: 0.6050
Precision: 0.7541
Recall: 0.6525
F1 Score: 0.6996
AUC: 0.6346

Metrics for XGBoost:
Accuracy: 0.6400
Precision: 0.7315
Recall: 0.7730
F1 Score: 0.7517
AUC: 0.5984
