## Task 2: Shallow Learning Approaches

**Train and evaluate at least three classical models:**

- Random Forest, XGBoost / Gradient Boosting
  (others: Logistic Regression, LightGBM allowed as extras)
- Use Ensemble Learner

**Expected steps:**

- Tune hyperparameters through cross-validation
- Use stratification for splits
- Evaluate using accuracy, confusion matrix, F1

In [1]:
from preprocessor import preprocessor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import classification_report
from experiment_tracking import ExperimentTracker

In [2]:
X_train, X_test, y_train, y_test = preprocessor()
X_train.shape, y_train.shape, X_test.shape, y_test.shape

  data = pd.read_csv(


((5625, 46), (5625,), (1407, 46), (1407,))

In [3]:
mlflow = ExperimentTracker(experiment_name="AML Task 2 Experiment 1")


2025/12/20 13:56:17 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/20 13:56:17 INFO mlflow.store.db.utils: Updating database tables
2025/12/20 13:56:17 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/20 13:56:17 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/20 13:56:17 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/20 13:56:17 INFO alembic.runtime.migration: Will assume non-transactional DDL.


# RandomForest

In [4]:
rfc = RandomForestClassifier()

param_grid_rfc = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

with mlflow.start_run(run_name="Random Forest Classifier") as run:
    random_search = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid_rfc,
                                    n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    best_rfc = random_search.best_estimator_
    best_rfc
    y_pred_rfc = best_rfc.predict(X_test)
    y_proba_rfc = best_rfc.predict_proba(X_test)[:, 1]
    roc_auc_rfc = roc_auc_score(y_test, y_proba_rfc)
    accuracy_rfc = accuracy_score(y_test, y_pred_rfc)

    mlflow.log_metrics({
        "test_accuracy": accuracy_rfc,
        "test_roc_auc": roc_auc_rfc
    })

    print(f"ROC AUC: {roc_auc_rfc}")
    print(f"Accuracy: {accuracy_rfc}")
    print(classification_report(y_test, y_pred_rfc))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   1.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   1.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   2.4s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   2.4s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   

# Gradient Boosting

In [5]:
gbc = GradientBoostingClassifier()

param_grid_gbc = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'min_samples_split': [2, 5, 10]
}

with mlflow.start_run(run_name="Gradient Boosting Classifier") as run:
    random_search_gbc = RandomizedSearchCV(estimator=gbc, param_distributions=param_grid_gbc,
                                    n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search_gbc.fit(X_train, y_train)
    best_gbc = random_search_gbc.best_estimator_
    best_gbc
    y_pred_gbc = best_gbc.predict(X_test)
    y_proba_gbc = best_gbc.predict_proba(X_test)[:, 1]
    roc_auc_gbc = roc_auc_score(y_test, y_proba_gbc)
    accuracy_gbc = accuracy_score(y_test, y_pred_gbc)

    mlflow.log_metrics({
        "test_accuracy": accuracy_gbc,
        "test_roc_auc": roc_auc_gbc
    })

    print(f"ROC AUC: {roc_auc_gbc}")
    print(f"Accuracy: {accuracy_gbc}")
    print(classification_report(y_test, y_pred_gbc))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END learning_rate=0.05, max_depth=7, min_samples_split=5, n_estimators=200, subsample=1.0; total time=   3.3s
[CV] END learning_rate=0.05, max_depth=7, min_samples_split=5, n_estimators=200, subsample=1.0; total time=   3.3s
[CV] END learning_rate=0.05, max_depth=7, min_samples_split=5, n_estimators=200, subsample=1.0; total time=   3.7s
[CV] END learning_rate=0.01, max_depth=5, min_samples_split=2, n_estimators=400, subsample=0.6; total time=   3.1s
[CV] END learning_rate=0.01, max_depth=5, min_samples_split=2, n_estimators=400, subsample=0.6; total time=   3.1s
[CV] END learning_rate=0.01, max_depth=5, min_samples_split=2, n_estimators=400, subsample=0.6; total time=   3.6s
[CV] END learning_rate=0.05, max_depth=3, min_samples_split=5, n_estimators=200, subsample=0.6; total time=   1.0s
[CV] END learning_rate=0.05, max_depth=3, min_samples_split=5, n_estimators=200, subsample=0.6; total time=   1.0s
[CV] END learning_

# XGBoost

In [6]:
xgb = XGBClassifier(eval_metric='logloss')
param_grid_xgb = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
with mlflow.start_run(run_name="XGBoost Classifier") as run:
    random_search_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid_xgb,
                                    n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search_xgb.fit(X_train, y_train)
    best_xgb = random_search_xgb.best_estimator_
    best_xgb
    y_pred_xgb = best_xgb.predict(X_test)
    y_proba_xgb = best_xgb.predict_proba(X_test)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, y_proba_xgb)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

    mlflow.log_metrics({
        "test_accuracy": accuracy_xgb,
        "test_roc_auc": roc_auc_xgb
    })

    print(f"ROC AUC: {roc_auc_xgb}")
    print(f"Accuracy: {accuracy_xgb}")
    print(classification_report(y_test, y_pred_xgb))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimators=400, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimators=400, subsample=0.8; total time=   0.9s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimators=400, subsample=0.8; total time=   0.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=9, n_estimators=200, subsample=1.0; total time=   0.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=9, n_estimators=200, subsample=1.0; total time=   1.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=9, n_estimators=200, subsample=1.0; total time=   1.0s
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, n_estimators=200, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=5, n_estimators=200, subsample=0.6; total time=   0.3s
[CV] END col

# VotingClassifier 

In [7]:

voting_clf = VotingClassifier(estimators=[
    ('rfc', best_rfc),
    ('gbc', best_gbc),
    ('xgb', best_xgb)
], voting='soft')


with mlflow.start_run(run_name="Voting Classifier") as run:

    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, voting_clf.predict_proba(X_test)[:, 1])
    report = classification_report(y_test, y_pred)
    
    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_roc_auc": roc_auc
    })
    
    print(f"Test Accuracy: {accuracy}")
    print(f"Test ROC AUC: {roc_auc}")
    print(report)

Test Accuracy: 0.8017057569296375
Test ROC AUC: 0.8418826909215364
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1034
           1       0.65      0.54      0.59       373

    accuracy                           0.80      1407
   macro avg       0.75      0.72      0.73      1407
weighted avg       0.79      0.80      0.80      1407



# Stacking Classifier

In [8]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


stacking_clf = StackingClassifier(
    estimators=[
        ('rfc', best_rfc),
        ('gbc', best_gbc),
        ('xgb', best_xgb)
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

with mlflow.start_run(run_name="Stacking Classifier") as run:
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, stacking_clf.predict_proba(X_test)[:, 1])
    report = classification_report(y_test, y_pred)
    
    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_roc_auc": roc_auc
    })
    
    print(f"Test Accuracy: {accuracy}")
    print(f"Test ROC AUC: {roc_auc}")
    print(report)

Test Accuracy: 0.8031272210376688
Test ROC AUC: 0.8417815713463423
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1034
           1       0.65      0.55      0.60       373

    accuracy                           0.80      1407
   macro avg       0.75      0.72      0.73      1407
weighted avg       0.80      0.80      0.80      1407



# Tuning XGBoost

In [9]:
xgb = XGBClassifier(eval_metric='logloss')

# Expanded grid with regularization and structural parameters
param_grid_xgb = {
    'n_estimators': [100, 300, 500, 800],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    
    # NEW TUNING PARAMETERS:
    'gamma': [0, 0.1, 0.2, 0.5],          # Minimum loss reduction required to make a further partition
    'min_child_weight': [1, 3, 5],        # Controls overfitting (higher = more conservative)
    'reg_alpha': [0, 0.01, 0.1, 1],       # L1 regularization (Lasso)
    'reg_lambda': [1, 1.5, 2, 5]          # L2 regularization (Ridge)
}

with mlflow.start_run(run_name="XGBoost Classifier Tuned") as run:
    # Increased n_iter to 100 to cover the larger search space
    random_search_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid_xgb,
                                    n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    random_search_xgb.fit(X_train, y_train)
    best_xgb = random_search_xgb.best_estimator_
    
    y_pred_xgb = best_xgb.predict(X_test)
    y_proba_xgb = best_xgb.predict_proba(X_test)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, y_proba_xgb)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

    mlflow.log_metrics({
        "test_accuracy": accuracy_xgb,
        "test_roc_auc": roc_auc_xgb
    })

    print(f"ROC AUC: {roc_auc_xgb}")
    print(f"Accuracy: {accuracy_xgb}")
    print(classification_report(y_test, y_pred_xgb))


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=300, reg_alpha=1, reg_lambda=2, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=300, reg_alpha=1, reg_lambda=2, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=300, reg_alpha=1, reg_lambda=2, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=1.0, gamma=0.5, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100, reg_alpha=0.01, reg_lambda=2, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=1.0, gamma=0.5, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100, reg_alpha=0.01, reg_lambda=2, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.01, max_d