In [7]:
import sys
sys.path.append('..')

import os
import warnings

import joblib
import pandas as pd
import numpy as np
import spacy
from tqdm.auto import tqdm as tqdm_func
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from utils.data_utils import load_data, process_data_for_rf
from utils.evaluation import evaluate_model

from IPython.display import display, Markdown

In [8]:
# Silence every warning for the rest of the kernel session
warnings.simplefilter('ignore')

# spaCy English pipeline for NLP tasks.
# If the model is not installed yet, download it first with:
#!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [9]:
def train_pipeline(X_train, y_train, cv_folds=5, param_grid=None):
    """
    Tune a RandomForestClassifier with an exhaustive grid search and return the winner.

    Every hyperparameter combination is scored by accuracy under shuffled
    k-fold cross-validation (fixed seed 42). Because ``refit=True``, the best
    combination is refitted on the data passed in before being returned.

    Parameters:
    - X_train: Training features (DataFrame or array)
    - y_train: Training labels (Series or array)
    - cv_folds: Number of cross-validation folds.
    - param_grid: Dictionary of hyperparameters to search over. When None,
      a built-in grid over n_estimators, max_depth, min_samples_split and
      min_samples_leaf is used.

    Returns:
    - best_model: The best RandomForest estimator found by the search.
    - grid_search: The fitted GridSearchCV object (for further inspection if needed).
    """
    # Grid used when the caller does not supply one
    fallback_grid = {
        "n_estimators": [50, 100, 150],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }
    search_space = fallback_grid if param_grid is None else param_grid

    # Shuffled folds with a fixed seed so repeated runs split identically
    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        scoring='accuracy',
        cv=splitter,
        refit=True,  # refit the winning configuration on all of X_train
        n_jobs=-1,
        verbose=2
    )
    search.fit(X_train, y_train)

    accuracy_message = "Best cross-validation accuracy: {:.4f}"
    print("Best parameters found:", search.best_params_)
    print(accuracy_message.format(search.best_score_))

    return search.best_estimator_, search

### Train the model using the pipeline with k-fold validation and hyperparameter tuning

In [10]:
# Read a 1,000-row sample of the training file for the hyperparameter search
df = load_data('../data/train.csv', num_rows=1000)

# Pull features, labels and the preprocessing objects out of the returned dict
rf_obj = process_data_for_rf(df)
X, y = rf_obj['X'], rf_obj['y']
scaler, encoder = rf_obj['scaler'], rf_obj['encoder']

# Hold out 10% of the sample; run the 5-fold grid search on the remaining 90%
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
best_model, grid_search = train_pipeline(X_train, y_train, cv_folds=5)

# Score the tuned model on the held-out split
y_predictions = best_model.predict(X_test)
y_probabilities = best_model.predict_proba(X_test)[:, 1]  # second predict_proba column

rf_metrics, rf_markdown = evaluate_model(
    y_test, y_predictions, y_probabilities, model_name="Random Forest"
)

display(Markdown(rf_markdown))

Using existing complexity features from DataFrame...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Best cross-validation accuracy: 0.6178


# Random Forest Evaluation Results

## Metrics

| Metric                    | Score    |
|---------------------------|----------|
| Validation Accuracy       | 0.6200 |
| Validation F1 Score (weighted) | 0.6083 |
| Validation Precision (weighted) | 0.6045 |
| Validation Recall (class 1) | 0.7619 |
| Binary Precision (class 1) | 0.6761 |
| Binary F1 Score (class 1) | 0.7164 |
| ROC-AUC Score             | 0.5337 |

## Detailed Classification Report

```
              precision    recall  f1-score   support

           0       0.48      0.38      0.42        37
           1       0.68      0.76      0.72        63

    accuracy                           0.62       100
   macro avg       0.58      0.57      0.57       100
weighted avg       0.60      0.62      0.61       100

```


### Train the final model with the best configuration found by hyperparameter tuning

In [12]:
# Load all data
df_all = load_data('../data/train.csv')

# Process and split data
rf_obj = process_data_for_rf(df_all)
X = rf_obj['X']
y = rf_obj['y']
scaler = rf_obj['scaler']
# Refresh the encoder as well: otherwise the `encoder` variable still holds the
# object produced from the 1,000-row tuning sample, and that stale object is
# what would be saved next to the full-data scaler and model.
encoder = rf_obj['encoder']
feature_names = rf_obj['feature_names']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit a fresh estimator that carries the tuned hyperparameters on the 90%
# training split. Building a new object (instead of calling best_model.fit)
# leaves best_model / grid_search.best_estimator_ untouched, so the tuning
# cell's results stay reproducible and this cell can be re-run safely.
rf_model = RandomForestClassifier(**best_model.get_params()).fit(X_train, y_train)

# Evaluate on the 10% held-out split
y_predictions = rf_model.predict(X_test)
y_probabilities = rf_model.predict_proba(X_test)[:, 1]

rf_metrics, rf_markdown = evaluate_model(
    y_test,
    y_predictions,
    y_probabilities,
    model_name="Random Forest"
)

display(Markdown(rf_markdown))

print(f"Completed training on {len(X_train)} rows")

Using existing complexity features from DataFrame...


# Random Forest Evaluation Results

## Metrics

| Metric                    | Score    |
|---------------------------|----------|
| Validation Accuracy       | 0.6628 |
| Validation F1 Score (weighted) | 0.6375 |
| Validation Precision (weighted) | 0.6541 |
| Validation Recall (class 1) | 0.8601 |
| Binary Precision (class 1) | 0.6745 |
| Binary F1 Score (class 1) | 0.7561 |
| ROC-AUC Score             | 0.6924 |

## Detailed Classification Report

```
              precision    recall  f1-score   support

           0       0.62      0.36      0.45     16100
           1       0.67      0.86      0.76     24933

    accuracy                           0.66     41033
   macro avg       0.65      0.61      0.60     41033
weighted avg       0.65      0.66      0.64     41033

```


Completed training on 369293 rows


In [13]:
# --- Persist the final model together with its data splits and preprocessors ---
os.makedirs("models", exist_ok=True)

# (object, destination) pairs, written in this order
artifacts = [
    (rf_model, "models/random_forest_model.pkl"),
    (X_train, "models/rf_X_train.pkl"),
    (y_train, "models/rf_y_train.pkl"),
    (X_test, "models/rf_X_test.pkl"),
    (y_test, "models/rf_y_test.pkl"),
    (scaler, "models/rf_scaler.pkl"),
    (encoder, "models/rf_encoder.pkl"),
    (feature_names, "models/rf_feature_names.pkl"),
]
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("Model and preprocessors saved.")

print("\nFinal model saved to models/random_forest_model.pkl")


Model and preprocessors saved.

Final model saved to models/random_forest_model.pkl


In [14]:
# Evaluate the final model on the separate validation file
val_df = pd.read_csv('../data/validation.csv')

# Build Random Forest inputs from the validation frame.
# NOTE(review): this calls process_data_for_rf again rather than reusing the
# saved scaler/encoder; confirm the helper does not refit the preprocessors
# on the validation data.
rf_obj = process_data_for_rf(val_df)
rf_X, rf_y = rf_obj['X'], rf_obj['y']

# Hard predictions plus the second predict_proba column
rf_y_predictions = rf_model.predict(rf_X)
rf_y_probabilities = rf_model.predict_proba(rf_X)[:, 1]

rf_metrics, rf_markdown = evaluate_model(
    rf_y, rf_y_predictions, y_prob=rf_y_probabilities, model_name="Random Forest"
)

display(Markdown(rf_markdown))


Using existing complexity features from DataFrame...


# Random Forest Evaluation Results

## Metrics

| Metric                    | Score    |
|---------------------------|----------|
| Validation Accuracy       | 0.6596 |
| Validation F1 Score (weighted) | 0.6335 |
| Validation Precision (weighted) | 0.6528 |
| Validation Recall (class 1) | 0.8619 |
| Binary Precision (class 1) | 0.6683 |
| Binary F1 Score (class 1) | 0.7529 |
| ROC-AUC Score             | 0.6863 |

## Detailed Classification Report

```
              precision    recall  f1-score   support

           0       0.63      0.35      0.45      3984
           1       0.67      0.86      0.75      6016

    accuracy                           0.66     10000
   macro avg       0.65      0.61      0.60     10000
weighted avg       0.65      0.66      0.63     10000

```
