## Import Statements

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

## Load Dataset

In [4]:
def _read_labels(csv_path):
    """Read a label CSV and flatten its values into a 1-D NumPy array."""
    return pd.read_csv(csv_path).values.ravel()


# Feature tables for each split, plus the resampled training features.
X_train = pd.read_csv("../Datasets/X_train.csv")
X_val = pd.read_csv("../Datasets/X_val.csv")
X_test = pd.read_csv("../Datasets/X_test.csv")
X_train_resampled = pd.read_csv("../Datasets/X_train_resampled.csv")

# Matching label vectors, flattened so estimators receive 1-D targets.
y_train = _read_labels("../Datasets/y_train.csv")
y_val = _read_labels("../Datasets/y_val.csv")
y_test = _read_labels("../Datasets/y_test.csv")
y_train_resampled = _read_labels("../Datasets/y_train_resampled.csv")

# Random Forest

## Use Log-Transformed Features

### Use SMOTE Resampled Training Data

#### Baseline

#### Hyperparameter Tuning

### Use original train with class weighting

#### Baseline

In [24]:
# Base estimator handed to the search; the seed is fixed at 42.
rf = RandomForestClassifier(random_state=42)

# Search space: only the forest size and the per-split feature rule vary
# (2 x 2 = 4 candidates); depth and the split/leaf minimums are pinned.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10],
    'min_samples_split': [5],
    'min_samples_leaf': [2],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search over the grid above, ranking candidates by ROC-AUC
# under 3-fold cross-validation and using every available core.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the hyperparameter search on the resampled training set.
# NOTE(review): the enclosing section heading mentions the original training
# data with class weighting, but this cell fits on the resampled data with no
# class weights -- confirm which experiment is intended.
# NOTE(review): if the resampling added synthetic samples before this point,
# the cross-validation folds contain them and the CV score may be optimistic.
grid_search.fit(X_train_resampled, y_train_resampled)

# Keep a handle on the winning estimator for the evaluation cells that follow.
best_rf = grid_search.best_estimator_

# Report the selected configuration and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best CV ROC-AUC:", grid_search.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [22]:
# Score the validation split with the tuned forest.
# Column 1 of predict_proba is the second entry of classes_
# (presumably the positive class -- confirm the label encoding).
y_proba = best_rf.predict_proba(X_val)[:, 1]
y_pred = best_rf.predict(X_val)

# Ranking metrics, computed from the predicted probabilities.
roc_auc = roc_auc_score(y_val, y_proba)
pr_auc = average_precision_score(y_val, y_proba)

# Threshold metrics, macro-averaged so both classes weigh equally.
precision = precision_score(y_val, y_pred, average='macro')
recall = recall_score(y_val, y_pred, average='macro')
f1 = f1_score(y_val, y_pred, average='macro')

# Summary report.
print("\n=== Validation Performance ===")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1-score (macro): {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")


=== Validation Performance ===
Precision (macro): 0.5484
Recall (macro): 0.6878
F1-score (macro): 0.5696
ROC-AUC: 0.8608
PR-AUC: 0.1188


## Random Forest without SMOTE

In [23]:
# Hyperparameters for the forest trained on the original (non-resampled) data.
# NOTE(review): these are hard-coded rather than read from
# grid_search.best_params_ -- presumably copied from an earlier search run;
# confirm they still match the search result.
best_params = {
    'max_depth': 10,
    'max_features': 'log2',
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 200,
    'random_state': 42
}

# Initialize and train the final Random Forest on the original training split.
final_rf = RandomForestClassifier(**best_params)
final_rf.fit(X_train, y_train)  # y_train was already flattened to 1-D when loaded

# Predictions on the validation split.
y_val_pred = final_rf.predict(X_val)
y_val_proba = final_rf.predict_proba(X_val)[:, 1]  # probability of class 1 (fraud)

# Evaluation metrics.
# The recorded run of this cell reported a macro recall of exactly 0.5000 and
# an undefined-metric warning, which is consistent with the model predicting
# no positive samples. zero_division=0 scores the undefined per-class
# precision as 0 -- the same value the default uses -- without the warning.
# NOTE(review): a model that never predicts the positive class is not useful
# at the default threshold; consider class_weight="balanced" or tuning the
# decision threshold on y_val_proba.
precision = precision_score(y_val, y_val_pred, average='macro', zero_division=0)
recall = recall_score(y_val, y_val_pred, average='macro')
f1 = f1_score(y_val, y_val_pred, average='macro')
roc_auc = roc_auc_score(y_val, y_val_proba)
pr_auc = average_precision_score(y_val, y_val_proba)

print("\n=== Validation Performance ===")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1-score (macro): {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")



=== Validation Performance ===
Precision (macro): 0.4933
Recall (macro): 0.5000
F1-score (macro): 0.4966
ROC-AUC: 0.8718
PR-AUC: 0.1472


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Optuna, log vs non-log