### Import necessary libraries

In [1]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import optuna
from sklearn.model_selection import train_test_split
import random
import sklearn.datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import plot_importance
import sklearn.datasets
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.preprocessing import StandardScaler

In [2]:
# load training data
Xtr_loadpath = 'Xtr.csv'
Xts_loadpath = 'Xts.csv'
ytr_loadpath = 'ytr.csv'

Xtr = np.loadtxt(Xtr_loadpath, delimiter=",")
Xts = np.loadtxt(Xts_loadpath, delimiter=",")
ytr = np.loadtxt(ytr_loadpath, delimiter=",")

In [3]:
# No standardization performed

Xtr_standardized = Xtr
Xts_standardized = Xts
ytr_standardized = ytr

# Save the standardized training data
Xtr_savepath = 'Xtr_xgboost.csv'
Xts_savepath = 'Xts_xgboost.csv'
ytr_savepath = 'ytr_xgboost.csv'
yts_hat_savepath = 'yts_hat_RandomForestOptuna.csv'

np.savetxt(Xtr_savepath, Xtr_standardized, delimiter=",")
np.savetxt(Xts_savepath, Xts_standardized, delimiter=",")
np.savetxt(ytr_savepath, ytr_standardized, delimiter=",")

# Perform 80:20 train:test split
X_train, X_test, y_train, y_test = train_test_split(Xtr_standardized, ytr_standardized, test_size = 0.2, random_state = random.randint(0,1000))



In [4]:
# Train a basic Random Forest model with no hyperparameter tuning

rf = sklearn.ensemble.RandomForestClassifier()
rf.fit(X_train, y_train)

# Generate train and test AUC scores
aucTrain = roc_auc_score(y_train,rf.predict(X_train))
print('Training AUC: ',aucTrain)

aucTest = roc_auc_score(y_test,rf.predict(X_test))
print('Test AUC: ',aucTest)


Training AUC:  1.0
Test AUC:  0.8313244469654


### Compare feature importance

In [5]:
print(rf.feature_importances_)

[0.06006881 0.05543014 0.17423612 0.11587187 0.05812095 0.153847
 0.11871743 0.26370767]


### Apply Optuna hyperparameter optimization technique

In [17]:
def objective(trial):
    
    # load training data
    Xtr_loadpath = 'Xtr.csv'
    Xts_loadpath = 'Xts.csv'
    ytr_loadpath = 'ytr.csv'
    Xtr = np.loadtxt(Xtr_loadpath, delimiter=",")
    Xts = np.loadtxt(Xts_loadpath, delimiter=",")
    ytr = np.loadtxt(ytr_loadpath, delimiter=",")
    
    x_train,x_test,y_train,y_test = train_test_split(Xtr,ytr,test_size=0.2,random_state = random.randint(0,1000))
    
    
    
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    max_depth = trial.suggest_int("max_depth", 10, 40, log = True)
    n_estimators = trial.suggest_int("n_estimators", 300, 500)
    min_samples_split = trial.suggest_int("min_samples_split", 0, 100)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 0, 100)
    
    rf = sklearn.ensemble.RandomForestClassifier(criterion = criterion,
                                                max_depth = max_depth,
                                                n_estimators = n_estimators)
    score = cross_val_score(rf, x_test, y_test, n_jobs = -1, cv = 3)
    accuracy = score.mean()
    return accuracy

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 100)

[32m[I 2022-12-08 18:31:43,059][0m A new study created in memory with name: no-name-72b75720-508f-4721-9896-dfd3b82533fa[0m
[32m[I 2022-12-08 18:31:45,197][0m Trial 0 finished with value: 0.8525024274649463 and parameters: {'criterion': 'entropy', 'max_depth': 19, 'n_estimators': 357, 'min_samples_split': 44, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.8525024274649463.[0m
[32m[I 2022-12-08 18:31:46,174][0m Trial 1 finished with value: 0.8420009214611913 and parameters: {'criterion': 'gini', 'max_depth': 34, 'n_estimators': 350, 'min_samples_split': 89, 'min_samples_leaf': 46}. Best is trial 0 with value: 0.8525024274649463.[0m
[32m[I 2022-12-08 18:31:47,515][0m Trial 2 finished with value: 0.8554971763367566 and parameters: {'criterion': 'entropy', 'max_depth': 36, 'n_estimators': 451, 'min_samples_split': 4, 'min_samples_leaf': 63}. Best is trial 2 with value: 0.8554971763367566.[0m
[32m[I 2022-12-08 18:31:48,229][0m Trial 3 finished with value: 0.833002417710

[32m[I 2022-12-08 18:32:13,767][0m Trial 31 finished with value: 0.8564936750843798 and parameters: {'criterion': 'gini', 'max_depth': 29, 'n_estimators': 338, 'min_samples_split': 30, 'min_samples_leaf': 29}. Best is trial 30 with value: 0.8629986808397603.[0m
[32m[I 2022-12-08 18:32:14,507][0m Trial 32 finished with value: 0.8299949124536831 and parameters: {'criterion': 'gini', 'max_depth': 24, 'n_estimators': 315, 'min_samples_split': 40, 'min_samples_leaf': 1}. Best is trial 30 with value: 0.8629986808397603.[0m
[32m[I 2022-12-08 18:32:15,280][0m Trial 33 finished with value: 0.8419994207100654 and parameters: {'criterion': 'gini', 'max_depth': 30, 'n_estimators': 345, 'min_samples_split': 30, 'min_samples_leaf': 47}. Best is trial 30 with value: 0.8629986808397603.[0m
[32m[I 2022-12-08 18:32:16,071][0m Trial 34 finished with value: 0.8449979214596905 and parameters: {'criterion': 'gini', 'max_depth': 15, 'n_estimators': 366, 'min_samples_split': 52, 'min_samples_leaf':

[32m[I 2022-12-08 18:32:40,024][0m Trial 62 finished with value: 0.8410006708357533 and parameters: {'criterion': 'gini', 'max_depth': 26, 'n_estimators': 344, 'min_samples_split': 31, 'min_samples_leaf': 25}. Best is trial 30 with value: 0.8629986808397603.[0m
[32m[I 2022-12-08 18:32:40,808][0m Trial 63 finished with value: 0.8545014279646964 and parameters: {'criterion': 'gini', 'max_depth': 24, 'n_estimators': 357, 'min_samples_split': 27, 'min_samples_leaf': 11}. Best is trial 30 with value: 0.8629986808397603.[0m
[32m[I 2022-12-08 18:32:41,521][0m Trial 64 finished with value: 0.848989919454687 and parameters: {'criterion': 'gini', 'max_depth': 31, 'n_estimators': 308, 'min_samples_split': 47, 'min_samples_leaf': 5}. Best is trial 30 with value: 0.8629986808397603.[0m
[32m[I 2022-12-08 18:32:42,247][0m Trial 65 finished with value: 0.857999678839259 and parameters: {'criterion': 'gini', 'max_depth': 15, 'n_estimators': 329, 'min_samples_split': 19, 'min_samples_leaf': 4

[32m[I 2022-12-08 18:33:04,140][0m Trial 93 finished with value: 0.8349999174586881 and parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 339, 'min_samples_split': 21, 'min_samples_leaf': 21}. Best is trial 78 with value: 0.8675024349687019.[0m
[32m[I 2022-12-08 18:33:04,913][0m Trial 94 finished with value: 0.8440029234631933 and parameters: {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 348, 'min_samples_split': 49, 'min_samples_leaf': 55}. Best is trial 78 with value: 0.8675024349687019.[0m
[32m[I 2022-12-08 18:33:05,612][0m Trial 95 finished with value: 0.8544976760868815 and parameters: {'criterion': 'gini', 'max_depth': 19, 'n_estimators': 312, 'min_samples_split': 29, 'min_samples_leaf': 41}. Best is trial 78 with value: 0.8675024349687019.[0m
[32m[I 2022-12-08 18:33:06,740][0m Trial 96 finished with value: 0.8369929149539345 and parameters: {'criterion': 'entropy', 'max_depth': 23, 'n_estimators': 432, 'min_samples_split': 32, 'min_samples_le

In [18]:
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

Accuracy: 0.8675024349687019
Best hyperparameters: {'criterion': 'gini', 'max_depth': 14, 'n_estimators': 424, 'min_samples_split': 27, 'min_samples_leaf': 27}


In [19]:
optuna.visualization.plot_optimization_history(study)

In [20]:
optuna.visualization.plot_slice(study)

In [21]:
optuna.visualization.plot_param_importances(study)

In [22]:
rf = sklearn.ensemble.RandomForestClassifier(**study.best_params)
rf.fit(X_train, y_train)

scoreTrain = rf.score(X_train,y_train)
print(f"Train Accuracy: {scoreTrain}")

scoreTest = rf.score(X_test,y_test)
print(f"Test Accuracy: {scoreTest}")


Train Accuracy: 0.87425
Test Accuracy: 0.8515


In [27]:
print("AUC on train data: ", roc_auc_score(y_train,rf.predict(X_train)))
print("AUC on test data: ", roc_auc_score(y_test,rf.predict(X_test)))

AUC on train data:  0.8395548931594023
AUC on test data:  0.8097348269994328
