In [None]:
import sys
# Display the path of the Python interpreter this kernel is running on
# (a quick check that the expected environment is active).
sys.executable

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer, f1_score, roc_auc_score

# Suppress warnings: the 'ignore' action is installed with no category or
# module restriction, so every warning issued after this cell is hidden.
# NOTE(review): this also hides diagnostics emitted by pandas / scikit-learn
# in later cells - consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the dataset and replace every missing value with 0 in one step.
full_data = pd.read_csv("full_data.csv").fillna(0)

# The "label" column is the target; all remaining columns are the features.
y = full_data["label"]
X = full_data.drop(columns="label")

# Sanity check: feature-matrix and target shapes.
print(X.shape, y.shape)

In [None]:
# Hold out 30% of the rows as the test set. The split is seeded so that
# Restart & Run All always produces the same train/test partition; without a
# seed every run scores the models on a different test set.
# NOTE(review): if the classes are imbalanced, consider also passing
# stratify=y so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

Hyperparameter Set A

In [None]:
# Hyperparameter Set A: tune a random forest with a randomized search and a
# grid search over the same space, then score both on the held-out test set.
# Search space: 4 * 1 * 2 * 1 * 2 = 16 candidate combinations.
params = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['sqrt'],
    'max_depth': [5, 10],
    'criterion': ['gini'],
    "class_weight": (None, "balanced")
}

# Every entry above is a finite sequence, so the search space is the Cartesian
# product of the options. RandomizedSearchCV cannot draw more distinct
# candidates than that: asking for more only produces a warning (hidden by the
# global warnings filter in this notebook) and an exhaustive search. Cap
# n_iter explicitly so the number of evaluated candidates is visible here.
n_candidates = int(np.prod([len(options) for options in params.values()]))

# standard RandomizedSearchCV for comparison
# The estimator is seeded so repeated runs grow identical forests; scoring is
# stated explicitly (accuracy is also the classifier's default score).
clf_random = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                param_distributions=params,
                                n_iter=min(100, n_candidates),
                                scoring='accuracy',
                                cv=StratifiedKFold(n_splits=10),
                                verbose=2, random_state=42, n_jobs=-1)

# modified GridSearchCV from Bertoli et al. 2021
# Uses shuffled, seeded folds; the estimator is seeded for reproducibility.
clf_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                        param_grid=params,
                        scoring='accuracy',
                        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=17))

# training the model (fit returns the fitted search object itself)
train_rand = clf_random.fit(X_train, y_train)
train_grid = clf_grid.fit(X_train, y_train)

# testing the model: accuracy of each refit best estimator on the test set
test_rand = clf_random.score(X_test, y_test)

test_grid = clf_grid.score(X_test, y_test)

In [None]:
# Random search: predict the test set and report its accuracy
y_rand_test = clf_random.predict(X_test)
rand_score = accuracy_score(y_test, y_rand_test)
print(rand_score)

# Grid search: predict the test set and report its accuracy
y_grid_test = clf_grid.predict(X_test)
grid_score = accuracy_score(y_test, y_grid_test)
print(grid_score)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test data:
# first for the random-search predictions, then for the grid-search predictions
for test_predictions in (y_rand_test, y_grid_test):
    print(classification_report(y_test, test_predictions))

Hyperparameter Set B

In [None]:
# Hyperparameter Set B: tune a random forest with a randomized search and a
# grid search over the same space, then score both on the held-out test set.
# Search space: 4 * 1 * 2 * 1 * 2 = 16 candidate combinations.
params = {
    'n_estimators': [15, 25, 50, 100],
    'max_features': ['sqrt'],
    'max_depth': [5, 10],
    'criterion': ['gini'],
    "class_weight": (None, "balanced")
}

# Every entry above is a finite sequence, so the search space is the Cartesian
# product of the options. RandomizedSearchCV cannot draw more distinct
# candidates than that: asking for more only produces a warning (hidden by the
# global warnings filter in this notebook) and an exhaustive search. Cap
# n_iter explicitly so the number of evaluated candidates is visible here.
n_candidates = int(np.prod([len(options) for options in params.values()]))

# standard RandomizedSearchCV for comparison
# The estimator is seeded so repeated runs grow identical forests; scoring is
# stated explicitly (accuracy is also the classifier's default score).
clf_random = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                param_distributions=params,
                                n_iter=min(100, n_candidates),
                                scoring='accuracy',
                                cv=StratifiedKFold(n_splits=10),
                                verbose=2, random_state=42, n_jobs=-1)

# modified GridSearchCV from Bertoli et al. 2021
# Uses shuffled, seeded folds; the estimator is seeded for reproducibility.
clf_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                        param_grid=params,
                        scoring='accuracy',
                        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=17))

# training the model (fit returns the fitted search object itself)
train_rand = clf_random.fit(X_train, y_train)
train_grid = clf_grid.fit(X_train, y_train)

# testing the model: accuracy of each refit best estimator on the test set
test_rand = clf_random.score(X_test, y_test)

test_grid = clf_grid.score(X_test, y_test)

In [None]:
# Predict the held-out rows with the best model found by each search
y_rand_test, y_grid_test = clf_random.predict(X_test), clf_grid.predict(X_test)

# Accuracy of the random-search model on the test set
rand_score = accuracy_score(y_true=y_test, y_pred=y_rand_test)
print(rand_score)

# Accuracy of the grid-search model on the test set
grid_score = accuracy_score(y_true=y_test, y_pred=y_grid_test)
print(grid_score)

In [None]:
from sklearn.metrics import classification_report

# Classification report (test data vs. predictions) for the random search
rand_report = classification_report(y_test, y_rand_test)
print(rand_report)

# Classification report (test data vs. predictions) for the grid search
grid_report = classification_report(y_test, y_grid_test)
print(grid_report)

Hyperparameter Set C

In [None]:
# Hyperparameter Set C: tune a random forest with a randomized search and a
# grid search over the same space, then score both on the held-out test set.
# Search space: 2 * 1 * 5 * 1 * 2 * 1 = 20 candidate combinations.
params = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini'],
    "class_weight": (None, "balanced"),
    'min_samples_leaf': [14]
}

# Every entry above is a finite sequence, so the search space is the Cartesian
# product of the options. RandomizedSearchCV cannot draw more distinct
# candidates than that: asking for more only produces a warning (hidden by the
# global warnings filter in this notebook) and an exhaustive search. Cap
# n_iter explicitly so the number of evaluated candidates is visible here.
n_candidates = int(np.prod([len(options) for options in params.values()]))

# standard RandomizedSearchCV for comparison
# The estimator is seeded so repeated runs grow identical forests; scoring is
# stated explicitly (accuracy is also the classifier's default score).
clf_random = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                param_distributions=params,
                                n_iter=min(100, n_candidates),
                                scoring='accuracy',
                                cv=StratifiedKFold(n_splits=10),
                                verbose=2, random_state=42, n_jobs=-1)

# modified GridSearchCV from Bertoli et al. 2021
# Uses shuffled, seeded folds; the estimator is seeded for reproducibility.
clf_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                        param_grid=params,
                        scoring='accuracy',
                        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=17))

# training the model (fit returns the fitted search object itself)
train_rand = clf_random.fit(X_train, y_train)
train_grid = clf_grid.fit(X_train, y_train)

# testing the model: accuracy of each refit best estimator on the test set
test_rand = clf_random.score(X_test, y_test)

test_grid = clf_grid.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report

# The reports must describe the models fitted for this hyperparameter set.
# This cell runs before the cell that refreshes the prediction arrays, so
# predict with the current searches here instead of reusing whatever
# y_rand_test / y_grid_test an earlier cell left in the kernel.
y_rand_test = clf_random.predict(X_test)
y_grid_test = clf_grid.predict(X_test)

# View the classification report for test data and predictions for random search
print(classification_report(y_test, y_rand_test))

# View the classification report for test data and predictions for grid search
print(classification_report(y_test, y_grid_test))

In [None]:
# Random search: predict the test set and report its accuracy
y_rand_test = clf_random.predict(X_test)
rand_score = accuracy_score(y_test, y_rand_test)
print(rand_score)

# Grid search: predict the test set and report its accuracy
y_grid_test = clf_grid.predict(X_test)
grid_score = accuracy_score(y_test, y_grid_test)
print(grid_score)

Hyperparameter Set D

In [None]:
# Hyperparameter Set D: tune a random forest with a randomized search and a
# grid search over the same space, then score both on the held-out test set.
# Search space: 2 * 1 * 5 * 1 * 2 * 1 = 20 candidate combinations.
params = {
    'n_estimators': [200, 500],
    'max_features': ['log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini'],
    "class_weight": (None, "balanced"),
    "min_samples_leaf": [14]
}

# Every entry above is a finite sequence, so the search space is the Cartesian
# product of the options. RandomizedSearchCV cannot draw more distinct
# candidates than that: asking for more only produces a warning (hidden by the
# global warnings filter in this notebook) and an exhaustive search. Cap
# n_iter explicitly so the number of evaluated candidates is visible here.
n_candidates = int(np.prod([len(options) for options in params.values()]))

# standard RandomizedSearchCV for comparison
# The estimator is seeded so repeated runs grow identical forests; scoring is
# stated explicitly (accuracy is also the classifier's default score).
clf_random = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                param_distributions=params,
                                n_iter=min(100, n_candidates),
                                scoring='accuracy',
                                cv=StratifiedKFold(n_splits=10),
                                verbose=2, random_state=42, n_jobs=-1)

# modified GridSearchCV from Bertoli et al. 2021
# Uses shuffled, seeded folds; the estimator is seeded for reproducibility.
clf_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                        param_grid=params,
                        scoring='accuracy',
                        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=17))

# training the model (fit returns the fitted search object itself)
train_rand = clf_random.fit(X_train, y_train)
train_grid = clf_grid.fit(X_train, y_train)

# testing the model: accuracy of each refit best estimator on the test set
test_rand = clf_random.score(X_test, y_test)

test_grid = clf_grid.score(X_test, y_test)

In [None]:
# Predict the held-out rows with the best model found by each search
y_rand_test, y_grid_test = clf_random.predict(X_test), clf_grid.predict(X_test)

# Accuracy of the random-search model on the test set
rand_score = accuracy_score(y_true=y_test, y_pred=y_rand_test)
print(rand_score)

# Accuracy of the grid-search model on the test set
grid_score = accuracy_score(y_true=y_test, y_pred=y_grid_test)
print(grid_score)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test data:
# first for the random-search predictions, then for the grid-search predictions
for test_predictions in (y_rand_test, y_grid_test):
    print(classification_report(y_test, test_predictions))