In [None]:
import csv
import pickle
import time

import imblearn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

sns.set(style='ticks')
sns.set_palette("tab10")
sns.set_context('paper')

print(sklearn.__version__)

In [None]:
# File name structure: p{post-publication year}l{label year}f.csv
# Available files: "p0l5f.csv", "p1l5f.csv", "p2l5f.csv", "p3l5f.csv", "p4l5f.csv",
#                  "p5l5f.csv", "p1l1f.csv", "p2l2f.csv", "p3l3f.csv", "p4l4f.csv"

post_year = 0
label_year = 5
df = pd.read_csv(f"p{post_year}l{label_year}f.csv")

# Work on a fixed-seed subsample of at most 10000 rows. The cap is clamped to
# the number of rows available because DataFrame.sample raises ValueError when
# asked for more rows than the frame holds (sampling is without replacement).
sampledf = df.sample(n=min(10000, len(df)), random_state=2022)

#### Separating the Target Variable and Applying SMOTE

In [None]:
# Select the target by name, the same way it is removed from the features
# below, so the label cannot stay inside X if the CSV column order changes.
y = sampledf["high_impact"]

# Every remaining column is used as a feature.
X = sampledf.drop(columns=["high_impact"])

In [None]:
# Feature column names, in the order they appear in X.
cols = [*X.columns]

In [None]:
seed = 2022

# Oversample with SMOTE using a fixed seed so the synthetic rows are repeatable.
# NOTE(review): the resampling is fitted on every sampled row, so any hold-out
# set carved out of X_resampled / y_resampled will contain synthetic points and
# neighbours of training rows -- confirm this is intended before evaluating on it.
smote = SMOTE(random_state=seed)
X_resampled, y_resampled = smote.fit_resample(X, y)

#### Train test split

In [None]:
seed = 2022

# Hold out the test rows from the original (un-resampled) sample first, so the
# test set contains only real observations. Stratifying keeps the class ratio
# of the hold-out set equal to that of the sample.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed, stratify=y
)

# Oversample the training portion only. Running SMOTE before the split would
# let synthetic points interpolated from test rows leak into the training data
# and would put synthetic rows into the test set.
X_train, y_train = SMOTE(random_state=seed).fit_resample(X_train_raw, y_train_raw)

#### Evaluation Metrics

In [None]:
def calcMetrics(actual, predicted, threshold):
    """Compute confusion counts, precision, recall and F1 at a score threshold.

    A sample is predicted positive when its score is not below ``threshold``.
    Inputs are paired by position, so pandas Series with a shuffled or
    non-zero-based index are handled the same way as plain lists.

    Parameters
    ----------
    actual : sequence
        True labels; ``1`` is the positive class and ``0`` the negative class.
    predicted : sequence
        Predicted scores, compared against ``threshold``.
    threshold : number
        Scores ``>= threshold`` count as positive predictions.

    Returns
    -------
    tuple
        ``(tp, fp, fn, tn, precision, recall, f1)``. A ratio whose denominator
        is zero (for example precision when nothing is predicted positive) is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted`` have different lengths.
    """
    if len(actual) != len(predicted):
        raise ValueError(
            f"actual and predicted must have the same length "
            f"(got {len(actual)} and {len(predicted)})"
        )

    tp = tn = fp = fn = 0
    # zip pairs values by position; indexing a pandas Series with x[i] would be
    # a label lookup instead and can return the wrong row or raise KeyError.
    for truth, score in zip(actual, predicted):
        if score < threshold:
            if truth == 0:
                tn += 1
            else:
                fn += 1
        elif truth == 1:
            tp += 1
        else:
            fp += 1

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return tp, fp, fn, tn, precision, recall, f1

In [None]:
def getAccuracy(actual, predicted):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Inputs are paired by position, so pandas Series with a shuffled or
    non-zero-based index are handled the same way as plain lists.

    Parameters
    ----------
    actual : sequence
        True labels.
    predicted : sequence
        Predicted labels, same length as ``actual``.

    Returns
    -------
    float
        Number of matching positions divided by the number of samples;
        ``0.0`` when both inputs are empty.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted`` have different lengths.
    """
    total = len(actual)
    if total != len(predicted):
        raise ValueError(
            f"actual and predicted must have the same length "
            f"(got {total} and {len(predicted)})"
        )
    if total == 0:
        # No samples: report 0.0 rather than dividing by zero.
        return 0.0

    # zip pairs values by position; x[i] on a pandas Series is a label lookup.
    numCorrect = sum(1 for truth, guess in zip(actual, predicted) if guess == truth)
    return numCorrect / total

In addition to these hand-written metrics, we can use scikit-learn's `balanced_accuracy_score` and `roc_auc_score`, both imported above.

In [None]:
def runForest(decrease, depth=None, lossfn="gini", n_splits=5):
    """Return the mean K-fold validation accuracy of a random forest.

    The folds are taken from the notebook-level ``X_train`` / ``y_train``
    (expected to be a pandas DataFrame and Series, since both are sliced
    with ``.iloc``).

    Parameters
    ----------
    decrease : float
        Passed to ``RandomForestClassifier`` as ``min_impurity_decrease``.
    depth : int or None, default None
        Passed as ``max_depth``. It is an explicit argument because no ``depth``
        variable is defined earlier in the notebook.
    lossfn : str, default "gini"
        Passed as ``criterion``. It is an explicit argument for the same reason.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    float
        Accuracy (as computed by ``getAccuracy``) averaged over the folds.
    """
    kf = KFold(n_splits=n_splits)
    fold_accuracies = []

    for train_index, val_index in kf.split(X_train):
        X_train_only, X_val = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        # KFold yields row positions, not index labels, so the labels must be
        # sliced with .iloc exactly like the features. Plain y_train[...] is a
        # label lookup and misaligns (or raises KeyError) on a shuffled index.
        y_train_only, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model = RandomForestClassifier(
            min_impurity_decrease=decrease,
            max_depth=depth,
            criterion=lossfn,
        )
        model.fit(X_train_only, y_train_only)
        pred_values = model.predict(X_val)

        # getAccuracy expects (actual, predicted).
        fold_accuracies.append(getAccuracy(y_val.tolist(), pred_values.tolist()))

    return sum(fold_accuracies) / len(fold_accuracies)

In [None]:
seed = 2022

# Seed the forest so the grid search gives the same ranking on every run.
model = RandomForestClassifier(random_state=seed)

# Each max_features value appears once. The earlier list repeated 0.01 and
# 0.05 (also written as .01 and .05), which refit identical candidates:
# 108 candidates instead of 84, i.e. 240 redundant fits at cv=10.
params = [{'max_features': [0.01, "log2", 0.05, "sqrt", 0.075, 0.02, 0.03],
           'min_samples_leaf': [1, 3, 5, 10, 30, 50],
           'criterion': ["gini", "entropy"]}]

gs_forest = GridSearchCV(model,
                         param_grid=params,
                         scoring='accuracy',
                         cv=10,
                         verbose=2)

# Time the full search (all candidates x folds, plus the final refit).
t0 = time.time()

gs_forest.fit(X_train, y_train)

t1 = time.time()

# Elapsed wall-clock seconds, shown as the cell output.
t1 - t0

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
best_params = gs_forest.best_params_
best_params

In [None]:
# Score of the fitted search object on the same rows it was fitted on
# (X_train / y_train), i.e. a training-set score, not a hold-out estimate.
train_score = gs_forest.score(X_train, y_train)
train_score

In [None]:
# Tabulate the cross-validation results kept by the search object.
cv_results = gs_forest.cv_results_
results = pd.DataFrame(data=cv_results)
results