In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-aggregated train/test splits. Column 0 of each CSV is the saved
# row index (pandas auto-names it 'Unnamed: 0'), so it is read back as the
# index by position instead of relying on that generated name.
X_train = pd.read_csv('agg_Xtrain.csv', index_col=0)
X_test = pd.read_csv('agg_Xtest.csv', index_col=0)

# scikit-learn expects a 1-D target. A single-column frame is squeezed into a
# Series; a frame with several columns is returned unchanged by squeeze().
y_train = pd.read_csv('agg_ytrain.csv', index_col=0).squeeze('columns')
y_test = pd.read_csv('agg_ytest.csv', index_col=0).squeeze('columns')

In [3]:
#hyperparameter tuning

# Fixed seed so fold assignment and the forests are reproducible across runs.
SEED = 42

# Shuffle before splitting so the folds do not depend on the row order of the CSV.
Kcv = KFold(n_splits=10, shuffle=True, random_state=SEED)
parameters = {'n_estimators':[5,20,50,100,150],
              'criterion':['gini','entropy'],
              # Currently disabled (presumably to keep the grid small);
              # uncomment to widen the search.
              #'max_depth':[10*i for i in range(1,13)],
              #'min_samples_split' :[2,6,10],
              #'min_samples_leaf' :[1,2,4],
              'bootstrap' : [True, False]
}

model = RandomForestClassifier(random_state=SEED)
# Select on weighted F1, the same metric that is reported on the test split,
# and evaluate the parameter combinations in parallel on all cores.
cv = GridSearchCV(model, param_grid=parameters, cv=Kcv,
                  scoring='f1_weighted', n_jobs=-1)

In [None]:
%%time
#best params found by grid search
cv.fit(X_train,y_train)
cv.best_params_

In [None]:
%%time
model.set_params(**cv.best_params_)
model.fit(X_train,y_train)
pred = model.predict(X_test)
f1 = f1_score(y_test, pred, average='weighted') # which average to use??
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')

print(f"precision: {precision}, recall: {recall}, f1score: {f1}")

In [None]:
#plot result

# Raw counts, and the same matrix normalized over the true labels (rows).
matrix = confusion_matrix(y_test, pred)
matrix_normalized = confusion_matrix(y_test, pred, normalize='true')

# The raw matrix holds integer counts, so annotate with 'd' instead of a float
# format that would render them as e.g. "12.00". Each heatmap is drawn on the
# axes created for it rather than on whatever the current axes happen to be.
fig1, ax = plt.subplots(figsize=(10,10))
sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
ax.set(title='Confusion matrix without normalization',
       ylabel='Actual', xlabel='Predicted')

fig2, ax = plt.subplots(figsize=(10,10))
sns.heatmap(matrix_normalized, annot=True, fmt='.2f', ax=ax)
ax.set(title='Confusion matrix with normalization',
       ylabel='Actual', xlabel='Predicted')
plt.show()



In [None]:
#feature importance
# Importances reported by the fitted forest, one value per input column.
importance = model.feature_importances_

# Explicit axes plus a title and axis labels so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10,10))
ax.bar(X_test.columns, importance)
ax.set(title='Random forest feature importance',
       xlabel='Feature', ylabel='Importance')
# Rotate the column names so long labels do not overlap.
plt.xticks(rotation=40, ha = 'right')
plt.show()

In [None]:
# Per-class precision/recall/F1 on both splits; comparing the two reports shows
# how far training performance is from held-out performance.
for split_name, features, target in (
    ('training data', X_train, y_train),
    ('test data', X_test, y_test),
):
    print(split_name)
    print(classification_report(target, model.predict(features)))