In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

## Preparing Dataframe

In [2]:
# Load each participant's recording from its own CSV file
rec1, rec2, rec3, rec4 = [
    pd.read_csv(path)
    for path in ('rec1.csv', 'rec2.csv', 'rec3.csv', 'rec4.csv')
]
print(*(rec.shape for rec in (rec1, rec2, rec3, rec4)))

# Stack the four recordings row-wise under a fresh 0..n-1 index
df = pd.concat((rec1, rec2, rec3, rec4), ignore_index=True)
print(df.shape)

# Remove the redundant index column carried over from the CSV files
df = df.drop(columns='Unnamed: 0')

# Distinct values of the target column
targets = df['Status'].unique()
print('Number of Statuses: ', len(targets), sep='')

# Predictors are every column except the last; the last column is the target.
# NOTE(review): presumably the last column is 'Status' -- confirm against the CSV layout.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

(426, 66) (1477, 66) (310, 66) (310, 66)
(2523, 66)
Number of Statuses: 27


## Random Forests

In [3]:
# Base Random Forest classifier; the fixed seed makes each forest's
# internal randomness repeatable
clf_rf = RandomForestClassifier(random_state=0)

# Candidate hyperparameter values the randomized search samples from
params = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# Randomized search with 5-fold cross-validation over the grid above.
# - `iid` is intentionally not passed: the parameter was deprecated in
#   scikit-learn 0.22 and removed in 0.24, so supplying it raises TypeError
#   on current releases. From 0.22 onward the CV score is the plain mean
#   across folds, which is what `iid=False` used to request.
# - `random_state` seeds the candidate sampling so the selected
#   hyperparameters are reproducible on a fresh kernel.
rand_rf = RandomizedSearchCV(
    clf_rf,
    params,
    n_jobs=-1,
    cv=5,
    return_train_score=True,
    random_state=0,
)

# Fit the search on the training split only; the test split stays unseen
result_rf = rand_rf.fit(X_train, y_train)

In [4]:
# Hyperparameter combination selected by the randomized search
best_params = result_rf.best_params_
print(best_params)

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 90}


In [7]:
# Score the fitted search object on the training and the held-out split
train_score, test_score = (
    result_rf.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print('Train Score: ', train_score, sep='')
print('Test Score: ', test_score, sep='')

Train Score: 1.0
Test Score: 0.9782178217821782
