In [1]:
import numpy as np
import pandas as pd

# Preparing Dataframe

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
rec1 = pd.read_csv('rec1.csv')
rec2 = pd.read_csv('rec2.csv')
rec3 = pd.read_csv('rec3.csv')
rec4 = pd.read_csv('rec4.csv')
rec5 = pd.read_csv('rec5.csv')
rec8 = pd.read_csv('rec8.csv')
rec9 = pd.read_csv('rec9.csv')
print(rec1.shape, rec2.shape, rec3.shape, rec4.shape, rec5.shape, rec8.shape, rec9.shape)

# Concatenate the 4 participants into one dataframe
df = pd.concat([rec1, rec2, rec3, rec4, rec5, rec8, rec9], ignore_index=True)
print(df.shape)

# Drop redundant indexing column
df = df.drop('Unnamed: 0', axis=1)

# Number of target statuses
targets = df['Status'].unique()
print('Number of Statuses: ' + str(len(targets)))

# Separate dataframe into predictors and targets
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

# Split predictors and targets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

(426, 66) (1477, 66) (310, 66) (310, 66) (253, 66) (253, 66) (253, 66)
(3282, 66)
Number of Statuses: 27


# Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
# Logistic Regression Classifier
clf_log = LogisticRegression(random_state=0, solver='saga', multi_class='multinomial', max_iter=1000)

# Parameters to tune
params = {
    'C': [10e-5, 10e-4, 10e-3, 10e-2, 10e-1]
}

# Grid Search for best parameter value
grid_log = GridSearchCV(clf_log, params, cv=5, n_jobs=-1, return_train_score=True, iid=False)

# Train Logistic Regression
result_log = grid_log.fit(X_train, y_train)



In [6]:
# Obtain accuracy for training and testing
train_acc_log = result_log.score(X_train, y_train)
test_acc_log = result_log.score(X_test, y_test)

print('Best parameter value is: ' + str(result_log.best_params_))
print('Training accuracy: ' + str(train_acc_log))
print('Testing accuracy: ' + str(test_acc_log))

Best parameter value is: {'C': 0.0001}
Training accuracy: 0.7161904761904762
Testing accuracy: 0.7351598173515982


# Linear Discriminant Analysis

In [7]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold

In [8]:
# LDA Classifier
clf_lda = LinearDiscriminantAnalysis()

# 5-fold cross validation
kf = KFold(n_splits = 5, shuffle=True, random_state=0)

train_scores = []
test_scores = []

# Obtain and iterate over train and test indices
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train LDA classifier
    clf_lda.fit(X_train, y_train)
    
    # Obtain training and testing scores
    train_scores.append(clf_lda.score(X_train, y_train))
    test_scores.append(clf_lda.score(X_test, y_test))

In [9]:
# Obtain training and testing accuracy
train_acc_lda = np.mean(train_scores)
test_acc_lda = np.mean(test_scores)

print('Mean training score: ' + str(train_acc_lda))
print('Mean testing score: ' + str(test_acc_lda))

Mean training score: 0.7697289087150473
Mean testing score: 0.7336957530534209


# Random Forest

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Random Forest Classifier
clf_rf = RandomForestClassifier(random_state=0)

# Hyperparameters for RF classifier
params = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80 ,90, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# Randomized search to find optimal hyperparameters
rand_rf = RandomizedSearchCV(clf_rf, params, n_jobs=-1, iid=False, cv=5, return_train_score=True)

# Fit classifier
result_rf = rand_rf.fit(X_train, y_train)

In [12]:
# Obtained optimal hyperparameter values
print('Best parameter values are: ' + str(result_rf.best_params_))

# Calculate accuracy of optimized RF model
train_acc_rf = result_rf.score(X_train, y_train)
test_acc_rf = result_rf.score(X_test, y_test)

print('Train Score: ' + str(train_acc_rf))
print('Test Score: ' + str(test_acc_rf))

Best parameter values are: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 80}
Train Score: 0.9996191926884996
Test Score: 0.9679878048780488
