In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, fbeta_score
import os
from sklearn.model_selection import GridSearchCV

ROOT_DIR = os.path.abspath("../../..")
DATA_DIR = os.path.join(ROOT_DIR, "data/PAAWS/HINF_results/ML_value/ml.csv")


Read the data from csv file 

In [6]:
# read the csv file
data_df = pd.read_csv(DATA_DIR)
data_df

Unnamed: 0,timestamp,user_id,is_dominant_hand,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,is_awake
0,1.636340e+09,10.0,1.0,133.897566,138.723742,11.607398,20.176750,15.028654,13.808494,8.592218,5.973276,219.127295,60.719584,1.0
1,1.636341e+09,10.0,1.0,186.119181,104.270434,123.303300,33.353866,7.082763,79.694083,27.139002,27.714479,10.839804,15.673297,1.0
2,1.636341e+09,10.0,1.0,19.137241,9.478856,14.606582,29.526991,23.313256,54.736681,356.012776,228.638837,15.498464,9.004995,1.0
3,1.636341e+09,10.0,1.0,8.861755,28.725396,21.522301,34.755128,13.680757,23.010703,65.102956,37.238526,40.971031,120.009205,1.0
4,1.636341e+09,10.0,1.0,113.669381,24.312051,55.787079,32.075842,24.206239,36.009034,17.578392,120.446664,23.342901,21.418968,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111163,1.651773e+09,32.0,1.0,26.166969,29.261009,204.805835,145.493203,9.930581,37.620746,215.617256,139.099470,166.391553,249.366493,1.0
111164,1.651773e+09,32.0,1.0,675.729349,477.644093,661.365728,627.166552,323.841812,238.286517,475.389594,558.661125,419.851758,86.814788,1.0
111165,1.651773e+09,32.0,1.0,299.888626,438.333245,343.235316,393.861286,293.038422,407.391127,259.169468,161.635855,192.665703,320.127491,1.0
111166,1.651773e+09,32.0,1.0,409.636119,281.254187,705.151593,400.213953,504.022807,486.839360,481.032165,370.752820,403.714479,588.956122,1.0


# Logistic Regression

In [11]:
# initliaze the scaler
scaler = StandardScaler()

# get the features, which is from t1 to t10
features_cols = ['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10']

# get the 10 fold cross validation
cv = KFold(n_splits=10, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(data_df):
    # get the training and testing data
    train_df = data_df.iloc[train_index]
    test_df = data_df.iloc[test_index]
    
    # get the X_train, y_train, X_test, y_test
    X_train = train_df[features_cols]
    y_train = train_df['is_awake']
    X_test = test_df[features_cols]
    y_test = test_df['is_awake']
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the model
    model = LogisticRegression()
    # train the model
    model.fit(X_train, y_train)
    # get the prediction
    y_pred = model.predict(X_test)
    
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))    

Average Accuracy: 0.7351756180775106
Average Balanced Accuracy: 0.6976496958691507
Average F1 Score: 0.6060090210189399
Average Precision: 0.7517745455126438
Average Recall: 0.5076124500994543
Average ROC AUC: 0.6976496958691507
Average FPR: 0.11231305836115302


# Guassian Naive Bayes

In [12]:
# initliaze the scaler
scaler = StandardScaler()

# get the features, which is from t1 to t10
features_cols = ['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10']

# get the 10 fold cross validation
cv = KFold(n_splits=10, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(data_df):
    # get the training and testing data
    train_df = data_df.iloc[train_index]
    test_df = data_df.iloc[test_index]
    
    # get the X_train, y_train, X_test, y_test
    X_train = train_df[features_cols]
    y_train = train_df['is_awake']
    X_test = test_df[features_cols]
    y_test = test_df['is_awake']
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the guassian naive bayes model
    clf = GaussianNB()
    # train the model
    model.fit(X_train, y_train)
    # get the prediction
    y_pred = model.predict(X_test)
    
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Average Accuracy: 0.7351756180775106
Average Balanced Accuracy: 0.6976496958691507
Average F1 Score: 0.6060090210189399
Average Precision: 0.7517745455126438
Average Recall: 0.5076124500994543
Average ROC AUC: 0.6976496958691507
Average FPR: 0.11231305836115302


# Random Forest

In [16]:
# initliaze the scaler
scaler = StandardScaler()

# get the features, which is from t1 to t10
features_cols = ['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10']

# get the 10 fold cross validation
cv = KFold(n_splits=10, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(data_df):
    # get the training and testing data
    train_df = data_df.iloc[train_index]
    test_df = data_df.iloc[test_index]
    
    # get the X_train, y_train, X_test, y_test
    X_train = train_df[features_cols]
    y_train = train_df['is_awake']
    X_test = test_df[features_cols]
    y_test = test_df['is_awake']
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the random forest model
    clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=3)
    # train the model
    model.fit(X_train, y_train)
    # get the prediction
    y_pred = model.predict(X_test)
    
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Average Accuracy: 0.7351756180775106
Average Balanced Accuracy: 0.6976496958691507
Average F1 Score: 0.6060090210189399
Average Precision: 0.7517745455126438
Average Recall: 0.5076124500994543
Average ROC AUC: 0.6976496958691507
Average FPR: 0.11231305836115302


# MLP Classifier

In [26]:
# initliaze the scaler
scaler = StandardScaler()

# get the features, which is from t1 to t10
features_cols = ['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10']

# get the 10 fold cross validation
cv = KFold(n_splits=10, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(data_df):
    # get the training and testing data
    train_df = data_df.iloc[train_index]
    test_df = data_df.iloc[test_index]
    
    # get the X_train, y_train, X_test, y_test
    X_train = train_df[features_cols]
    y_train = train_df['is_awake']
    X_test = test_df[features_cols]
    y_test = test_df['is_awake']
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the MLP model with 3 hidden layers with 10 neurons each
    clf = MLPClassifier(activation= 'relu', alpha= 0.05, hidden_layer_sizes= (10, 30, 10), learning_rate= 'adaptive', solver= 'adam')
    # train the model
    model.fit(X_train, y_train)
    # get the prediction
    y_pred = model.predict(X_test)
    
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Average Accuracy: 0.7351756180775106
Average Balanced Accuracy: 0.6976496958691507
Average F1 Score: 0.6060090210189399
Average Precision: 0.7517745455126438
Average Recall: 0.5076124500994543
Average ROC AUC: 0.6976496958691507
Average FPR: 0.11231305836115302


In [31]:
# create a scorer function that use fbeta_score as the metric
sc_fn = make_scorer(fbeta_score, beta=2)

# grid search for the best parameters
mlp_params = {
    'hidden_layer_sizes': [(10,30,10)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.05],
    'learning_rate': ['adaptive'],
}
clf = GridSearchCV(MLPClassifier(max_iter=1000), mlp_params, n_jobs=-1, cv=10, scoring='precision')

# get the features, which is from t1 to t10
features_cols = ['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10']
# get the X and y
X = data_df[features_cols]
y = data_df['is_awake']

#get X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit the model
clf.fit(X_train, y_train)

#print the best parameters
clf.best_params_

# get the prediction
y_pred = clf.predict(X_test)

# calculate the metrics
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Balanced Accuracy: {}".format(balanced_accuracy_score(y_test, y_pred)))
print("F1 Score: {}".format(f1_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("ROC AUC: {}".format(roc_auc_score(y_test, y_pred)))
print("FPR: {}".format(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0])))
print("Confusion Matrix: {}".format(confusion_matrix(y_test, y_pred)))

Accuracy: 0.8329135558154178
Balanced Accuracy: 0.8364740307335514
F1 Score: 0.8054464519507725
Precision: 0.7612353989309047
Recall: 0.8551095296341599
ROC AUC: 0.8364740307335514
FPR: 0.18216146816705686
Confusion Matrix: [[10829  2412]
 [ 1303  7690]]


In [25]:
clf.best_params_

{'activation': 'relu',
 'alpha': 0.05,
 'hidden_layer_sizes': (10, 30, 10),
 'learning_rate': 'adaptive',
 'solver': 'adam'}