In [101]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, fbeta_score
import os
from sklearn.model_selection import GridSearchCV

ROOT_DIR = os.path.abspath("../../..")
DATA_DIR = os.path.join(ROOT_DIR, "data/PAAWS/HINF_results/ML_value/wake_features.csv")


Read the data from csv file 

In [105]:
# read the csv file
data_df = pd.read_csv(DATA_DIR)
# drop NaN values
data_df = data_df.dropna()
data_df



Unnamed: 0,timestamp,x_mean,x_std,x_min,x_max,x_median,x_skew,x_fft_dc,x_fft_mean,x_fft_std,...,z_fft_pos_count,z_fft_above_mean,z_fft_num_peaks,z_fft_skew,z_fft_kurtosis,z_fft_energy,z_fft_sma,datetime,is_awake,subject
0,1635815700,0.158287,0.074746,0.03930,0.29740,0.175425,-0.040234,126.63000,0.389287,2.076635,...,400,33,140,11.741352,150.176417,2.120664,0.260082,2021-11-01 21:15:00,1,10
1,1635815710,0.137511,0.080691,-0.00745,0.34220,0.132550,0.498314,110.00910,0.315514,2.258943,...,400,51,134,11.548458,154.603403,3.887681,0.445099,2021-11-01 21:15:10,1,10
2,1635815720,0.157129,0.123498,0.01275,0.49765,0.128900,1.090247,125.70290,1.011735,3.341043,...,400,44,129,17.838872,335.639515,8.502025,0.376533,2021-11-01 21:15:20,1,10
3,1635815730,0.142624,0.152997,-0.00675,0.50000,0.109050,1.144524,114.09945,1.051623,4.194893,...,400,47,122,17.406423,325.327849,6.882156,0.341568,2021-11-01 21:15:30,1,10
4,1635815740,0.038864,0.214759,-0.67155,0.21385,0.121975,-2.245617,31.09090,1.141406,5.962219,...,400,43,149,7.006836,60.669167,16.011907,1.043398,2021-11-01 21:15:40,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581390,1652189360,0.951549,0.015338,0.91970,1.02580,0.950400,1.051907,761.23935,0.163048,0.401727,...,400,56,108,9.522243,117.361447,2.431103,0.443435,2022-05-10 09:29:20,1,32
581391,1652189370,0.491484,0.314455,0.06225,0.95740,0.300450,0.530619,393.18750,1.857907,8.692247,...,400,34,131,12.921325,203.770210,13.296781,0.734212,2022-05-10 09:29:30,1,32
581392,1652189380,0.216943,0.059249,0.02930,0.44765,0.219825,0.208391,173.55435,0.423688,1.620301,...,400,42,129,5.974539,41.733856,2.735418,0.451018,2022-05-10 09:29:40,1,32
581393,1652189390,0.230154,0.035526,0.15065,0.31295,0.227700,0.328370,184.12330,0.223065,0.979125,...,400,35,118,10.708023,138.554225,5.143551,0.476563,2022-05-10 09:29:50,1,32


In [107]:
# get the data for the subject 10 to 20 as training data
train_df = data_df[data_df['subject'] >= 10]
train_df = train_df[train_df['subject'] <= 20]
# get the data for the subject 21 to 30 as testing data
test_df = data_df[data_df['subject'] >= 21]

# remove columns timestamp, datetime, subject from the training data
train_df = train_df.drop(columns=['timestamp', 'datetime', 'subject'])
# X is the features ecept for the is_away column
X = train_df.drop(columns=['is_awake'])
# y is the is_away column
y = train_df['is_awake']

# Logistic Regression

In [108]:
# initliaze the scaler
scaler = StandardScaler()


# get the 10 fold cross validation
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(X):
    # get the training and testing data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the model
    model = LogisticRegression()
    print("Fitting model")
    # train the model
    model.fit(X_train, y_train)
    print("Getting prediction")
    # get the prediction
    y_pred = model.predict(X_test)
    print("Getting metrics")
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Average Accuracy: 0.7650257055268986
Average Balanced Accuracy: 0.6654854910367535
Average F1 Score: 0.5078851523211501
Average Precision: 0.584722109759735
Average Recall: 0.4489744631158031
Average ROC AUC: 0.6654854910367534
Average FPR: 0.11800348104229619


# Guassian Naive Bayes

In [109]:
# initliaze the scaler
scaler = StandardScaler()


# get the 10 fold cross validation
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(X):
    # get the training and testing data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the model
    model = GaussianNB()
    print("Fitting model")
    # train the model
    model.fit(X_train, y_train)
    print("Getting prediction")
    # get the prediction
    y_pred = model.predict(X_test)
    print("Getting metrics")
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Average Accuracy: 0.7421028719340412
Average Balanced Accuracy: 0.6325596623595688
Average F1 Score: 0.45115992762269413
Average Precision: 0.5300549001904221
Average Recall: 0.3943148860834078
Average ROC AUC: 0.6325596623595688
Average FPR: 0.12919556136427027


# Random Forest

In [122]:
# initliaze the scaler
scaler = StandardScaler()


# get the 10 fold cross validation
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(X):
    # get the training and testing data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the model
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    print("Fitting model")
    # train the model
    model.fit(X_train, y_train)
    print("Getting prediction")
    # get the prediction
    y_pred = model.predict(X_test)
    print("Getting metrics")
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Average Accuracy: 0.8222957563408331
Average Balanced Accuracy: 0.8105698159028074
Average F1 Score: 0.7046806185881656
Average Precision: 0.6393085548879806
Average Recall: 0.7850313680734831
Average ROC AUC: 0.8105698159028074
Average FPR: 0.16389173626786807


# MLP Classifier

In [111]:
# initliaze the scaler
scaler = StandardScaler()


# get the 10 fold cross validation
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy_lst = []
precision_lst = []
f1_lst = []
recall_lst = []
balanced_lst = []
auroc_lst = []
fpr_lst = []
# loop through each fold
for train_index, test_index in cv.split(X):
    # get the training and testing data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # initialize the model
    model = MLPClassifier(activation= 'relu', alpha= 0.05, hidden_layer_sizes= (10, 30, 10), learning_rate= 'adaptive', solver= 'adam')
    print("Fitting model")
    # train the model
    model.fit(X_train, y_train)
    print("Getting prediction")
    # get the prediction
    y_pred = model.predict(X_test)
    print("Getting metrics")
    # add metrics to the list
    accuracy_lst.append(accuracy_score(y_test, y_pred))
    precision_lst.append(precision_score(y_test, y_pred))
    f1_lst.append(f1_score(y_test, y_pred))
    recall_lst.append(recall_score(y_test, y_pred))
    balanced_lst.append(balanced_accuracy_score(y_test, y_pred))
    auroc_lst.append(roc_auc_score(y_test, y_pred))
    fpr_lst.append(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0]))
    
# print the average metrics
print("Average Accuracy: {}".format(np.mean(accuracy_lst)))
print("Average Balanced Accuracy: {}".format(np.mean(balanced_lst)))
print("Average F1 Score: {}".format(np.mean(f1_lst)))
print("Average Precision: {}".format(np.mean(precision_lst)))
print("Average Recall: {}".format(np.mean(recall_lst)))
print("Average ROC AUC: {}".format(np.mean(auroc_lst)))
print("Average FPR: {}".format(np.mean(fpr_lst)))

Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Fitting model
Getting prediction
Getting metrics
Average Accuracy: 0.8083508523134363
Average Balanced Accuracy: 0.7793903959206103
Average F1 Score: 0.6685621170638198
Average Precision: 0.6274692882199547
Average Recall: 0.7162853978083987
Average ROC AUC: 0.7793903959206103
Average FPR: 0.1575046059671781


# Test the Best Model on the Test Set

In [112]:
clf = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=42)
print(X.columns)

Index(['x_mean', 'x_std', 'x_min', 'x_max', 'x_median', 'x_skew', 'x_fft_dc',
       'x_fft_mean', 'x_fft_std', 'x_fft_aad', 'x_fft_min', 'x_fft_max',
       'x_fft_maxmin_diff', 'x_fft_median', 'x_fft_mad', 'x_fft_IQR',
       'x_fft_neg_count', 'x_fft_pos_count', 'x_fft_above_mean',
       'x_fft_num_peaks', 'x_fft_skew', 'x_fft_kurtosis', 'x_fft_energy',
       'x_fft_sma', 'y_mean', 'y_std', 'y_min', 'y_max', 'y_median', 'y_skew',
       'y_fft_dc', 'y_fft_mean', 'y_fft_std', 'y_fft_aad', 'y_fft_min',
       'y_fft_max', 'y_fft_maxmin_diff', 'y_fft_median', 'y_fft_mad',
       'y_fft_IQR', 'y_fft_neg_count', 'y_fft_pos_count', 'y_fft_above_mean',
       'y_fft_num_peaks', 'y_fft_skew', 'y_fft_kurtosis', 'y_fft_energy',
       'y_fft_sma', 'z_mean', 'z_std', 'z_min', 'z_max', 'z_median', 'z_skew',
       'z_fft_dc', 'z_fft_mean', 'z_fft_std', 'z_fft_aad', 'z_fft_min',
       'z_fft_max', 'z_fft_maxmin_diff', 'z_fft_median', 'z_fft_mad',
       'z_fft_IQR', 'z_fft_neg_count', 'z_fft_

In [113]:
# train the model on the entire training set
clf.fit(X, y)

In [114]:
# set X test to be the the test_df with the timestamp, datetune and subject columns dropped
X_test = test_df.drop(['timestamp', 'datetime', 'subject', 'is_awake'], axis=1)
y_test = test_df['is_awake']
print(X_test.columns)

Index(['x_mean', 'x_std', 'x_min', 'x_max', 'x_median', 'x_skew', 'x_fft_dc',
       'x_fft_mean', 'x_fft_std', 'x_fft_aad', 'x_fft_min', 'x_fft_max',
       'x_fft_maxmin_diff', 'x_fft_median', 'x_fft_mad', 'x_fft_IQR',
       'x_fft_neg_count', 'x_fft_pos_count', 'x_fft_above_mean',
       'x_fft_num_peaks', 'x_fft_skew', 'x_fft_kurtosis', 'x_fft_energy',
       'x_fft_sma', 'y_mean', 'y_std', 'y_min', 'y_max', 'y_median', 'y_skew',
       'y_fft_dc', 'y_fft_mean', 'y_fft_std', 'y_fft_aad', 'y_fft_min',
       'y_fft_max', 'y_fft_maxmin_diff', 'y_fft_median', 'y_fft_mad',
       'y_fft_IQR', 'y_fft_neg_count', 'y_fft_pos_count', 'y_fft_above_mean',
       'y_fft_num_peaks', 'y_fft_skew', 'y_fft_kurtosis', 'y_fft_energy',
       'y_fft_sma', 'z_mean', 'z_std', 'z_min', 'z_max', 'z_median', 'z_skew',
       'z_fft_dc', 'z_fft_mean', 'z_fft_std', 'z_fft_aad', 'z_fft_min',
       'z_fft_max', 'z_fft_maxmin_diff', 'z_fft_median', 'z_fft_mad',
       'z_fft_IQR', 'z_fft_neg_count', 'z_fft_

In [115]:
# get the prediction
y_pred = clf.predict(X_test)
# print the metrics
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Balanced Accuracy: {}".format(balanced_accuracy_score(y_test, y_pred)))
print("F1 Score: {}".format(f1_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("ROC AUC: {}".format(roc_auc_score(y_test, y_pred)))
print("FPR: {}".format(confusion_matrix(y_test, y_pred)[0][1] / (confusion_matrix(y_test, y_pred)[0][1] + confusion_matrix(y_test, y_pred)[0][0])))
print("Confusion Matrix: {}".format(confusion_matrix(y_test, y_pred)))

Accuracy: 0.7849312234977764
Balanced Accuracy: 0.7619668136996209
F1 Score: 0.6499957922071676
Precision: 0.599760388702771
Recall: 0.7094158400251929
ROC AUC: 0.7619668136996209
FPR: 0.18548221262595105
Confusion Matrix: [[158440  36080]
 [ 22146  54066]]


In [116]:
test_df['prediction'] = y_pred

In [117]:
test_df = test_df[['timestamp', 'datetime', 'subject', 'is_awake', 'prediction']]
test_df.to_csv('rf_pred.csv', index=False)

In [118]:
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
test_df['datetime'] = pd.to_datetime(test_df['timestamp'], unit='s')

def cpr(window: int):
    total_night = 0
    correct_prompt = 0
    unique_subjects = test_df['subject'].unique()
    for subject in unique_subjects:
        # get the df
        df = test_df[test_df['subject'] == subject]
        # create a new column called date 
        df['date'] = df['datetime'].dt.date
        # get unique dates
        unique_dates = df['date'].unique()
        # fot each date
        for date in unique_dates:
            try:
                # filter the df
                df_date = df[df['date'] == date]
                # only get the rows with hour < 20
                df_date = df_date[df_date['datetime'].dt.hour < 20]
                # sort the df by datetime
                df_date = df_date.sort_values(by='datetime')
                # get the first row with is_awake == 1
                awake_row = df_date[df_date['is_awake'] == 1].iloc[0]
                # get the time of the awake row
                awake_time = awake_row['datetime']
                # filter all the rows with timestamp 3 minutes around the awake time
                df_date = df_date[(df_date['datetime'] >= awake_time - dt.timedelta(minutes=window)) & (df_date['datetime'] <= awake_time + dt.timedelta(minutes=window))]
                # see if any prediction is 1
                if df_date['prediction'].sum() > 0:
                    correct_prompt += 1
                total_night += 1
            except:
                continue
    return correct_prompt / total_night

print(cpr(3))
print(cpr(5))
print(cpr(10))

0.9444444444444444
0.9444444444444444
0.9629629629629629


In [119]:
# create a mew df with only the timestamp, datetime, subject, is_awake and prediction columns
df = test_df[['timestamp', 'datetime', 'subject', 'is_awake', 'prediction']]
# save the df to a csv file
df.to_csv('rf_pred.csv', index=False)

In [120]:
data_df[data_df['subject'] == 23].head()

Unnamed: 0,timestamp,x_mean,x_std,x_min,x_max,x_median,x_skew,x_fft_dc,x_fft_mean,x_fft_std,...,z_fft_pos_count,z_fft_above_mean,z_fft_num_peaks,z_fft_skew,z_fft_kurtosis,z_fft_energy,z_fft_sma,datetime,is_awake,subject
310654,1648607550,0.047893,0.001755,0.0418,0.0512,0.0482,-1.143045,38.3144,0.014659,0.047398,...,400,65,130,9.258233,102.871534,0.006459,0.024091,2022-03-29 22:32:30,1,23
310655,1648607560,0.04332,0.004711,0.028,0.059,0.0436,0.071216,34.656,0.032554,0.129135,...,400,34,139,10.63016,121.577779,0.89809,0.1495,2022-03-29 22:32:40,1,23
310656,1648607570,-0.059873,0.210373,-0.7244,0.1891,0.035325,-2.007898,47.89825,1.747089,5.68409,...,400,45,102,11.268218,138.400024,242.858444,3.109943,2022-03-29 22:32:50,1,23
310657,1648607580,-0.391247,0.625275,-0.9432,0.7595,-0.709825,0.958636,312.998,3.548556,17.314505,...,400,49,151,11.500758,167.470389,30.687997,1.459172,2022-03-29 22:33:00,1,23
310658,1648607590,-0.166779,0.42619,-0.9392,0.98705,-0.310725,1.684895,133.42285,3.110897,11.638344,...,400,41,117,8.61492,88.425213,24.791086,1.197007,2022-03-29 22:33:10,1,23


In [121]:
# sort the df by datetime
test_df = data_df[data_df['subject'] == 23]
test_df.head()

Unnamed: 0,timestamp,x_mean,x_std,x_min,x_max,x_median,x_skew,x_fft_dc,x_fft_mean,x_fft_std,...,z_fft_pos_count,z_fft_above_mean,z_fft_num_peaks,z_fft_skew,z_fft_kurtosis,z_fft_energy,z_fft_sma,datetime,is_awake,subject
310654,1648607550,0.047893,0.001755,0.0418,0.0512,0.0482,-1.143045,38.3144,0.014659,0.047398,...,400,65,130,9.258233,102.871534,0.006459,0.024091,2022-03-29 22:32:30,1,23
310655,1648607560,0.04332,0.004711,0.028,0.059,0.0436,0.071216,34.656,0.032554,0.129135,...,400,34,139,10.63016,121.577779,0.89809,0.1495,2022-03-29 22:32:40,1,23
310656,1648607570,-0.059873,0.210373,-0.7244,0.1891,0.035325,-2.007898,47.89825,1.747089,5.68409,...,400,45,102,11.268218,138.400024,242.858444,3.109943,2022-03-29 22:32:50,1,23
310657,1648607580,-0.391247,0.625275,-0.9432,0.7595,-0.709825,0.958636,312.998,3.548556,17.314505,...,400,49,151,11.500758,167.470389,30.687997,1.459172,2022-03-29 22:33:00,1,23
310658,1648607590,-0.166779,0.42619,-0.9392,0.98705,-0.310725,1.684895,133.42285,3.110897,11.638344,...,400,41,117,8.61492,88.425213,24.791086,1.197007,2022-03-29 22:33:10,1,23
