# Alternative 1 - Deep Learning on the raw signal

<img src="images/feature_set_summary.jpg" width="1000" height="1000">


In [1]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from sklearn import metrics

tf.random.set_seed(0)
np.random.seed(0)

def classification_results(y, yhat):
    acc = metrics.accuracy_score(y, yhat)
    mcc = metrics.matthews_corrcoef(y, yhat)
    f1_weighted = metrics.f1_score(y, yhat, average="weighted")
    return acc, mcc, f1_weighted

In [2]:
df = pd.read_csv("./datasets/df_raw_features.tar.gz")
print("Number of PIDs:", df["pid"].unique().shape[0])
df.head()

Number of PIDs: 100


Unnamed: 0,pid,time,id,sort,sleep_phase,act_0,act_1,act_2,act_3,act_4,...,hr_1,hr_2,hr_3,hr_4,hr_5,hr_6,hr_7,hr_8,hr_9,hr_10
0,0,29,"(0, 0)",0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0
1,0,59,"(0, 1)",1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0
2,0,89,"(0, 2)",2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0
3,0,119,"(0, 3)",3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0,73.0
4,0,149,"(0, 4)",4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0,73.0,80.0


In [3]:
WIN_LENGTH = len([k for k in df.keys() if k.startswith("hr_")])

In [4]:
df["sleep_phase"].unique()

# Type of sleep statging problems:
# -------------------------------
#
#      5-class | 4-class | 3-class | 2-Class
# 0 -> Wake    | Wake    | Wake    | Wake
# 1 -> N1      | Light   | NREM    | Sleep
# 2 -> N2      | Light   | NREM    | Sleep
# 3 -> N3      | Deep    | NREM    | Sleep
# 4 -> N4      | Deep    | NREM    | Sleep
# 5 -> REM     | REM     | REM     | Sleep
#
#


array([0., 1., 2., 5., 3., 4.])

In [5]:
df["sleep"] = (df["sleep_phase"] > 0).astype(int)


- Model input (S, 2, 11): 
                       [
                        [hr_0, hr_1, hr_2   ....hr_10]
                        [act_0, act_1, act_2....act_10]
                       ]
- Model output:
    - (S, 1) (sleep)



### Get XY from dataframe

In [6]:
def generate_XY(df, ycol="sleep"):
    
    hr_cols = [k for k in df.keys() if k.startswith("hr_")]
    act_cols = [k for k in df.keys() if k.startswith("act_")]

    hr = df[hr_cols].values
    act = df[act_cols].values
    
    Y  = df[ycol].values.reshape(-1, 1)

    X = np.stack((act,hr))
    X = X.transpose(1, 0, 2)
    
    return X, Y


In [7]:
df_XY = df.groupby("pid").apply(lambda x: generate_XY(x))
df_XY.head()

pid
0    ([[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2.], [ 0.  0...
1    ([[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.], [ 0.  0...
2    ([[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 12...
3    ([[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.], [ 0.  0...
4    ([[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], [ 0.  0...
dtype: object

In [8]:
idx = 2
df_XY.iloc[idx][0].shape, df_XY.iloc[idx][1].shape

((1075, 2, 11), (1075, 1))

In [9]:
xs, ys = [], []
for row_id, (x, y) in df_XY.items():
    xs.append(x)
    ys.append(y)
    
xs = np.array(xs, dtype=object)
ys = np.array(ys, dtype=object)


In [10]:
# Now we can create a simple trainset from the dataset making sure that
# data from one subject is NOT at the same time in the training and in the test sets
subjects_train_idx = [0, 1, 2, 3, 4]    
np.vstack(xs[subjects_train_idx]).shape, np.vstack(ys[subjects_train_idx]).shape

((4783, 2, 11), (4783, 1))

In [11]:
subjects_train_idx = range(0, 40)
X_train = np.vstack(xs[subjects_train_idx])
Y_train = np.vstack(ys[subjects_train_idx])

subjects_val_idx = range(40, 50)
X_val = np.vstack(xs[subjects_val_idx])
Y_val = np.vstack(ys[subjects_val_idx])

subjects_test_idx = range(50, 100)
X_test = np.vstack(xs[subjects_test_idx])
Y_test = np.vstack(ys[subjects_test_idx])


### Evaluate a few models

In [12]:
def simple_dense_model(input_shape=(2, WIN_LENGTH)):
    
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(32, input_shape=input_shape, activation='relu'))
    model.add(tf.keras.layers.Dense(8, activation='relu'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

simple_model = simple_dense_model()
  

In [13]:
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = simple_model.fit(X_train, Y_train, 
                           validation_data=(X_val, Y_val), 
                           epochs=50, 
                           batch_size=8,
                           shuffle=True,
                           callbacks=[early_stop_callback])


Epoch 1/50


2022-09-15 00:44:00.673395: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


In [14]:
simple_model.evaluate(x=X_test, y=Y_test)



[0.3752657175064087, 0.8446645736694336]

In [15]:
simple_model_predictions = simple_model.predict(x=X_test)
simple_model_predictions



array([[0.0684753 ],
       [0.04976798],
       [0.08227299],
       ...,
       [0.09465394],
       [0.02481614],
       [0.12108617]], dtype=float32)

In [16]:
df_held_out_test = df[df["pid"].isin(range(50, 100))][["pid", "time", "sleep_phase", "sleep"]].copy()
df_held_out_test["yhat"] = simple_model_predictions.round()
df_held_out_test.to_csv("results/simple_nn_raw.csv.tar.gz", index=False)
df_held_out_test

Unnamed: 0,pid,time,sleep_phase,sleep,yhat
51982,50,5099,0.0,0,0.0
51983,50,5129,0.0,0,0.0
51984,50,5159,0.0,0,0.0
51985,50,5189,0.0,0,0.0
51986,50,5219,0.0,0,0.0
...,...,...,...,...,...
102854,99,27569,0.0,0,0.0
102855,99,27599,0.0,0,0.0
102856,99,27629,0.0,0,0.0
102857,99,27659,0.0,0,0.0


In [17]:
# Check the classification results when grouping by participant id
final_results = df_held_out_test.groupby(["pid"])[["sleep", "yhat"]].apply(lambda x:
                                                                          classification_results(x["sleep"].values, 
                                                                                                 x["yhat"].values))

final_results = final_results.apply(pd.Series).rename(columns={0: "Accuracy", 1: "MCC", 2: "F1_weighted"})

final_results.agg(["mean", "std"]).round(3)


Unnamed: 0,Accuracy,MCC,F1_weighted
mean,0.846,0.607,0.838
std,0.063,0.132,0.067


In [18]:
# Combine previous two cells into a small function:

def evaluate_per_pid(df, test_range, nnmodel, x_test, name):
    
    df_held_out_test = df[df["pid"].isin(test_range)][["pid", "time", "sleep_phase", "sleep"]].copy()
    df_held_out_test["yhat"] = nnmodel.predict(x_test).round()
    df_held_out_test.to_csv(f"results/{name}.csv.tar.gz", index=False)
        
    final_results = df_held_out_test.groupby(["pid"])[["sleep", "yhat"]].apply(lambda x:
                                                                          classification_results(x["sleep"].values, 
                                                                                                 x["yhat"].values))

    final_results = final_results.apply(pd.Series).rename(columns={0: "Accuracy", 1: "MCC", 2: "F1_weighted"})

    return final_results.agg(["mean", "std"]).round(3)



In [19]:
evaluate_per_pid(df, range(50, 100), simple_model, X_test, name="simple_nn_raw")



Unnamed: 0,Accuracy,MCC,F1_weighted
mean,0.846,0.607,0.838
std,0.063,0.132,0.067


In [20]:
def cnn_lstm_model(cnn_d = 32, lstm_d = 16):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Conv1D(cnn_d, kernel_size=(3,), padding='same'))
    model.add(tf.keras.layers.BatchNormalization(epsilon=1e-06, axis=-1, momentum=0.9))
    model.add(tf.keras.layers.Activation(tf.nn.relu))
    
    model.add(tf.keras.layers.Dropout(0.05))
    model.add(tf.keras.layers.LSTM(lstm_d, return_sequences=False))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid", name='output'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

cnnlstm_model = cnn_lstm_model()

In [21]:
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = cnnlstm_model.fit(X_train, Y_train, 
                           validation_data=(X_val, Y_val), 
                           epochs=50, 
                           batch_size=8,
                           callbacks=[early_stop_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


In [22]:
cnnlstm_model.evaluate(x=X_test, y=Y_test)



[0.38033977150917053, 0.8457849025726318]

In [23]:
evaluate_per_pid(df, range(50, 100), cnnlstm_model, X_test, name="cnnlstm_nn_raw")



Unnamed: 0,Accuracy,MCC,F1_weighted
mean,0.847,0.61,0.84
std,0.062,0.131,0.067
