# Alternative 3 - End-to-end deep learning on raw features

<img src="images/feature_set_summary.jpg" width="1000" height="1000">

Get the pre-computed dataset from: https://www.dropbox.com/scl/fo/5llpuwwtcuo22p9jnfxuo/h?dl=0&rlkey=nm0kqrfbk3z9s8qns8hjh4437

In [1]:
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

def reset_seed(seed=0):
    """Seed the Python, NumPy and TensorFlow global random generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to 0, the
            value that used to be hard-coded.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

from sklearn import metrics

def classification_results(y, yhat):
    """Score predictions `yhat` against ground truth `y`.

    Returns:
        Tuple ``(accuracy, matthews_corrcoef, weighted_f1)`` as computed by
        ``sklearn.metrics``.
    """
    return (
        metrics.accuracy_score(y, yhat),
        metrics.matthews_corrcoef(y, yhat),
        metrics.f1_score(y, yhat, average="weighted"),
    )


# Seed the global NumPy/TensorFlow generators before any data or model code runs.
reset_seed()

In [2]:
# Load the pre-computed raw-feature table and preview its first rows.
df = pd.read_csv("./datasets/df_raw_features.tar.gz")
df.head()

Unnamed: 0,pid,time,id,sort,sleep_phase,act_0,act_1,act_2,act_3,act_4,...,hr_1,hr_2,hr_3,hr_4,hr_5,hr_6,hr_7,hr_8,hr_9,hr_10
0,0,29,"(0, 0)",0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0
1,0,59,"(0, 1)",1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0
2,0,89,"(0, 2)",2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0
3,0,119,"(0, 3)",3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0,73.0
4,0,149,"(0, 4)",4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0,73.0,80.0


In [3]:
# List the distinct sleep-stage codes that occur in the data.
df["sleep_phase"].unique()

# Types of sleep staging problems:
# --------------------------------
#
#      5-class | 4-class | 3-class | 2-Class
# 0 -> Wake    | Wake    | Wake    | Wake
# 1 -> N1      | Light   | NREM    | Sleep
# 2 -> N2      | Light   | NREM    | Sleep
# 3 -> N3      | Deep    | NREM    | Sleep
# 4 -> N4      | Deep    | NREM    | Sleep
# 5 -> REM     | REM     | REM     | Sleep
#
#

array([0., 1., 2., 5., 3., 4.])

In [4]:
# Binary (2-class) target: stage code 0 is Wake, every other code counts as Sleep.
df["sleep"] = df["sleep_phase"] > 0

In [5]:
# Sentinel written into padded time steps of both features and labels.
MASKING_VALUE = -1000


def _last_indexed_column(df, prefix):
    """Return the `<prefix><n>` column name with the largest integer suffix n."""
    by_index = {int(name.split("_")[1]): name for name in df.keys() if name.startswith(prefix)}
    # Compare suffixes numerically so that e.g. "hr_10" ranks above "hr_2".
    return by_index[sorted(by_index)[-1]]


def generate_XY(df, maxdim, ycol="sleep"):
    """Turn one participant's rows into a fixed-length (features, labels) pair.

    Only the highest-numbered ``act_*`` and ``hr_*`` columns are used, giving
    two features per time step in the order (activity, heart rate).

    Args:
        df: Rows of a single participant, one row per time step.
        maxdim: Target sequence length. Shorter sequences are padded at the
            end with ``MASKING_VALUE``; longer ones are cropped to the first
            ``maxdim`` steps.
        ycol: Name of the label column (cast to int).

    Returns:
        Tuple ``(X, Y)`` where ``X`` is a float array of shape
        ``(1, maxdim, 2)`` and ``Y`` an int array of shape ``(1, maxdim, 1)``.
    """
    hr_key = _last_indexed_column(df, "hr_")
    act_key = _last_indexed_column(df, "act_")

    labels = df[ycol].astype(int).values.reshape(-1, 1)
    # One row per time step, columns = (activity, heart rate).
    features = np.stack((df[act_key].values, df[hr_key].values), axis=1)

    n_steps = features.shape[0]
    if maxdim > n_steps:
        # Too short: append masked steps until the sequence has maxdim rows.
        tail = ((0, maxdim - n_steps), (0, 0))
        features = np.pad(features, tail, "constant", constant_values=MASKING_VALUE)
        labels = np.pad(labels, tail, "constant", constant_values=MASKING_VALUE)
    else:
        # Long enough: keep only the first maxdim steps.
        features = features[:maxdim]
        labels = labels[:maxdim]

    # Add the leading batch axis expected by Keras.
    return features[np.newaxis].astype(float), labels[np.newaxis].astype(int)
    

In [6]:
# Try the conversion on a single participant (pid 1) with an 800-step window.
X, Y = generate_XY(df[df["pid"]==1], maxdim=800)

In [7]:
# Expected layout: (batch, time steps, features).
X.shape

(1, 800, 2)

In [8]:
# Number of rows (time steps) recorded for each participant.
steps_per_pid = df.groupby("pid").size()

max_dimension = steps_per_pid.max()
print("The largest dimension is %d" % (max_dimension))

quantile08 = int(steps_per_pid.quantile(0.8))
print("Quantile 0.8 %d" % (quantile08))

# Pad every participant up to the longest recording so that nothing is cropped.
MAX_DIMENSION = max_dimension
# Alternatives (rows in the preview are 30 s apart, so 1200 steps = 10 hours):
# MAX_DIMENSION = 1200 # 10 hours
# MAX_DIMENSION = 400  # about 3.3 hours

The largest dimension is 1615
Quantile 0.8 1153


In [9]:
# Build one (X, Y) tuple per participant, each padded or cropped to MAX_DIMENSION steps.
df_XY = df.groupby("pid").apply(lambda x: generate_XY(x, maxdim=MAX_DIMENSION))
df_XY.head()

pid
0    ([[[ 2. 71.], [ 0. 76.], [ 1. 78.], [ 2. 73.],...
1    ([[[ 1. 66.], [ 0. 67.], [ 0. 66.], [ 1. 67.],...
2    ([[[12. 77.], [27. 62.], [ 1. 64.], [ 0. 64.],...
3    ([[[ 1. 73.], [ 0. 69.], [ 0. 70.], [ 0. 70.],...
4    ([[[ 0. 51.], [ 1. 49.], [ 1. 53.], [ 1. 50.],...
dtype: object

In [10]:
# Sanity-check the feature and label shapes for one arbitrary participant.
idx = 30
df_XY.iloc[idx][0].shape, df_XY.iloc[idx][1].shape

((1, 1615, 2), (1, 1615, 1))

In [11]:
# Features of that participant; trailing rows equal to MASKING_VALUE are padding.
df_XY.iloc[idx][0]

array([[[    1.,    70.],
        [    1.,    69.],
        [    0.,    71.],
        ...,
        [-1000., -1000.],
        [-1000., -1000.],
        [-1000., -1000.]]])

In [12]:
# Labels of that participant; padded steps carry MASKING_VALUE instead of 0/1.
df_XY.iloc[idx][1]

array([[[    1],
        [    1],
        [    1],
        ...,
        [-1000],
        [-1000],
        [-1000]]])

In [13]:
# Unzip the per-participant (X, Y) tuples into two parallel object arrays,
# keeping the participant order of df_XY.
pairs = [xy for _, xy in df_XY.items()]

xs = np.array([x for x, _ in pairs], dtype=object)
ys = np.array([y for _, y in pairs], dtype=object)


In [14]:
def _stack_subjects(per_subject, subject_idx, dtype):
    """Stack the selected participants' arrays along axis 0 and cast to `dtype`."""
    return np.vstack(per_subject[subject_idx]).astype(dtype)


# Participants are split by position in xs/ys: 0-39 train, 40-49 validation, 50-99 test.
subjects_train_idx = range(0, 40)
X_train = _stack_subjects(xs, subjects_train_idx, float)
Y_train = _stack_subjects(ys, subjects_train_idx, int)

subjects_val_idx = range(40, 50)
X_val = _stack_subjects(xs, subjects_val_idx, float)
Y_val = _stack_subjects(ys, subjects_val_idx, int)

subjects_test_idx = range(50, 100)
X_test = _stack_subjects(xs, subjects_test_idx, float)
Y_test = _stack_subjects(ys, subjects_test_idx, int)


In [15]:
def build_bilstm_model(cnn_d=32, lstm_d=16):
    """Build and compile a per-time-step binary classifier on masked sequences.

    Architecture: Masking -> Dropout(0.1) -> Bidirectional LSTM (one output per
    time step) -> Dense(1, sigmoid).

    Args:
        cnn_d: Unused; kept only so existing calls that pass it keep working.
        lstm_d: Number of units in each direction of the bidirectional LSTM.

    Returns:
        A compiled ``tf.keras`` Sequential model (binary cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = tf.keras.models.Sequential()
    # Time steps whose features all equal MASKING_VALUE are treated as padding.
    model.add(tf.keras.layers.Masking(mask_value=MASKING_VALUE))

    model.add(tf.keras.layers.Dropout(0.1))

    # return_sequences=True yields one prediction per time step.
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_d, return_sequences=True)))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid", name='output'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# The factory and the instance use different names so this cell can be re-run:
# previously the function was overwritten by the model it returned.
bilstm_model = build_bilstm_model()

In [17]:
# Re-seed right before training so the run starts from a known random state.
reset_seed()
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = bilstm_model.fit(X_train, Y_train,
                           validation_data=(X_val, Y_val), 
                           epochs=50, 
                           batch_size=8,
                           shuffle=True,
                           callbacks=[early_stop_callback])



Epoch 1/50


2022-09-15 18:44:44.200468: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


In [18]:
# Score the held-out participants on the CPU.
# X_test / Y_test are already float / int, so the casts below are no-ops kept for safety.
with tf.device('/cpu:0'):
    bilstm_model.evaluate(x=X_test.astype(float), y=Y_test.astype(int))



In [19]:
# Predict on the CPU and round the sigmoid outputs to hard 0/1 labels.
# The result still contains entries for padded time steps.
with tf.device('/cpu:0'):
    p = bilstm_model.predict(x=X_test.astype(float)).round()
    
print("Have a look at the shape now:", p.shape)
p

Have a look at the shape now: (50, 1615, 1)


array([[[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[1.],
        [1.],
        [1.],
        ...,
        [0.],
        [0.],
        [0.]]], dtype=float32)

In [20]:
# Rows of the held-out participants (pid 50-99), reduced to the id, time and label columns.
test_range = range(50, 100)

df_held_out_test = df[df["pid"].isin(test_range)][["pid", "time", "sleep_phase", "sleep"]].copy()
df_held_out_test

Unnamed: 0,pid,time,sleep_phase,sleep
51982,50,5099,0.0,False
51983,50,5129,0.0,False
51984,50,5159,0.0,False
51985,50,5189,0.0,False
51986,50,5219,0.0,False
...,...,...,...,...
102854,99,27569,0.0,False
102855,99,27599,0.0,False
102856,99,27629,0.0,False
102857,99,27659,0.0,False


In [21]:
# NOTE(review): this first expression is not the last line of the cell, so its
# value is computed and discarded without being displayed.
Y_test.flatten().shape, p.flatten().shape

# Number of real (non-padded) time steps that the model labelled as sleep.
p[Y_test != MASKING_VALUE].round().sum()

38972.0

In [22]:
# Preview the last 400 rows of each held-out participant.
df_held_out_test.groupby("pid").tail(400).reset_index(drop=True)

Unnamed: 0,pid,time,sleep_phase,sleep
0,50,26519,1.0,True
1,50,26549,2.0,True
2,50,26579,1.0,True
3,50,26609,1.0,True
4,50,26639,1.0,True
...,...,...,...,...
19995,99,27569,0.0,False
19996,99,27599,0.0,False
19997,99,27629,0.0,False
19998,99,27659,0.0,False


In [23]:
def evaluate_per_pid(df, test_range, maxdim, nnmodel, x_test, y_test, masking_value, name):
    """Score a sequence model per participant and summarise the metrics.

    The rows of the held-out participants are paired with the model's
    predictions at every non-padded time step, written to
    ``results/<name>.csv.tar.gz`` and scored per ``pid`` with
    ``classification_results``.

    The pairing is positional: it assumes the participants in ``x_test`` /
    ``y_test`` appear in the same order as their rows in ``df``, each with
    its first ``maxdim`` rows.

    Args:
        df: Frame with at least ``pid``, ``time``, ``sleep_phase`` and
            ``sleep`` columns.
        test_range: Collection of ``pid`` values that form the held-out set.
        maxdim: Maximum number of rows kept per participant (the first ones).
        nnmodel: Object whose ``predict(x_test)`` returns an array with the
            same shape as ``y_test``.
        x_test: Model input for the held-out participants.
        y_test: Labels for the same participants; padded steps equal
            ``masking_value``.
        masking_value: Sentinel that marks padded steps in ``y_test``.
        name: Base name of the results file.

    Returns:
        DataFrame with rows ``mean`` and ``std`` and columns ``Accuracy``,
        ``MCC`` and ``F1_weighted``, rounded to three decimals.

    Raises:
        ValueError: If the number of non-padded predictions differs from the
            number of held-out rows.
    """
    # Local import: the notebook's import cell does not bring in `os`.
    import os

    held_out = df[df["pid"].isin(test_range)][["pid", "time", "sleep_phase", "sleep"]].copy()
    held_out = held_out.groupby("pid").head(maxdim).reset_index(drop=True)

    # Drop predictions at padded steps, then threshold the rest to 0/1.
    yhat = nnmodel.predict(x_test)[y_test != masking_value].round()
    if len(yhat) != len(held_out):
        raise ValueError(
            "Got %d non-padded predictions for %d held-out rows; check that "
            "test_range and maxdim match how x_test/y_test were built."
            % (len(yhat), len(held_out))
        )
    held_out["yhat"] = yhat

    out_path = f"results/{name}.csv.tar.gz"
    # Create the output directory so a fresh checkout does not fail on the write.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    held_out.to_csv(out_path, index=False)

    # One (accuracy, MCC, weighted F1) tuple per participant.
    final_results = held_out.groupby(["pid"])[["sleep", "yhat"]].apply(
        lambda x: classification_results(x["sleep"].values, x["yhat"].values)
    )

    # Expand the tuples into named columns.
    final_results = final_results.apply(pd.Series).rename(columns={0: "Accuracy", 1: "MCC", 2: "F1_weighted"})

    return final_results.agg(["mean", "std"]).round(3)

In [24]:
# Per-participant evaluation of the BiLSTM on the held-out participants (pid 50-99).
# NOTE(review): the output name says "cnnlstm" although the model passed in is
# `bilstm_model` - confirm the intended file name.
evaluate_per_pid(df, range(50, 100), MAX_DIMENSION, bilstm_model,
                 X_test.astype(float), Y_test, MASKING_VALUE, name="cnnlstm_model_end2end")



Unnamed: 0,Accuracy,MCC,F1_weighted
mean,0.858,0.634,0.848
std,0.076,0.166,0.08
