In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import h5py # Read and write HDF5 files from Python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
input_tree = os.walk('/kaggle/input')
for folder, _subfolders, files_in_folder in input_tree:
    full_paths = (os.path.join(folder, name) for name in files_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Now we are going into the Dreem 2 Challenge.
The goal is to use Dreem 2 headband data to perform sleep stage scoring on 30-second epochs of biophysiological signals.
https://www.kaggle.com/c/ei-dreem-sleep-stages-2020/data

The training dataset is composed of:
- X_train.h5: input Dreem2 headband data: 30s of biosignals including EEG and accelerometer
- y_train: sleep stages {'Wake':0, 'N1':1, 'N2':2, 'N3':3, 'REM':4} 

The challenge is to submit the sleep stages associated with:
- X_test.h5
(it has to be submitted in the right format, see sample_submission.csv)


In [None]:
# Locations of the challenge files (all paths are built from data_path)
data_path = "/kaggle/input/dreem-2-sleep-classification-challenge-2020/"
file_xtrain = f"{data_path}X_train.h5/X_train.h5"  # training signals (HDF5)
file_xtest = f"{data_path}X_test.h5/X_test.h5"  # test signals (HDF5)
file_ytrain = f"{data_path}y_train.csv"  # training labels (CSV)

Let's have a look at the data

In [None]:
# training labels: read the label CSV and display it as a DataFrame
# (its 'sleep_stage' column is the one read later as the training target)
pd.read_csv(file_ytrain)

In [None]:
# Which top-level entries does the training HDF5 file contain?
with h5py.File(file_xtrain, "r") as h5_file:
    top_level_keys = list(h5_file.keys())
    print(top_level_keys)

In [None]:
# Load the first entry of the HDF5 file, then display its type and shape.
with h5py.File(file_xtrain, "r") as h5_file:
    first_key = list(h5_file.keys())[0]
    first_entry = h5_file[first_key]
    x_data = first_entry[()]  # read the entry's content while the file is still open
(type(x_data), x_data.shape)

In this tutorial (TD), we will only work with one EEG channel.
Let's create the dataset class and function that will be used for training and testing the model:

*EegEpochDataset*: EEG class inherited from the PyTorch `Dataset` class to handle our data

*get_train_validation_dataset*: 
- returns train_dataset and validation_dataset
- these datasets will be wrapped in DataLoaders during the training and the tests


In [None]:
""" Load project data
    DataLoader and Dataset for single-channel EEG

"""

import torch
from torch.utils.data import Dataset


def normalize_data(eeg_array, clip_value=150):
    """Clip the signal to [-clip_value, clip_value] and rescale it to [-1, 1].

    Args:
        eeg_array (numpy array): signal values to normalize.
        clip_value (float): strictly positive amplitude bound. Values beyond
            +/- clip_value are saturated before the division. Defaults to 150.

    Returns:
        numpy array: array of the same shape as ``eeg_array`` whose values
        lie between -1 and 1 (not between 0 and 1: negative samples stay negative).

    Raises:
        ValueError: if ``clip_value`` is not strictly positive.
    """
    if clip_value <= 0:
        raise ValueError("clip_value must be strictly positive, got %r" % (clip_value,))

    clipped_array = np.clip(eeg_array, -clip_value, clip_value)
    return clipped_array / clip_value


class EegEpochDataset(Dataset):
    """EEG epochs dataset: pairs each (normalized) signal epoch with its sleep stage."""

    def __init__(self, x_data, y_data, transform=None):
        """
        Args:
            x_data (numpy array): Numpy array of input data, one epoch per entry
                along the first axis. It is normalized once here with
                ``normalize_data``.
            y_data (list of numpy array): Sleep Stages, one per epoch.
            transform (callable, optional): Optional transform to be applied
                on a sample.

        Raises:
            ValueError: if ``x_data`` and ``y_data`` do not have the same length.
        """
        # Fail early: a length mismatch would otherwise silently pair epochs
        # with the wrong labels or raise much later, during iteration.
        if len(x_data) != len(y_data):
            raise ValueError(
                "x_data and y_data must have the same length, got %d and %d"
                % (len(x_data), len(y_data))
            )

        self.y_data = y_data
        self.transform = transform
        # Only the normalized signals are kept (the raw array is not stored).
        self.x_data = normalize_data(x_data)

    def __len__(self):
        """Return the number of epochs (taken from the labels)."""
        return len(self.y_data)

    def __getitem__(self, idx):
        """Return the ``(signal, stage)`` pair stored at ``idx``.

        The signal gets a leading axis of size 1 and, when a transform was
        given, is passed through it before being returned.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Add a leading axis of size 1 in front of the selected signal.
        signal = np.expand_dims(self.x_data[idx], axis=0)
        stage = self.y_data[idx]

        if self.transform:
            signal = self.transform(signal)

        return signal, stage
    

def get_train_validation_dataset(derivation, validation_ratio=0.2, seed=None):
    """
    Load one EEG derivation and split it randomly into train and validation datasets.

    Note: this returns torch Datasets, not DataLoaders; wrap them in a
    DataLoader to iterate by batch.

    :param derivation: name of the entry read from the training HDF5 file
        (EEG derivation, from eeg_1 to eeg_7)
    :param validation_ratio: fraction of the epochs held out for validation,
        between 0 and 1
    :param seed: optional integer seed making the random split reproducible;
        None (default) shuffles with NumPy's global random state

    :return:
    train_dataset
    validation_dataset

    :raises ValueError: if validation_ratio is outside [0, 1]
    """
    if not 0 <= validation_ratio <= 1:
        raise ValueError(
            "validation_ratio must be between 0 and 1, got %r" % (validation_ratio,)
        )

    with h5py.File(file_xtrain, "r") as fi:
        x_data = fi[derivation][()]
    y_data = pd.read_csv(file_ytrain)['sleep_stage'].to_numpy()

    # Creating data indices for training and validation splits:
    dataset_size = len(y_data)
    indices = list(range(dataset_size))
    split = int((1 - validation_ratio) * dataset_size)
    # A dedicated generator is used only when a seed is given, so the default
    # call keeps drawing from the global NumPy random state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(indices)
    train_indices, val_indices = indices[:split], indices[split:]

    x_train, x_validation = x_data[train_indices], x_data[val_indices]
    y_train, y_validation = y_data[train_indices], y_data[val_indices]

    # torch dataset
    train_dataset = EegEpochDataset(x_data=x_train, y_data=y_train)
    val_dataset = EegEpochDataset(x_data=x_validation, y_data=y_validation)

    return train_dataset, val_dataset


# Build the two datasets (Dataset objects, not DataLoaders) from derivation 'eeg_5':
# train_dataset is the one split into k folds by the training loop;
# final_val_dataset is the held-out split kept for the last validation.
train_dataset, final_val_dataset = get_train_validation_dataset('eeg_5')


Now we create the neural network Model:
- convolutional neural network
- fully connected layers at the end
- takes only a single channel of EEG signal as input

In [None]:
import torch
import torch.nn as nn

class SingleChannelConvNet(nn.Module):
    """1-D convolutional classifier taking a single input channel.

    Strided convolutions (some of them applied several times in a row, hence
    with shared weights) are followed by two fully connected layers that
    output 5 scores per sample. The first fully connected layer expects
    3 * 256 flattened features after the last convolution.
    """

    def __init__(self):
        super().__init__()
        shared = dict(stride=2, padding_mode='zeros')

        # convolutional layers
        self.conv_a = nn.Conv1d(1, 128, 7, padding=6, **shared)
        self.conv_b = nn.Conv1d(128, 128, 7, padding=6, **shared)
        self.conv_c = nn.Conv1d(128, 256, 7, padding=6, **shared)
        self.conv_d = nn.Conv1d(256, 256, 5, padding=4, **shared)
        self.conv_e = nn.Conv1d(256, 256, 3, padding=2, **shared)

        # pooling layer (defined here, not called in forward)
        self.pool = nn.MaxPool1d(2)

        # non-linearity shared by every layer
        self.activfunc_a = nn.LeakyReLU(negative_slope=0.1)

        # fully connected layers - at the end
        self.fc1 = nn.Linear(3 * 256, 100)
        self.fc2 = nn.Linear(100, 5)

    def forward(self, x):
        """Run the convolution stack, flatten, then apply the two linear layers."""
        # (layer, number of consecutive applications of that same layer)
        schedule = (
            (self.conv_a, 1),
            (self.conv_b, 5),
            (self.conv_c, 1),
            (self.conv_d, 3),
            (self.conv_e, 2),
        )
        for conv, repeats in schedule:
            for _ in range(repeats):
                x = self.activfunc_a(conv(x))

        flat = x.view(-1, self.num_flat_features(x))  # flatten the tensor
        hidden = self.activfunc_a(self.fc1(flat))
        return self.fc2(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        return x.size()[1:].numel()

You can now start the training on the train dataloader:
- model will train many times on the dataset: n_epochs
- training dataset will be split into three subsets (k_fold cross-validation)
- loss_val: mean loss on the validation datasets, computed after each epoch of training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# hyper-parameters
learning_rate = 0.001
n_epoch = 20
k_fold = 3
batch_size = 32

# device: first GPU when CUDA is available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# neural network, moved onto the selected device
my_net = SingleChannelConvNet().to(device)
# loss function
criterion = nn.CrossEntropyLoss()
# optimisation algorithm
optimizer = optim.Adam(params=my_net.parameters(), lr=learning_rate)


# function: evaluate the loss on a validation subset
def loss_val(net, val_loader):
    """Return the mean per-batch loss of ``net`` over ``val_loader``.

    Uses the module-level ``criterion`` and ``device``.

    Args:
        net (nn.Module): model to evaluate.
        val_loader (iterable): yields ``(inputs, labels)`` batches. This
            argument is the one iterated, not any global dataloader.

    Returns:
        float: loss averaged over the batches, or 0.0 when ``val_loader``
        yields no batch.
    """
    was_training = net.training
    net.eval()  # evaluation mode while scoring; the previous mode is restored below
    total_loss = 0.0
    n_batches = 0
    try:
        with torch.no_grad():  # no gradient computation during evaluation
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                total_loss += criterion(outputs, labels).item()
                n_batches += 1
    finally:
        net.train(was_training)

    # Average over batches so that subsets of different sizes stay comparable.
    return total_loss / n_batches if n_batches else 0.0


Let's start the loop !

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data.dataset import Subset
from torch.utils.data import DataLoader

# List all the validation losses:
# at the end of each epoch of training, a loss is computed on held-out subsets of data
all_val_loss = []
log_every = 100  # number of mini-batches between two progress prints

print('training...')
for epoch in range(n_epoch):  # loop over the dataset multiple times

    # validation losses for this epoch (one per fold, n=k_fold)
    val_loss = []
    for train_indices, val_indices in KFold(n_splits=k_fold).split(list(range(len(train_dataset)))):
        # k-fold split (k=3): train on the training folds and keep the remaining
        # fold aside to measure the validation loss, so that overfitting can be spotted
        train_subset = Subset(train_dataset, train_indices)
        val_subset = Subset(train_dataset, val_indices)

        train_dataloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=4)
        val_dataloader = DataLoader(val_subset, batch_size=batch_size, num_workers=8)

        my_net.train()  # make sure the network is in training mode
        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + loss + backward + optimize
            outputs = my_net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics: mean loss over the last `log_every` mini-batches
            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / log_every))
                # reset only after printing, so the loss really accumulates
                # between two prints
                running_loss = 0.0

        # validation loss of this fold
        val_loss += [loss_val(my_net, val_dataloader)]

    # average of the fold validation losses for this epoch
    all_val_loss += [np.round(np.mean(val_loss), 2)]
    print(all_val_loss)


print('Finished Training')

In [None]:
from pprint import pprint
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score, confusion_matrix

# score function
def evaluate(true, pred):
    """Score predictions against the true labels.

    Returns a dict with the keys 'balanced_accuracy', 'cohen_kappa' and
    'confusion_matrix', each holding the result of the matching metric
    function called as ``metric(true, pred)``.
    """
    metrics = (
        ('balanced_accuracy', balanced_accuracy_score),
        ('cohen_kappa', cohen_kappa_score),
        ('confusion_matrix', confusion_matrix),
    )
    return {name: metric(true, pred) for name, metric in metrics}

# params
classes = ['Wake', 'N1', 'N2', 'N3', 'REM']

# final validation dataset: has not been used for the training
val_dataloader = DataLoader(final_val_dataset, batch_size=batch_size, num_workers=8)

# evaluate the performance of the model
my_net.eval()  # evaluation mode for inference
predicted_batches = []
true_batches = []
with torch.no_grad():
    for data in val_dataloader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = my_net(inputs)
        _, predicted = torch.max(outputs, 1)
        # Collect per-batch results and concatenate once at the end: this keeps
        # the integer dtype of labels/predictions (no float accumulator) and
        # avoids re-copying the growing tensor at every batch.
        predicted_batches.append(predicted.cpu())
        true_batches.append(labels.cpu())


# Scores
true_list = torch.cat(true_batches).numpy()
prediction_list = torch.cat(predicted_batches).numpy()
scores = evaluate(true_list, prediction_list)

print(scores)

During the training, you may have noticed that you could have stopped earlier to get a lower validation loss, and maybe a better model at the end.
Rewrite the code to save the 3 models with the lowest validation loss, and compare them on the final_validation_dataset!