In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [2]:
# Build `full_dataset`: for every subject, the per-column mean of the raw sensor readings
# over consecutive 10-minute windows, plus the window start (`Timestamp`) and `SubjectID`.
date_format = '%d.%m.%y %H:%M'  # day.month.2-digit-year hour:minute; reused when parsing the questionnaire file
dfs = []
for i in range(1, 29):
    file_path = 'data/raw/subjectID_' + str(i) + '.csv'
    df = pd.read_csv(file_path)
    # Not every file has these two columns; errors='ignore' drops them only where present
    df = df.drop(columns=['ActivityClass', 'GalvanicSkinResponse'], errors='ignore')
    # Some files call the skin temperature column 'SkinTemperature.Value'; rename is a no-op otherwise
    df = df.rename(columns={'SkinTemperature.Value': 'SkinTemperature'})
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format=date_format)
    # 10-minute averages of each column ('10min' replaces the deprecated '10T' alias);
    # windows without any reading come out as all-NaN rows
    df = df.set_index('Timestamp').resample('10min').mean().reset_index()
    # Tag the rows with the subject they belong to
    df['SubjectID'] = i
    dfs.append(df)

# Concatenate once at the end instead of growing a frame inside the loop
full_dataset = pd.concat(dfs, ignore_index=True)

In [3]:
# Preview the combined dataset (the rich display truncates the middle rows)
full_dataset

Unnamed: 0,Timestamp,ActivityCounts,Barometer,BloodPerfusion,BloodPulseWave,EnergyExpenditure,HR,HRV,RESP,Steps,SkinTemperature,SubjectID
0,2019-03-14 14:30:00,0.980392,981.225000,0.087500,0.950000,1004.848755,79.201816,,,0.000000,28.860000,1
1,2019-03-14 14:40:00,3.006536,981.166667,0.608000,1.604000,1002.522716,77.051356,,13.490989,0.000000,30.859444,1
2,2019-03-14 14:50:00,3.398693,981.211111,0.584167,1.938333,1154.805566,84.622675,,19.241810,7.888889,29.280556,1
3,2019-03-14 15:00:00,1.176471,981.010000,0.381000,2.132000,1701.043952,71.817909,45.770417,15.745870,9.400000,32.827500,1
4,2019-03-14 15:10:00,9.568627,981.250000,0.345000,2.356000,4240.149158,95.510233,45.626667,18.235506,15.200000,34.298500,1
...,...,...,...,...,...,...,...,...,...,...,...,...
135571,2018-08-16 07:10:00,,,,,,,,,,,28
135572,2018-08-16 07:20:00,,,,,,,,,,,28
135573,2018-08-16 07:30:00,,,,,,,,,,,28
135574,2018-08-16 07:40:00,0.784314,988.100000,0.370000,2.380000,1027.824555,58.299174,83.916667,14.663512,0.000000,34.293333,28


In [4]:
# Load the fatigue questionnaire answers (columns used below: SubjectID, DateTime,
# question, PROanswer_value).
fatiguePROs = pd.read_csv('data/fatiguePROs_processed.csv')
# convert the DateTime column to datetime type (same format as the sensor timestamps)
fatiguePROs['DateTime'] = pd.to_datetime(fatiguePROs['DateTime'], format=date_format)

# rows whose answer is missing
has_missing_answer = fatiguePROs['PROanswer_value'].isnull()
# timestamps (across all subjects) with at least one missing answer; kept for inspection
missing_proanswers = fatiguePROs.loc[has_missing_answer, 'DateTime'].unique()

# A questionnaire session is one (SubjectID, DateTime) pair. Drop every row of a session
# that has a missing answer. Filtering on DateTime alone would also discard complete
# sessions of *other* subjects that happen to share the same minute.
incomplete_sessions = fatiguePROs.loc[has_missing_answer, ['SubjectID', 'DateTime']].drop_duplicates()
flagged = fatiguePROs.merge(incomplete_sessions, on=['SubjectID', 'DateTime'], how='left', indicator=True)
fatiguePROs = (
    flagged[flagged['_merge'] == 'left_only']  # rows with no match among the incomplete sessions
    .drop(columns='_merge')
    .reset_index(drop=True)
)
fatiguePROs


Unnamed: 0,SubjectID,DateTime,Timezone,question,PROanswer_value
0,1,2019-03-14 20:01:00,UTC,VAS,2.0
1,1,2019-03-14 20:01:00,UTC,RelP,-1.0
2,1,2019-03-14 20:01:00,UTC,PhF,0.0
3,1,2019-03-14 20:01:00,UTC,MF,1.0
4,1,2019-03-15 20:01:00,UTC,VAS,2.0
...,...,...,...,...,...
2075,28,2018-08-16 00:51:00,CEST,MF,1.0
2076,28,2018-08-16 00:52:00,CEST,VAS,4.0
2077,28,2018-08-16 00:52:00,CEST,RelP,1.0
2078,28,2018-08-16 00:52:00,CEST,PhF,0.0


In [5]:
# Build one example per questionnaire session: the subject's 10-minute sensor windows from the
# five days up to (and including) the session time, paired with the four answers of that session.
segments_fulldataset = []  # one (n_windows, n_sensor_columns) array per kept session
segments_subjectIDs = []   # subject each segment came from (same order as segments_fulldataset)
PROlabels_PhF = []
PROlabels_MF = []
PROlabels_VAS = []
PROlabels_RelP = []

# question name -> list that collects its answers (keeps the four lists in lock-step)
labels_by_question = {
    'PhF': PROlabels_PhF,
    'MF': PROlabels_MF,
    'VAS': PROlabels_VAS,
    'RelP': PROlabels_RelP,
}
segment_window = pd.Timedelta(days=5)
min_observed_fraction = 0.8  # minimum share of non-missing sensor values for a segment to be kept

for i in range(1, 29):
    # sensor windows and questionnaire rows of subject i
    subject_data = full_dataset[full_dataset['SubjectID'] == i]
    subject_labels = fatiguePROs[fatiguePROs['SubjectID'] == i]

    for end_date_time in subject_labels['DateTime'].unique():
        end_date_time = pd.Timestamp(end_date_time)
        # Half-open window (end - 5 days, end]. No clamping to the subject's first timestamp:
        # the row filter already limits the segment to existing data, and clamping combined
        # with the strict '>' would silently drop the first recorded window.
        start_date_time = end_date_time - segment_window
        in_window = (subject_data['Timestamp'] > start_date_time) & (subject_data['Timestamp'] <= end_date_time)
        # Keep only the sensor columns. SubjectID must not be a model feature, and because it is
        # never missing it would also inflate the completeness ratio below.
        segment = subject_data.loc[in_window].drop(columns=['Timestamp', 'SubjectID'])

        # skip empty segments and segments with less than 80% observed values
        if segment.size == 0 or segment.notnull().to_numpy().mean() < min_observed_fraction:
            continue

        # first answer per question for this session
        session = subject_labels[subject_labels['DateTime'] == end_date_time]
        answers = session.drop_duplicates(subset='question').set_index('question')['PROanswer_value']
        # Look the labels up *before* appending anything, so a session lacking one of the four
        # questions is skipped instead of raising midway and leaving the lists misaligned.
        if any(question not in answers.index for question in labels_by_question):
            continue

        segments_fulldataset.append(segment.to_numpy())
        segments_subjectIDs.append(i)
        for question, labels in labels_by_question.items():
            labels.append(answers[question])
len(segments_fulldataset)


360

In [6]:
# Select the compute device: the first CUDA GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import matplotlib.pyplot as plt



In [8]:
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input's last dimension must equal ``hidden_size``; attention is computed
    over the second-to-last dimension. No mask is applied, so every position
    attends to every other position. The output has the same shape as the input.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Learned projections producing queries, keys and values from the same input
        self.W_q = nn.Linear(hidden_size, hidden_size)
        self.W_k = nn.Linear(hidden_size, hidden_size)
        self.W_v = nn.Linear(hidden_size, hidden_size)
        # Normalises the scores of each query over all keys
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return, for each position, the attention-weighted sum of the value vectors."""
        q, k, v = self.W_q(x), self.W_k(x), self.W_v(x)

        # Dot-product scores divided by sqrt(feature size)
        scale = torch.sqrt(torch.tensor(x.size(-1), dtype=torch.float32))
        scores = (q @ k.transpose(-2, -1)) / scale

        return self.softmax(scores) @ v

class LSTMWithSelfAttention(nn.Module):
    """Stacked LSTM followed by self-attention and a linear read-out.

    ``forward`` takes a batch-first tensor ``(batch, seq, input_size)`` and returns
    ``(batch, output_size)``, computed from the last time step of the LSTM output
    concatenated with the last time step of the attention output.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.self_attention = SelfAttention(hidden_size)
        # The head sees the LSTM and attention features side by side, hence 2 * hidden_size inputs
        self.fc = nn.Linear(hidden_size*2, output_size)
        # Registered but not applied anywhere in forward()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of sequences to one ``output_size`` vector per sequence."""
        lstm_out, _ = self.lstm(x)
        attended = self.self_attention(lstm_out)

        # NOTE(review): index -1 is the last position of the batch tensor; for sequences that
        # were padded at the end this is a padding step rather than the last real one - confirm.
        last_lstm = lstm_out[:, -1, :]
        last_attended = attended[:, -1, :]

        features = torch.cat((last_lstm, last_attended), dim=-1)
        return self.fc(features)



In [11]:
from torch.nn.utils.rnn import pad_sequence

# Train and evaluate one regression model per questionnaire item (PhF, MF, VAS, RelP):
# full-batch training on 80% of the segments, then test loss and a scatter plot of
# predictions against ground truth on the remaining 20%.
# NOTE(review): the split is over segments, not subjects, and the 5-day windows of one
# subject overlap heavily, so train and test are not independent. Consider a grouped
# split on `segments_subjectIDs` - confirm the intended evaluation protocol.
PROlabels = [PROlabels_PhF, PROlabels_MF, PROlabels_VAS, PROlabels_RelP]
PROlabels_name = ['PhF', 'MF', 'VAS', 'RelP']

# (low, high) limit applied to both plot axes; items not listed here use (-0.5, 4.5)
axis_limits = {'VAS': (0.5, 10.5), 'RelP': (-1.25, 1.25)}

lst_outputs = []  # test predictions per item, each of shape (n_test, 1)
lst_targets = []  # matching ground-truth tensors, each of shape (n_test,)

for label_name, labels in zip(PROlabels_name, PROlabels):
    # Seed NumPy *and* PyTorch; seeding only NumPy leaves the weight initialisation random
    np.random.seed(0)
    torch.manual_seed(0)
    print(label_name)
    X_train, X_test, y_train, y_test = train_test_split(segments_fulldataset, labels, test_size=0.2, random_state=42)

    # Pad every sequence to the longest one in its split with zeros, then replace the NaNs of
    # missing sensor readings with zeros as well
    X_train = torch.nan_to_num(pad_sequence([torch.tensor(seq, dtype=torch.float32) for seq in X_train], batch_first=True, padding_value=0.0))
    X_test = torch.nan_to_num(pad_sequence([torch.tensor(seq, dtype=torch.float32) for seq in X_test], batch_first=True, padding_value=0.0))
    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)

    # Instantiate the model; the feature count is read from the data instead of being hard-coded
    input_size = X_train.shape[-1]
    hidden_size = 128
    output_size = 1  # single regression target
    num_layers = 4

    model = LSTMWithSelfAttention(input_size, hidden_size, output_size, num_layers)

    # Define loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training the model: one full-batch gradient step per epoch
    num_epochs = 30

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        # Forward pass; squeeze only the output dimension so a batch of size 1 keeps its batch axis
        outputs = model(X_train)
        loss = criterion(outputs.squeeze(-1), y_train)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Evaluate the model on the test set
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        test_loss = criterion(test_outputs.squeeze(-1), y_test)

    lst_outputs.append(test_outputs)
    lst_targets.append(y_test)
    print(f'Test Loss: {test_loss.item():.4f}')

    # Scatter plot of predictions against ground truth, same scale on both axes
    low, high = axis_limits.get(label_name, (-0.5, 4.5))
    fig, ax = plt.subplots()
    ax.set_xlim(low, high)
    ax.set_ylim(low, high)
    ax.set_aspect('equal', adjustable='box')
    ax.scatter(y_test.numpy(), test_outputs.squeeze(-1).numpy(), alpha=0.5)
    ax.set_title(label_name, y=-0.2)
    ax.set_xlabel('Ground Truth')
    ax.set_ylabel('Predictions')
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures
    plt.close(fig)
   



PhF


: 