In [1]:
import datetime
import numpy as np
import os
import pandas as pd
from pandas import Series

#from lstm.ivie_data import BiRNN
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable

from sklearn.metrics import accuracy_score, confusion_matrix
from lstm import ivie_data

In [107]:
cuda_enabled = torch.cuda.is_available()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
BASELINE_START = "baselinestart"
BASELINE_END = "baselineend"
EASY_START = "easystart"
EASY_END = "easyend"
HARD_START = "hardstart"
HARD_END = "hardend"

INPUT_SIZE = 352 # determined by the shortest test sample

EASY_DIFFICULTY = 0
HARD_DIFFICULTY = 1

In [4]:
"""Gets the row blocks for easy and hard tasks
"""
def read_data(fnirs_path, marker_path):
    fnirs_df =  pd.read_csv(fnirs_path, sep='\t', skiprows=range(4), index_col=False)
    marker_df = pd.read_csv(marker_path, sep='\t', skiprows=range(4), index_col=False)
    
    merged_df = pd.merge(fnirs_df, marker_df, on="Matlab_now", how="left")
    
    return merged_df

In [87]:
def get_row_blocks(merged_df):
    easy_start_rows = merged_df.index[merged_df.Stimulus_Label == EASY_START].tolist()
    easy_end_rows = merged_df.index[merged_df.Stimulus_Label == EASY_END].tolist()
    hard_start_rows = merged_df.index[merged_df.Stimulus_Label == HARD_START].tolist()
    hard_end_rows = merged_df.index[merged_df.Stimulus_Label == HARD_END].tolist()
    
    easy_rows = []
    hard_rows = []
    if len(easy_start_rows) == len(easy_end_rows):
        easy_rows = list(zip(easy_start_rows, easy_end_rows))
        for rows in easy_rows:
            if rows[0] > rows[1]:
                easy_rows = []
                print("Easy mismatch")
                break
    if len(hard_start_rows) == len(hard_end_rows):
        hard_rows = list(zip(hard_start_rows, hard_end_rows))
        for rows in hard_rows:
            if rows[0] > rows[1]:
                hard_rows = []
                print("Hard mismatch")
                break
    if len(easy_start_rows) != len(easy_end_rows):
        print("Easy mismatch")
    if len(hard_start_rows) != len(hard_end_rows):
        print("Hard mismatch")
    if len(easy_start_rows) != len(easy_end_rows) and len(hard_start_rows) != len(hard_end_rows):
        raise ValueError
 
    return (easy_rows, hard_rows)

In [88]:
"""Return subset of df determined by the indices of the row blocks
"""
def get_subsets(merged_df, row_blocks, difficulty):
    tables = []
    column_names = ["Matlab_now", "A-DC1", "A-DC2", "A-DC3", "A-DC4", "A-DC5",
                    "A-DC6", "A-DC7", "A-DC8", "B-DC1", "B-DC2", "B-DC3", 
                    "B-DC4", "B-DC5", "B-DC6", "B-DC7", "B-DC8"]
    column_indices = [merged_df.columns.get_loc(c) for c in column_names]
    for row_block in row_blocks:
        df = merged_df.iloc[row_block[0]:row_block[1], column_indices]
        start_time = df.iloc[0]["Matlab_now"]
        df["Matlab_now"] = df["Matlab_now"] - start_time
        df["Difficulty"] = difficulty

        tables.append(df.iloc[:INPUT_SIZE])
    return tables

In [89]:
"""Extract features from given dataset
    :param data_path: Directory containing the files
    
    :return: gets all the easy and hard features from a given dataset
"""
def get_data(file_name):
    fnirs_path = os.path.join("data/clean_data/") + file_name + "_fNIRSdata.txt"
    marker_path = os.path.join("data/clean_data/") + file_name + "_markers.txt"
    merged_df = read_data(fnirs_path, marker_path)
    easy_rows, hard_rows = get_row_blocks(merged_df)

    easy_tables = get_subsets(merged_df, easy_rows, EASY_DIFFICULTY)
    hard_tables = get_subsets(merged_df, hard_rows, HARD_DIFFICULTY)

    return easy_tables + hard_tables

In [97]:
file_names = ["S703", "S901", "S902", "S903", "S904", "S905",
              "S906", "S907", "S908", "S909", "S910", "S911",
              "S912", "S913", "S1101", "S1102", "S1103", "S1104",
              "S1105", "S1106", "S1107", "S1108", "S1110",
              "S1111", "S1111_2", "S1112", "S1113", "S1114", "S1114_2",
              "S1115", "S1116", "S1117", "S1118", "S1119", "S1120"]
# bad: "S1106", "S1107"
train = []
test = []
for fname in file_names[:-6]:
    try:
        train = train + get_data(fname)
    except ValueError:
        print("File error %s" % fname)
for fname in file_names[-6:]:
    try:
        test = test + get_data(fname)
    except ValueError:
        print("File error %s" % fname)

Easy mismatch
Easy mismatch
Hard mismatch
File error S1106
Easy mismatch
Easy mismatch
Hard mismatch
Easy mismatch
Hard mismatch
Hard mismatch
Easy mismatch
Easy mismatch
Hard mismatch
File error S1112
Easy mismatch
Hard mismatch
File error S1113
Hard mismatch
Hard mismatch
Easy mismatch
Hard mismatch
Easy mismatch
Easy mismatch
Hard mismatch
File error S1116
Hard mismatch
Easy mismatch
Hard mismatch
Easy mismatch
Hard mismatch
Hard mismatch


In [98]:
print(len(train), len(test))

450 25


In [99]:
columns = ["A-DC1", "A-DC2", "A-DC3", "A-DC4", "A-DC5",
           "A-DC6", "A-DC7", "A-DC8", "B-DC1", "B-DC2", "B-DC3", 
           "B-DC4", "B-DC5", "B-DC6", "B-DC7", "B-DC8", "Difficulty"]

In [113]:
train_x = [a.iloc[:,1:-1] for a in train]
train_y = [a.iloc[0,-1] for a in train]
test_x = [a.iloc[:,1:-1] for a in test]
test_y = [a.iloc[0,-1] for a in test]

torch.tensor(targets_df['targets'].values)

In [12]:
torch.tensor(train_x[0].values).shape

torch.Size([352, 16])

In [13]:
torch.tensor(train_y[0])

tensor(0)

In [14]:
train_x[0].shape

(352, 16)

In [101]:
params = {'shuffle': True,
          'num_workers': 6}

In [108]:
class fnirs(Dataset):
    #Characterizes a dataset for PyTorch
    def __init__(self, data, labels):
        #Initialization
        self.data = data
        self.labels = labels

    def __len__(self):
        #Denotes the total number of samples
        return len(self.data)

    def __getitem__(self, index):
        #Generates one sample of data
        x = torch.tensor(self.data[index].values, dtype=torch.float32).to(device)
        y = torch.tensor([self.labels[index]], dtype=torch.long).to(device)
        return x, y            
    
    def _normalize(self, df):
        normalized_df=(df-df.mean())/df.std()
        return df
        

In [109]:
class BiRNN(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiRNN, self).__init__()
        self.is_training = False
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Dropout(p=0.5, inplace=False)
        self.linear = nn.Linear(self.hidden_size*2, self.num_classes)

        if cuda_enabled:
            self.lstm = self.lstm.cuda()
            self.fc = self.fc.cuda()
            self.linear = self.linear.cuda()

    def forward(self, x): 
        # Set initial states
        h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).to(device) # 2 for bidirection
        c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).to(device)
        if cuda_enabled:
            h0 = h0.cuda()  # 2 for bidirection
            c0 = c0.cuda()

        # Forward propagate RNN
        out, _ = self.lstm(x, (h0, c0))
        #out, _ = self.lstm(x, h0)
        
        # Decode hidden state of last time step
        if self.is_training:
            out = self.fc(out[:, -1, :]) 
        else:
            out = out[:, -1, :]

        out = F.log_softmax(self.linear(out), dim=1)
        return out 

In [115]:
train_data = fnirs(train_x, train_y)
test_data = fnirs(test_x, test_y)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)
for i, (x, y) in enumerate(train_loader):
    print("%s, %s" % (x.dtype, x))
    print("%s, %s" % (y.dtype, y))
    break
    

torch.float32, tensor([[[922.2000, 212.2000,  47.3100,  ...,  44.0000,   9.1740,   3.3400],
         [888.2000, 208.0000,  46.1700,  ...,  43.3900,   8.9140,   3.3320],
         [892.8000, 204.1000,  45.7100,  ...,  43.6000,   8.8830,   3.3110],
         ...,
         [925.9000, 215.8000,  47.7100,  ...,  46.2700,   9.6090,   3.5230],
         [924.8000, 217.1000,  48.2000,  ...,  45.8900,   9.4670,   3.5100],
         [930.3000, 218.2000,  48.9300,  ...,  46.2100,   9.7450,   3.5530]]],
       device='cuda:0')
torch.int64, tensor([[1]], device='cuda:0')


In [143]:
batch_size = 2
hidden_size = 8
num_layers = 5
num_classes = 2
learning_rate = 0.001
num_epochs = 20
sequence_length = 16 # column size. get rid of time unless time difference is consistent
input_size = 352 # longest length of ti
rnn = BiRNN(input_size, hidden_size, num_layers, num_classes)
rnn.is_training = True

train_data = fnirs(train_x, train_y)
test_data = fnirs(test_x, test_y)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

epoch_loss = 1000000.

# Train it
for epoch in range(num_epochs):
    loss_total = 0.
    iteration_count = 1.
    print("Epoch %d" % epoch)
    for i, (data, label) in enumerate(train_loader):
        data = Variable(data.view(-1, sequence_length, input_size))

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = rnn(data)

        loss = criterion(outputs, label.squeeze_())
        loss_total += loss.data
        loss.backward()
        optimizer.step()

    current_epoch_loss = loss_total / iteration_count
    print("Current epoch loss: %.4f" % current_epoch_loss)
    print("Epoch loss: %.4f" % epoch_loss)
    # Optimise training epochs: only continue training while the loss drops
    if epoch > 5 and current_epoch_loss >= epoch_loss:
        break
    epoch_loss = current_epoch_loss
print("DONE")
        

Epoch 0
Current epoch loss: 156.0633
Epoch loss: 1000000.0000
Epoch 1
Current epoch loss: 155.5919
Epoch loss: 156.0633
Epoch 2
Current epoch loss: 156.3251
Epoch loss: 155.5919
Epoch 3
Current epoch loss: 155.2809
Epoch loss: 156.3251
Epoch 4
Current epoch loss: 156.7828
Epoch loss: 155.2809
Epoch 5
Current epoch loss: 156.4348
Epoch loss: 156.7828
Epoch 6
Current epoch loss: 156.0774
Epoch loss: 156.4348
Epoch 7
Current epoch loss: 155.6724
Epoch loss: 156.0774
Epoch 8
Current epoch loss: 156.6689
Epoch loss: 155.6724
DONE


In [138]:
#from sklearn.metrics import accuracy_score, confusion_matrix
# Test the Model
rnn.is_training = False
correct = 0.0
total = 0.0
predicted_list = []
label_list = []

print('Testing -----------------------------------------------')
correct = 0.0
total = 0.0
predicted_list = []
label_list = []
for data, label in test_loader:
    data = Variable(data.view(-1, sequence_length, input_size))
    outputs = rnn(data)

    _, predicted = torch.max(outputs.data, 1)
    total += label.size(0)
    for p, l in zip(predicted, label):
        predicted_list.append(p)
        label_list.append(l)
        if p == l:
            correct += 1.0

        
#print(train.get_encoder().classes_)
#print(confusion_matrix(label_list, predicted_list))
#print('=============================================')
#print('Accuracy = %0.4f' % (accuracy_score(label_list, predicted_list)))
#print('=============================================')
print(correct)
print(total)
    


Testing -----------------------------------------------
12.0
25.0
