In [None]:
import torch
import os
import random
import time
import functools
import gc
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from scipy import stats

from sklearn.metrics import accuracy_score, average_precision_score

# Notebook-wide constants shared by the cells below.
# Root folder of the competition data. Paths are built from it by plain string
# concatenation, so the trailing slash is required.
MAIN_DIR = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/"
# Names of the three event columns, in the order the rest of the notebook uses them.
TARGETS = ["StartHesitation", "Turn", "Walking"]
# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# **Create Data Subset**

In [None]:
def reduce_memory_usage(df):
    """
    Reduces the memory usage of the passed dataframe by downcasting its columns.
    Reference: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65

    Rules applied per column:
        * signed / unsigned integers are cast to the smallest integer type of
          the same signedness whose range contains the column (bounds are
          inclusive, so a column whose maximum is exactly 127 becomes int8).
        * floats are cast to float16 if their value range fits, otherwise to
          float32 if it fits, otherwise they are left unchanged. Only the
          range is checked; the float16 cast is lossy in precision.
        * object / string columns are converted to category.
        * every other dtype (bool, datetime, category, ...) is left untouched.

    The dataframe is modified in place and also returned.

    Args:
        df (pandas.core.frame.DataFrame): The pandas dataframe to be reduced in memory.
    Returns:
        pandas.core.frame.DataFrame: The pandas dataframe with memory usage reduced.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    print('\nStart reducing dataframe memory usage.')
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types from smallest to largest; the first one that fits wins.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32)

    for col in tqdm(df.columns):
        dtype = df[col].dtype

        # Text columns are stored as categories.
        if dtype.name == 'object' or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy numeric dtypes are downcast. Booleans, datetimes,
        # categoricals and pandas extension dtypes keep their dtype.
        if not isinstance(dtype, np.dtype):
            continue
        if np.issubdtype(dtype, np.signedinteger):
            candidates, limits_of = signed_types, np.iinfo
        elif np.issubdtype(dtype, np.unsignedinteger):
            candidates, limits_of = unsigned_types, np.iinfo
        elif np.issubdtype(dtype, np.floating):
            candidates, limits_of = float_types, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # For an empty or all-NaN column min/max are NaN, every comparison is
        # False and the column is left unchanged.
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    mem_usg = df.memory_usage().sum() / 1024**2

    print("Memory usage became: ",mem_usg," MB")

    return df

In [None]:
def encode_target(data, targets_list):
    """
    Encodes the event columns named in targets_list into one integer label per row.

    A row gets the label of the first column in targets_list whose value is 1,
    or the label of 'Normal' when none of them is 1. Labels are the positions
    of the class names in the alphabetically sorted list of
    ['Normal'] + targets_list. The numbering is derived from targets_list and
    not from the labels that happen to occur in data, so it stays the same
    when a class is absent from data.

    Args:
        data (pandas.core.frame.DataFrame): Data containing one column per
            entry of targets_list.
        targets_list (list): List containing the target column names to be encoded.
    Returns:
        numpy.ndarray: The label encoded array (one integer per row of data).
    """
    targets_list = list(targets_list)

    # Without any target column every row is 'Normal', which is the only class.
    if not targets_list:
        return np.zeros(len(data), dtype=np.intp)

    # Sorted unique class names; the position in this array is the label.
    class_names = np.unique(['Normal'] + targets_list)

    conditions = [(data[target] == 1) for target in targets_list]
    event = np.select(conditions, targets_list, default='Normal')

    # Every value of event is one of class_names, so searchsorted returns the
    # exact position of the name.
    return np.searchsorted(class_names, event)

In [None]:
def read_data(dataset, datatype, as_dataframe):
    """
    Reads every recording file of one dataset, joins it with the dataset's
    metadata and returns the result as one table.

    The columns for training data are, in order: Id, Time, AccV, AccML, AccAP,
    StartHesitation, Turn, Walking, Valid, Task, Target. For test data they
    are: Id, Time, AccV, AccML, AccAP.

    Args:
        dataset (str): Name of the dataset; used for both the metadata file
            name (<dataset>_metadata.csv) and the data sub-folder
            (e.g. 'defog' or 'tdcsfog').
        datatype (str): Sub-folder of MAIN_DIR to read from ('train' or 'test').
            Targets are only encoded for 'train'.
        as_dataframe (bool): If True the pandas dataframe is returned,
            otherwise it is converted to a numpy array first.
    Returns:
        pandas.core.frame.DataFrame or numpy.ndarray: The dataset as per the
            parameters.
    Raises:
        FileNotFoundError: If the data folder contains no files.
    """
    metadata = pd.read_csv(MAIN_DIR + dataset + "_metadata.csv")
    data_root = MAIN_DIR + datatype + "/" + dataset

    # Collect one dataframe per file and concatenate once at the end;
    # concatenating inside the loop copies all previous rows on every file.
    recordings = []

    print("\nStart reading files.")
    for root, dirs, files in os.walk(data_root):
        for name in tqdm(files):
            recording = pd.read_csv(os.path.join(root, name))
            # The file name without extension is the key into the metadata.
            recording['File'] = name.replace(".csv", "")
            recordings.append(recording)

    if not recordings:
        raise FileNotFoundError(f"No data files found under '{data_root}'.")

    df_res = pd.concat(recordings)

    # Merge the metadata and accdata together in one dataframe.
    df_res = metadata.merge(df_res, how='inner', left_on='Id', right_on='File')
    df_res = df_res.drop(['File','Subject','Visit','Medication'], axis=1)

    # Only if we are creating a training set we encode the features for usage.
    if datatype == 'train':
        if dataset == 'tdcsfog':
            # These columns are not set for tdcsfog before this point;
            # every row is marked as valid and as part of a task.
            df_res['Valid'] = 1
            df_res['Task'] = 1
        df_res['Target'] = encode_target(df_res, TARGETS)

    # Dropped last so the column order matches the one documented above.
    if dataset == 'tdcsfog':
        df_res = df_res.drop(['Test'], axis=1)

    df_res = reduce_memory_usage(df_res)

    if as_dataframe:
        return df_res

    return df_res.to_numpy()

In [None]:
    def check_ids(data, index, window):
        """
        Checks if the window around index only contains the same ids.
        This function was created because numpy is faster for large 
        datasets than Pandas alternatives.

        The rows inspected are data[index - window : index + window]; the id
        is read from column 0. A window that would start before the first row
        is rejected. A window that runs past the last row is truncated by the
        slice and judged on the rows that exist.
        
        Args:
            data (numpy.ndarray): 2-D array whose first column holds the file id.
            index (int): The index to check the window for.
            window (int): The number of rows taken on each side of index.
        Returns:
            bool: True if all values in file_ids are the same, False otherwise.
        """
        start = index - window

        # A negative start would be interpreted by numpy as counting from the
        # end of the array and would compare unrelated rows.
        if start < 0:
            return False

        # Put all file IDs of the current window into an array.
        file_ids = data[start : index + window, 0]
        
        # Return False if no correct window can be made for file_ids.
        if len(file_ids) == 0:
            return False
        
        return bool((file_ids[0] == file_ids).all())

In [None]:
def get_windows(data, window_past, window_future, threshold):
    """
    Cuts the training data into fixed-size windows around every usable row.

    For row idx the window is data[idx - window_past : idx + window_future].
    A row is skipped when:
        * the window would start before the first or end after the last row,
        * the Time column (column 1) advances by more than
          window_past + window_future - 1 over the window,
        * the window contains more than one id (column 0),
        * a threshold is given and the most frequent label in the window
          (last column) covers less than threshold of the window.

    Args:
        data (numpy.ndarray): 2-D array with the columns Id, Time, AccV,
            AccML, AccAP, StartHesitation, Turn, Walking, Valid, Task, Target.
        window_past (int): Number of rows before idx in a window.
        window_future (int): Number of rows from idx onwards in a window.
        threshold (float or None): Minimum share of the majority label. With
            None the targets are copied from columns 5:8 of row idx instead.
    Returns:
        numpy.ndarray: Features (columns 1:5 of every window), z-scored
            across windows, shape (n, window_past + window_future, 4).
        numpy.ndarray: Targets, shape (n, 3).
        numpy.ndarray: Product of the third-last and second-last column of
            row idx (Valid * Task in the layout above), shape (n,).
    """
    window_size = window_past + window_future
    # Label -> [StartHesitation, Turn, Walking] for thresholded targets.
    one_hot = {0: [0, 0, 0], 1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 1]}

    feature_windows = []
    target_windows = []
    t_windows = []

    print("\nStart creating windows.")
    for idx, _ in enumerate(tqdm(data)):
        start = idx - window_past
        stop = idx + window_future
        if start < 0 or stop > len(data):
            continue

        window = data[start:stop]

        # The summed differences equal last Time minus first Time.
        if np.sum(np.ediff1d(window[:, 1])) > (window_size - 1):
            continue

        # The ids are checked on exactly the rows that become the features.
        window_ids = window[:, 0]
        if len(window_ids) == 0 or not (window_ids == window_ids[0]).all():
            continue

        # If we dont want thresholding, we can set threshold to None
        if threshold is None:
            targets = data[idx, 5:8]
        else:
            candidates = pd.Series(window[:, -1]).value_counts(normalize=True)

            # Check if the largest candidate is above the threshold.
            if candidates.to_numpy()[0] < threshold:
                continue

            label = candidates.index[0]
            # Label 4 keeps its historical meaning "use the raw target columns".
            targets = data[idx, 5:8] if label == 4 else one_hot.get(label)
            if targets is None:
                continue

        feature_windows.append(window[:, 1:5])
        target_windows.append(targets)
        t_windows.append(data[idx, -3] * data[idx, -2])

    # Keep the documented shapes when no window qualified, so that results of
    # several calls can still be concatenated.
    if not feature_windows:
        return np.empty((0, window_size, 4)), np.empty((0, 3)), np.empty((0,))

    return stats.zscore(np.array(feature_windows, dtype='float')), np.array(target_windows, dtype='float'), np.array(t_windows, dtype='float')

In [None]:
def get_test_windows(data, window_past, window_future):
    """
    Builds exactly one feature window per row of the test data.

    For row idx the window covers the rows idx - window_past up to (but not
    including) idx + window_future. Positions that fall before the first or
    after the last row of data are filled with zeros, so every window has
    window_past + window_future rows.

    NOTE(review): windows are cut over the whole array, so near the start and
    end of a recording they contain rows of the neighbouring recording.

    Args:
        data (numpy.ndarray): 2-D array with the columns Id, Time, AccV,
            AccML, AccAP.
        window_past (int): Number of rows before idx in a window.
        window_future (int): Number of rows from idx onwards in a window.
    Returns:
        numpy.ndarray: Features (columns 1:5 of every window), z-scored
            across windows, shape (len(data), window_past + window_future, 4).
        list: The id (column 0) of the row each window was built for.
    """
    window_size = window_past + window_future
    n_rows = len(data)

    print("\nStart creating test windows.")
    if n_rows == 0:
        return np.empty((0, window_size, 4)), []

    # Pad once on both sides; row idx of data then sits at position
    # idx + window_past, and its window is padded[idx : idx + window_size].
    padded = np.concatenate(
        [
            np.zeros((window_past, 4)),
            np.asarray(data[:, 1:5], dtype='float'),
            np.zeros((window_future, 4)),
        ],
        axis=0,
    )

    feature_windows = []
    _ids = []
    for idx in tqdm(range(n_rows)):
        feature_windows.append(padded[idx : idx + window_size])
        _ids.append(data[idx, 0])

    return stats.zscore(np.array(feature_windows, dtype='float')), _ids

In [None]:
def sample_data(dataset, datatype, random_state=None):
    """
    Reads a dataset and balances it: all rows with an event (Target 1, 2 or 3)
    are kept, and as many rows without an event (Target 0) as there are event
    rows are drawn at random. If there are fewer rows without an event than
    event rows, all of them are used.

    The result is ordered as: Target 1 rows, Target 2 rows, Target 3 rows,
    sampled Target 0 rows. The rows are therefore not in their original order.

    Args:
        dataset (str): Name of the dataset, passed on to read_data.
        datatype (str): Type of the data, passed on to read_data. The data
            must contain a Target column.
        random_state (int, optional): Seed for drawing the Target 0 rows.
            None (the default) uses the global random state.
    Returns:
        numpy.ndarray: The balanced data.
    """
    dataframe = read_data(dataset, datatype, True)
    target = dataframe["Target"]

    event_parts = [dataframe.loc[target == label, :] for label in (1, 2, 3)]
    zeros = dataframe.loc[target == 0, :]

    # Never ask for more rows than exist; sampling without replacement would
    # raise a ValueError otherwise.
    n_events = sum(len(part) for part in event_parts)
    zeros = zeros.sample(min(n_events, len(zeros)), random_state=random_state)

    sampled_df = pd.concat([*event_parts, zeros], ignore_index=False)

    return sampled_df.to_numpy()

# **Dataset Class**

In [None]:
class FOGDataset(Dataset):
    """
    Windowed accelerometer dataset built from the defog and tdcsfog data.

    With datatype "train" an item is (features, targets, t). With any other
    datatype an item is (_id, features), where _id is "<Id>_<Time>" of the
    row the window was built for.
    NOTE(review): "<Id>_<Time>" is assumed to be the key format of
    sample_submission.csv, with which the predictions are merged - confirm.
    """

    def __init__(self, datatype, threshold, window_future, window_past):
        """
        Args:
            datatype (str): "train" builds the training windows; any other
                value builds the (padded) test windows.
            threshold (float or None): Passed on to get_windows (train only).
            window_future (int): Number of rows from the current row onwards.
            window_past (int): Number of rows before the current row.
        """
        self.datatype = datatype
        
        # We only set the targets variable if we are making a training dataset.
        # The train features in order are: Id, Time, AccV, AccML, AccAP, 
        # StartHesitation, Turn, Walking, Valid, Task, Target.
        # The test features in order are: Id, Time, AccV, AccML, AccAP
        if datatype == "train":
            defog_data = sample_data('defog', datatype)
            defog_features, defog_targets, defog_t = get_windows(defog_data, window_past, window_future, threshold)
            
            tdcsfog_data = sample_data('tdcsfog', datatype)
            tdcsfog_features, tdcsfog_targets, tdcsfog_t = get_windows(tdcsfog_data, window_past, window_future, threshold)
            
            self.features = np.concatenate((defog_features ,tdcsfog_features))
            self.targets = np.concatenate((defog_targets ,tdcsfog_targets))
            self.t = np.concatenate((defog_t ,tdcsfog_t))
            self.length = len(self.features)
            
        # If datatype is test we want to pad.         
        else:
            defog_features, defog_ids = self._load_test_part('defog', window_past, window_future)
            tdcsfog_features, tdcsfog_ids = self._load_test_part('tdcsfog', window_past, window_future)
            
            self.features = np.concatenate((defog_features, tdcsfog_features))
            self.ids = np.concatenate((defog_ids, tdcsfog_ids))
            self.length = len(self.features)

    @staticmethod
    def _row_ids(data):
        """
        Builds one "<Id>_<Time>" string per row from columns 0 and 1 of data.
        """
        return np.array([f"{row[0]}_{int(row[1])}" for row in data], dtype=object)

    def _load_test_part(self, dataset, window_past, window_future):
        """
        Reads the test data of one dataset and returns (features, ids), with
        one window and one "<Id>_<Time>" id per row of the data.
        """
        data = read_data(dataset, 'test', False)
        features, _ = get_test_windows(data, window_past, window_future)

        # The ids are derived from the rows, so there has to be exactly one
        # window per row for ids and features to line up.
        if len(features) != len(data):
            raise ValueError(
                f"Expected one test window per row for '{dataset}', got "
                f"{len(features)} windows for {len(data)} rows."
            )

        return features, self._row_ids(data)
            
    def __len__(self):
        """
        Returns:
            int: The length of the data object of the dataset class.
        """
        return self.length

    def __getitem__(self, idx):
        """
        Returns the sample at position idx (the dataloader stacks samples
        into batches).

        Returns:
            For datatype "train":
                torch.Tensor: The feature window.
                torch.Tensor: The three targets of the window.
                torch.Tensor: The weight (t) of the window.
            Otherwise:
                str: The "<Id>_<Time>" id of the window.
                torch.Tensor: The feature window.
        """
        # Check if the datatype passed is train, we only create targets if the datatype is train.
        if self.datatype == "train":
            features = torch.tensor(self.features[idx].astype('float'))
            targets = torch.tensor(self.targets[idx].astype('float'))
            t = torch.tensor(self.t[idx].astype('float'))
            
            return features, targets, t
        else:
            # The id identifies the row by its Time value, not by the
            # position of the window inside this dataset.
            _id = str(self.ids[idx])
        # Commented out line goes in pair with the reshape in forward() function to select only n = 8th row of window.
#             features = self.features[::8, :][::-1, :]
            features = torch.tensor(self.features[idx].astype('float'))
    
            return _id, features

# **Model**

In [None]:
def _block(in_features, out_features, drop_rate):
    return nn.Sequential(
        nn.Linear(in_features, out_features),
        nn.BatchNorm1d(out_features),
        nn.ReLU(),
        nn.Dropout(drop_rate)
    )

class FOGModel(nn.Module):
    """
    Fully connected network on flattened windows: an input layer, nblocks
    hidden blocks (see _block) and an output layer with 3 values per window.
    """

    def __init__(self, p, dim, nblocks, window_size):
        """
        Args:
            p (float): Dropout probability used inside the hidden blocks.
            dim (int): Width of the hidden layers.
            nblocks (int): Number of hidden blocks.
            window_size (int): Number of rows per window; every row has 4 features.
        """
        super().__init__()
        self.window_size = window_size
        # Kept as an attribute for compatibility; forward() does not apply it
        # (dropout is applied inside the hidden blocks).
        self.dropout = nn.Dropout(p)
        # Commented out line goes in pair with the slicing in the __getitem__() function to select only n = 8th row of window.
#         self.in_layer = nn.Linear(int(window_size / 8) * 3, dim)
        self.in_layer = nn.Linear(window_size * 4, dim)
        self.blocks = nn.Sequential(*[_block(dim, dim, p) for _ in range(nblocks)])
        # One output (logit) per target column.
        self.out_layer = nn.Linear(dim, 3)
        
    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Windows with window_size * 4 values each, in
                any shape that can be flattened to (-1, window_size * 4).
        Returns:
            torch.Tensor: Raw outputs of shape (batch, 3).
        """
        # Commented out line goes in pair with the slicing in the __getitem__() function to select only n = 8th row of window.
#         x = x.view(-1, int(self.window_size / 8) * 3)
        # reshape also accepts non-contiguous input (e.g. sliced or
        # transposed tensors), for which view raises an error.
        x = x.reshape(-1, self.window_size * 4)
        x = self.in_layer(x)
        x = self.blocks(x)
        return self.out_layer(x)

In [None]:
def count_parameters(model):
    """
    Counts the trainable values of a model.

    Args:
        model (torch.nn.Module): The model to inspect.
    Returns:
        int: Total number of elements over all parameters that require a gradient.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
from torch.cuda.amp import GradScaler

def train_one_epoch(model, loader, optimizer, criterion):
    """
    Trains the model for one pass over the loader.

    The loss of every sample is the mean of the element-wise criterion over
    the outputs, multiplied by the sample's weight t. The batch loss is the
    sum of these values divided by the sum of the weights (0 when all weights
    of the batch are 0).

    Args:
        model (torch.nn.Module): The model to train.
        loader (iterable): Yields (x, y, t) batches; must support len().
        optimizer (torch.optim.Optimizer): Optimizer of the model parameters.
        criterion (callable): Element-wise loss (reduction 'none').
    Returns:
        float: The mean batch loss, or nan when the loader has no batches.
    """
    loss_sum = 0.
    scaler = GradScaler()
    
    model.train()
    for x,y,t in tqdm(loader):
        x = x.to(DEVICE).float()
        y = y.to(DEVICE).float()
        t = t.to(DEVICE).float()

        # Clear the gradients before the backward pass, so gradients left
        # behind by earlier code (for example an interrupted run of this
        # cell) cannot leak into the first update.
        optimizer.zero_grad()
        
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss = torch.mean(loss*t.unsqueeze(-1), dim=1)
        
        t_sum = torch.sum(t)
        if t_sum > 0:
            loss = torch.sum(loss)/t_sum
        else:
            # Keeps the loss attached to the graph with a value of 0.
            loss = torch.sum(loss)*0.
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        loss_sum += loss.item()

    n_batches = len(loader)
    mean_loss = loss_sum/n_batches if n_batches else float('nan')
    
    print(f"Train Loss: {mean_loss:.04f}")
    return mean_loss

def validation_one_epoch(model, loader, criterion):
    """
    Evaluates the model on one pass over the loader without updating it.

    The batch loss is computed as in training: the per-sample mean of the
    element-wise criterion is multiplied by the weight t, summed and divided
    by the sum of the weights (0 when all weights of the batch are 0). The
    score is the mean of the average precision of the three outputs, computed
    on the samples whose weight is larger than 0.

    Args:
        model (torch.nn.Module): The model to evaluate.
        loader (iterable): Yields (x, y, t) batches; must support len().
        criterion (callable): Element-wise loss (reduction 'none').
    Returns:
        float: The mean batch loss.
        float: The mean average precision over the three outputs.
    """
    running_loss = 0.
    truth_batches, pred_batches, weight_batches = [], [], []

    model.eval()
    for x, y, t in tqdm(loader):
        x = x.to(DEVICE).float()
        y = y.to(DEVICE).float()
        t = t.to(DEVICE).float()

        with torch.no_grad():
            y_pred = model(x)
            per_sample = torch.mean(criterion(y_pred, y) * t.unsqueeze(-1), dim=1)

            t_sum = torch.sum(t)
            if t_sum > 0:
                loss = torch.sum(per_sample) / t_sum
            else:
                loss = torch.sum(per_sample) * 0.

        running_loss += loss.item()
        truth_batches.append(y.cpu().numpy())
        pred_batches.append(y_pred.cpu().numpy())
        weight_batches.append(t.cpu().numpy())

    y_true_epoch = np.concatenate(truth_batches, axis=0)
    y_pred_epoch = np.concatenate(pred_batches, axis=0)

    # Only samples with a positive weight take part in the score.
    keep = np.concatenate(weight_batches, axis=0) > 0
    y_true_epoch = y_true_epoch[keep, :]
    y_pred_epoch = y_pred_epoch[keep, :]

    scores = []
    for i in range(3):
        scores.append(average_precision_score(y_true_epoch[:, i], y_pred_epoch[:, i]))
    mean_score = np.mean(scores)
    mean_loss = running_loss / len(loader)
    print(f"Validation Loss: {mean_loss:.04f}, Validation Score: {mean_score:.03f}, ClassWise: {scores[0]:.03f},{scores[1]:.03f},{scores[2]:.03f}")

    return mean_loss, mean_score

# **Training**

**Initialize the data set for the model**

In [None]:
# Seed every random number generator this notebook draws from (sampling of
# the rows without an event, train/validation split, weight initialisation,
# shuffling), so that a fresh "Restart & Run All" starts from the same state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Data settings. The total window size is window_past + window_future = 256 rows.
datatype = "train"
threshold = None
window_future = 128
window_past = 128

# Model settings.
num_epochs = 15
model_dropout = 0.2
model_hidden = 512
model_nblocks = 3

# Optimisation settings.
batch_size = 1024
lr = 0.0015

print("Initialising data:")
dataset_train = FOGDataset(
    datatype = datatype,
    threshold = threshold,
    window_future = window_future,
    window_past = window_past
)

**Train and validate the model**

In [None]:
# Build the model for windows of window_past + window_future rows and move it
# to the selected device.
model = FOGModel(model_dropout, model_hidden, model_nblocks, window_past + window_future).to(DEVICE)
# print(model)
print(f"\nNumber of parameters in model - {count_parameters(model):,}")

# Random 80/20 split of the windows into a training and a validation part.
# NOTE(review): the split is made per window, so windows of the same
# recording can end up in both parts - confirm this is intended.
train_size = int(0.8 * len(dataset_train))
test_size = len(dataset_train) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset_train, [train_size, test_size])

print(f"\nLength of datasets: train - {len(train_dataset)}, valid - {len(valid_dataset)}")

train_loader = DataLoader(train_dataset, batch_size = batch_size, num_workers=2, shuffle = True)
valid_loader = DataLoader(valid_dataset, batch_size = batch_size, num_workers=2, shuffle = True)
# Only shuffle on train and validation, do not shuffle on test.

optimizer = torch.optim.Adam(model.parameters(), lr = lr)
# reduction = 'none' keeps one loss value per output, which the epoch
# functions weight per sample before averaging.
criterion = torch.nn.BCEWithLogitsLoss(reduction = 'none').to(DEVICE)
# sched = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.85)

print("="*50)
print("Start training:\n")

# Best validation score seen so far; a checkpoint is written whenever an
# epoch beats it.
max_score = 0.0

print("="*50)
for epoch in range(num_epochs):
    print(f"Epoch: {epoch}")
    train_loss  = train_one_epoch(model, train_loader, optimizer, criterion)
    valid_loss, valid_score = validation_one_epoch(model, valid_loader, criterion)
    # sched.step()

    # Only the weights (state dict) of the best epoch are stored; the file is
    # overwritten on every improvement.
    if valid_score > max_score:
        max_score = valid_score
        torch.save(model.state_dict(), "best_model_state.h5")
        print("Saving Model ...")

    print("="*50)

# **Testing**

In [None]:
# ONLY USED WHEN DOING THE LEADERBOARD SUBMISSION
datatype = "test"
threshold = None
window_future = 128
window_past = 128

dataset_test = FOGDataset(
    datatype = datatype,
    threshold = threshold,
    window_future = window_future,
    window_past = window_past
)

# Shuffle should be False for test set.
test_loader = DataLoader(dataset_test, batch_size = batch_size, num_workers=2, shuffle = False)

# Predict with the weights of the best validation epoch when the training
# loop saved a checkpoint; otherwise the model is used as it is.
best_model_path = "best_model_state.h5"
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))

model.eval()

ids = []
preds = []

for _id, x in tqdm(test_loader):
    x = x.to(DEVICE).float()
    with torch.no_grad():
        # NOTE(review): the raw outputs are scaled by 0.1; the reason is not
        # visible in this notebook - confirm it is intended.
        y_pred = model(x)*0.1
    
    ids.extend(_id)
    # NaN predictions are replaced by 0.
    preds.extend(list(np.nan_to_num(y_pred.cpu().numpy())))
    
preds = np.array(preds)
# One column per target, in the order of the model outputs.
submission = pd.DataFrame({'Id': ids, 'StartHesitation': np.round(preds[:,0],5), \
                           'Turn': np.round(preds[:,1],5), 'Walking': np.round(preds[:,2],5)})

In [None]:
# Read the expected submission rows from the same data folder as everything
# else (MAIN_DIR), keep their order, and attach the predictions by Id.
sample_submission = pd.read_csv(MAIN_DIR + "sample_submission.csv")
# Rows of the sample submission without a prediction are filled with 0.0.
submission = pd.merge(sample_submission[['Id']], submission, how='left', on='Id').fillna(0.0)
submission.to_csv("submission.csv", index=False)