<a href="https://colab.research.google.com/github/hoky1227/Parkinsons-Disease/blob/main/PreProc_Gait_in_Parkinson's_Disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [1]:
# Mount Google Drive so the datasets stored there are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted drive and the dataset directories below it.
# The dataset paths end with '/' because later cells append glob patterns and
# file names to them by plain string concatenation.
path = '/content/drive/MyDrive/medical'
DG_path = path + '/datasets/dataset_fog_release/dataset/'  # not referenced by the cells visible in this notebook
GPD_path = path + '/datasets/gait-in-parkinsons-disease-1.0.0/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import random
from glob import glob

import easydict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
# One seed shared by every random number generator the notebook imports, so a
# "Restart & Run All" reproduces the same shuffle (and any later torch sampling).
seed = 42

np.random.seed(seed)
# torch keeps its own generator; seeding numpy/random does not affect it.
torch.manual_seed(seed)
# Kept last so the cell ends on a call returning None (no stray cell output).
random.seed(seed)

In [4]:
# Collect the recording files in sorted order: all of them, the ones whose name
# matches the control pattern ('Co') and the ones matching the patient pattern ('Pt').
GPD_fl, GPD_co_fl, GPD_pt_fl = (
    sorted(glob(GPD_path + pattern))
    for pattern in ('*_*.txt', '*Co*_*.txt', '*Pt*_*.txt')
)

# Quick look at the first few entries of each list.
print(GPD_fl[:5])
print(GPD_co_fl[:5])
GPD_pt_fl[:5]

['/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo01_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo02_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo02_02.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo03_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo03_02.txt']
['/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo01_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo02_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo02_02.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo03_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo03_02.txt']


['/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt03_01.txt',
 '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt04_01.txt',
 '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt05_01.txt',
 '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt06_01.txt',
 '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt07_01.txt']

## Format

Each line contains 19 columns:

- Column 1: Time (in seconds)
- Columns 2-9: Vertical ground reaction force (VGRF, in newtons) on each of the 8 sensors located under the left foot
- Columns 10-17: VGRF on each of the 8 sensors located under the right foot
- Column 18: Total force under the left foot
- Column 19: Total force under the right foot

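| Sensor |    X |    Y |
|:-------|-----:|-----:|
| L1     | -500 | -800 |
| L2     | -700 | -400 |
| L3     | -300 | -400 |
| L4     | -700 |    0 |
| L5     | -300 |    0 |
| L6     | -700 |  400 |
| L7     | -300 |  400 |
| L8     | -500 |  800 |
| R1     |  500 | -800 |
| R2     |  700 | -400 |
| R3     |  300 | -400 |
| R4     |  700 |    0 |
| R5     |  300 |    0 |
| R6     |  700 |  400 |
| R7     |  300 |  400 |
| R8     |  500 |  800 |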

The X and Y numbers are in an arbitrary coordinate system reflecting
the relative (arbitrarily scaled) positions of the sensors within each
insole. During walking, the sensors inside each insole remain at the
same relative position, but the two feet are no longer parallel to
each other. Thus, this coordinate system enables a calculation of a
proxy for the location of the center of pressure (COP) under each
foot.

File names (e.g. `GaCo01_01.txt`) encode where a recording comes from. The prefix Ga, Ju or Si indicates the study from which the data originated:
    
    Ga - Galit Yogev et al (dual tasking in PD; Eur J Neuro, 2005)
    
    Ju – Hausdorff et al (RAS in PD; Eur J Neuro, 2007)
    
    Si - Silvi Frenkel-Toledo et al (Treadmill walking in PD; Mov Disorders,
          2005)

Co or Pt: Control subject or a PD Patient

01: Subject number in the group

The sampling rate was 100 Hz.

## Codes

In [5]:
# Sensor positions (arbitrary units) taken from the table in the Format section.
# Open question left by the author: COM? COP? COF?
L_x = np.array([-500, -700, -300, -700, -300, -700, -300, -500])
L_y = np.array([-800, -400, -400, 0, 0, 400, 400, 800])

# The right insole is the mirror image of the left one about the Y axis:
# X coordinates flip sign, Y coordinates are the same (copied, not aliased).
R_x = -L_x
R_y = L_y.copy()

In [6]:
# Number of control and patient recordings (one per line), then the total
# as the cell's displayed value.
print(len(GPD_co_fl), len(GPD_pt_fl), sep='\n')
len(GPD_fl)

92
214


306

In [7]:
# 60 : 20 : 20 train / validation / test split, counted in files.
# The test split takes whatever is left so the three sizes always add up.
train_len = round(0.6 * len(GPD_fl))
val_len = round(0.2 * len(GPD_fl))
test_len = len(GPD_fl) - train_len - val_len

print(train_len, val_len, test_len)

# Make the cell idempotent: start from the sorted order and a freshly seeded
# generator every time, so re-running this cell (or running it out of order)
# yields the same file order as a clean "Restart & Run All".
GPD_fl.sort()
print(GPD_fl[:5])
random.seed(seed)
random.shuffle(GPD_fl)
print(GPD_fl[:5])

# NOTE(review): the split is made per file. Names such as GaCo02_01 / GaCo02_02
# suggest several recordings per subject, so one subject can end up in more
# than one split -- confirm whether a subject-wise split is required.

184 61 61
['/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo01_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo02_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo02_02.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo03_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo03_02.txt']
['/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt21_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaPt20_02.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/JuPt15_07.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/GaCo11_01.txt', '/content/drive/MyDrive/medical/datasets/gait-in-parkinsons-disease-1.0.0/SiPt02_01.txt']


In [8]:
# Column names for the 19 tab-separated fields of a recording, in file order:
# time stamp, 8 left-foot sensors, 8 right-foot sensors, per-foot totals.
col = (
    ['Time']
    + ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8']
    + ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8']
    + ['L_tot', 'R_tot']
)

# df_co = pd.read_csv(GPD_co_fl[0], sep='\t', names=col)
# df_pt = pd.read_csv(GPD_pt_fl[0], sep='\t', names=col)

# for f in GPD_co_fl[1:]:
#     df_co_ = pd.read_csv(f, sep='\t', names=col)
#     df_co = pd.concat([df_co, df_co_], ignore_index=True)

# for f in GPD_pt_fl[1:]:
#     df_pt_ = pd.read_csv(f, sep='\t', names=col)
#     df_pt = pd.concat([df_pt, df_pt_], ignore_index=True)

# display(df_co.head())
# print(df_co.shape)
# display(df_pt.head())
# print(df_pt.shape)

# ------------------------------------------------------------------------------

# for i, f in enumerate(GPD_co_fl):
#     globals()[f'df_co_{i}'] = pd.read_csv(f, sep='\t', names=col)


# for i, f in enumerate(GPD_pt_fl):
#     globals()[f'df_pt_{i}'] = pd.read_csv(f, sep='\t', names=col)

# df_co = pd.concat([globals()[f'df_co_{i}'] for i in range(len(GPD_co_fl))], ignore_index=True)
# df_pt = pd.concat([globals()[f'df_pt_{i}'] for i in range(len(GPD_pt_fl))], ignore_index=True)

# display(df_co.head())
# print(df_co.shape)
# display(df_pt.head())
# print(df_pt.shape)

# ------------------------------------------------------------------------------

# for i, f in enumerate(GPD_fl[:train_len]):
#     if 'Co' in f:
#         globals()[f'train_co_{i}'] = pd.read_csv(f, sep='\t', names=col)
#     else:
#         globals()[f'train_pt_{i}'] = pd.read_csv(f, sep='\t', names=col)        

# for i, f in enumerate(GPD_fl[train_len:train_len + val_len]):
#     if 'Co' in f:
#         globals()[f'val_co_{i}'] = pd.read_csv(f, sep='\t', names=col)
#     else:
#         globals()[f'val_pt_{i}'] = pd.read_csv(f, sep='\t', names=col) 

# for i, f in enumerate(GPD_fl[train_len + val_len:]):
#     if 'Co' in f:
#         globals()[f'test_co_{i}'] = pd.read_csv(f, sep='\t', names=col)
#     else:
#         globals()[f'test_pt_{i}'] = pd.read_csv(f, sep='\t', names=col)

# ------------------------------------------------------------------------------

def _read_labeled_recording(file_path):
    """Read one tab-separated recording and attach its class label.

    Parameters
    ----------
    file_path : str
        Path of a recording file; its columns are named after ``col``.

    Returns
    -------
    pandas.DataFrame
        The recording with an extra integer ``label`` column:
        0 when the file name contains 'Co' (control), 1 otherwise.
    """
    recording = pd.read_csv(file_path, sep='\t', names=col)
    # Look at the file name only: testing the full path would label every file
    # as control as soon as a parent folder contains 'Co' (e.g. "Colab Notebooks").
    is_control = 'Co' in os.path.basename(file_path)
    recording['label'] = 0 if is_control else 1
    return recording


# Slice the shuffled file list into the three splits and load each file.
# The frames are still published as globals named train_<i>, val_<i>, test_<i>
# (index restarting at 0 per split) because later cells look them up that way.
_split_files = {
    'train': GPD_fl[:train_len],
    'val': GPD_fl[train_len:train_len + val_len],
    'test': GPD_fl[train_len + val_len:],
}

for split_name, split_paths in _split_files.items():
    for i, f in enumerate(split_paths):
        globals()[f'{split_name}_{i}'] = _read_labeled_recording(f)

In [9]:
# Hyper-parameters shared by the notebook. Only 'max_len' (the window length in
# samples) is used by the cells visible here; the other entries are not
# referenced in this notebook.
config = dict(
    max_len=128,
    epochs=100,
    kernel=3,
    lr=1e-3,
)

In [10]:
def preproc_window(df, window, drop_time=True):
    """Cut one recording into fixed-length, non-overlapping windows.

    Every feature column of ``df`` is split into consecutive chunks of
    ``window`` samples. The last chunk is padded with zeros so that *every*
    chunk has exactly ``window`` samples; a recording whose length is already
    a multiple of ``window`` gets no padding.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording. Must contain a ``label`` column (and a ``Time`` column
        when ``drop_time`` is True). ``df`` itself is not modified.
    window : int
        Number of samples per window; must be at least 1.
    drop_time : bool, default True
        Drop the ``Time`` column instead of windowing it like a feature.

    Returns
    -------
    pandas.DataFrame
        One row per window. Each feature column holds a list of ``window``
        values, and ``label`` repeats the label of the first row of ``df``.
        An empty ``df`` yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f'window must be a positive integer, got {window!r}')

    non_feature_cols = ['Time', 'label'] if drop_time else ['label']
    features = df.drop(columns=non_feature_cols).reset_index(drop=True)

    if len(features) == 0:
        return pd.DataFrame(columns=[*map(str, features.columns), 'label'])

    # Rows needed to reach the next multiple of `window` (0 when already aligned).
    n_pad = -len(features) % window
    # Extending the positional index appends the padding rows; fill_value=0
    # zero-fills them without changing the column dtypes.
    features = features.reindex(range(len(features) + n_pad), fill_value=0)

    assert len(features) % window == 0

    # Window number of every row, computed in one vectorised step.
    window_ids = np.arange(len(features)) // window

    df_new = pd.DataFrame({
        str(name): features[name].groupby(window_ids).apply(list)
        for name in features.columns
    }).reset_index(drop=True)

    # Positional access so the label is found even if df has a non-default index.
    df_new['label'] = df['label'].iloc[0]

    return df_new

In [11]:
def _stack_windows(prefix, n_files):
    """Window the frames stored as globals ``<prefix>_0 .. <prefix>_<n_files-1>`` and stack them."""
    windowed = []
    for k in range(n_files):
        recording = globals()[f'{prefix}_{k}']
        windowed.append(preproc_window(recording, config['max_len']))
    return pd.concat(windowed, ignore_index=True)


# One windowed frame per split, built from the per-file frames loaded earlier.
train = _stack_windows('train', train_len)
val = _stack_windows('val', val_len)
test = _stack_windows('test', test_len)

In [15]:
# (rows, columns) of the windowed train / validation / test frames, one per line.
print(train.shape, val.shape, test.shape, sep='\n')

(15521, 19)
(5219, 19)
(5296, 19)


In [16]:
# Write each split next to the raw data as <split>_<window length>.csv.
# The window length comes from the config so the file name cannot go stale
# when 'max_len' is changed (with max_len = 128 the names are unchanged).
# NOTE(review): the feature cells hold Python lists; to_csv stores them as their
# text representation, so a loader presumably has to parse them back
# (e.g. with ast.literal_eval) -- confirm against the code that reads these files.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    split_df.to_csv(f"{GPD_path}{split_name}_{config['max_len']}.csv", index=False)