<p style="font-size: 24px; font-weight: bold;">Hello there!</p>

<p style="font-size: 16px;">This notebook introduces a very simple way to create a submission file for the <b>"Parkinson's Freezing of Gait Prediction"</b> competition.</p>

<p style="font-size: 16px;">In this notebook, you will create features by combining 3-dimensional accelerometer data values with metadata about subjects in order to detect FoG (Freezing of Gait) events.</p>

<p style="font-size: 16px;">To predict FoG events (<code>'StartHesitation'</code>, <code>'Turn'</code>, <code>'Walking'</code>) at a given time point <code><b>t</b></code>, you will use the accelerometer data values and subject information at the same time point <code><b>t</b></code>.</p>

<p style="font-size: 16px;">However, since this procedure does not handle temporal information well, more inventive feature engineering will be necessary to achieve better performance.</p>

<p style="font-size: 16px;">The purpose of publishing this notebook is to demonstrate the rough procedure, up to submitting results for the competition, using code that is as simple as possible.</p>

<p style="font-size: 16px;">I hope that the release of this notebook will contribute even a little to the excitement of the competition.</p>

<p style="font-size: 16px;">Let's enjoy Kaggle together!</p>

<h1>Import Modules</h1>

In [None]:
import os
import tqdm
import glob
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [None]:
# Root directory of the competition dataset; every CSV path in this notebook is built relative to it.
pdir = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction'

# Load metadata

In [None]:
# Per-recording metadata for the tdcsfog series (used later to look up Subject and Medication by file Id).
tdcs_meta_csv = os.path.join(pdir, 'tdcsfog_metadata.csv')
df_tdcs_meta = pd.read_csv(tdcs_meta_csv)
df_tdcs_meta.head()

In [None]:
# Per-recording metadata for the defog series (used later to look up Subject and Medication by file Id).
defog_meta_csv = os.path.join(pdir, 'defog_metadata.csv')
df_defog_meta = pd.read_csv(defog_meta_csv)
df_defog_meta.head()

In [None]:
# Subject-level table; Age, Sex, YearsSinceDx and NFOGQ are read from it further down.
subjects_csv = os.path.join(pdir, 'subjects.csv')
df_subjects = pd.read_csv(subjects_csv)
df_subjects.head()

# Load tdcsfog data

In [None]:
# Collect every tdcsfog training CSV. glob returns paths in arbitrary,
# filesystem-dependent order, so they are sorted to make the subsample below
# reproducible from run to run.
tdcs_file_path = sorted(glob.glob(os.path.join(pdir, 'train', 'tdcsfog', '*.csv')))

# In this notebook, we limit the number of files to be read in order to reduce
# the time required for model training: only every 10th file is kept.
# TODO: remove this subsampling before the final run.
tdcs_file_path = tdcs_file_path[::10]

print(f'the number of files to be read: {len(tdcs_file_path)}')

# Preview the first selected recording.
pd.read_csv(tdcs_file_path[0]).head()

In [None]:
# Load every selected tdcsfog recording, attach its recording-level and
# subject-level metadata as constant columns, and stack everything into df_tdcs.
# Frames are collected in a list and concatenated once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
tdcs_frames = []

for fp in tqdm.tqdm(tdcs_file_path):

    # load one recording.
    tmp = pd.read_csv(fp)

    # get file Id from csv file name.
    file_id = os.path.basename(fp).replace(".csv", "")

    # row selector for this recording in the metadata table (computed once).
    meta_mask = df_tdcs_meta['Id'] == file_id

    # get subject Id.
    subject = df_tdcs_meta.loc[meta_mask, 'Subject'].iloc[0]

    # add metadata; the first matching row is used when several rows match.
    tmp['Medication'] = df_tdcs_meta.loc[meta_mask, 'Medication'].iloc[0]
    subject_mask = df_subjects['Subject'] == subject
    for col in ('Age', 'Sex', 'YearsSinceDx', 'NFOGQ'):
        tmp[col] = df_subjects.loc[subject_mask, col].iloc[0]

    tdcs_frames.append(tmp)

# A single concat with a fresh 0..n-1 index; fall back to an empty frame when
# no files were selected (pd.concat raises on an empty list).
df_tdcs = pd.concat(tdcs_frames, ignore_index=True) if tdcs_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined tdcsfog frame (recording columns plus the attached metadata columns).
df_tdcs.head()

# Load defog data

In [None]:
# Collect every defog training CSV, sorted because glob's ordering is arbitrary
# and filesystem-dependent.
defog_file_path = sorted(glob.glob(os.path.join(pdir, 'train', 'defog', '*.csv')))

# A stride of 1 keeps every file; raise it to subsample and shorten training time.
defog_file_path = defog_file_path[::1]

print(f'the number of files to be read: {len(defog_file_path)}')

In [None]:
# Load every defog recording, attach its recording-level and subject-level
# metadata, keep only rows where both Valid and Task are True, and stack the
# result into df_defog. Frames are collected in a list and concatenated once;
# concatenating inside the loop re-copies all previously loaded rows each time.
defog_frames = []

for fp in tqdm.tqdm(defog_file_path):
    # load one recording.
    tmp = pd.read_csv(fp)

    # get file Id from csv file name.
    file_id = os.path.basename(fp).replace(".csv", "")

    # row selector for this recording in the metadata table (computed once).
    meta_mask = df_defog_meta['Id'] == file_id

    # get subject Id.
    subject = df_defog_meta.loc[meta_mask, 'Subject'].iloc[0]

    # add metadata; the first matching row is used when several rows match.
    tmp['Medication'] = df_defog_meta.loc[meta_mask, 'Medication'].iloc[0]
    subject_mask = df_subjects['Subject'] == subject
    for col in ('Age', 'Sex', 'YearsSinceDx', 'NFOGQ'):
        tmp[col] = df_subjects.loc[subject_mask, col].iloc[0]

    # extract data from the time period where Valid and Task are both True,
    # then drop the two flag columns so the layout matches the tdcsfog frame.
    tmp = tmp[(tmp['Valid'] == True) & (tmp['Task'] == True)]
    tmp = tmp.drop(['Valid', 'Task'], axis=1)

    defog_frames.append(tmp)

# A single concat with a fresh 0..n-1 index; fall back to an empty frame when
# no files were found (pd.concat raises on an empty list).
df_defog = pd.concat(defog_frames, ignore_index=True) if defog_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined defog frame (Valid/Task already filtered and dropped).
df_defog.head()

In [None]:
# Number of defog rows flagged with each FoG event type.
# (numpy is already imported in the import cell at the top, so it is not re-imported here.)
print("walking == 1:         ", (df_defog['Walking'] == 1).sum())
print("turn == 1:            ", (df_defog['Turn'] == 1).sum())
print("StartHesitation == 1: ", (df_defog['StartHesitation'] == 1).sum())

# Prepare train data

In [None]:
# Stack the tdcsfog and defog frames into one training frame with a fresh 0..n-1 index.
df_train = pd.concat([df_tdcs, df_defog], ignore_index=True)

# Row counts per event type. N is whatever remains after subtracting the three
# event counts, i.e. it assumes no row carries more than one event flag.
SH = int((df_train['StartHesitation'] == 1).sum())
W = int((df_train['Walking'] == 1).sum())
T = int((df_train['Turn'] == 1).sum())
N = len(df_train) - (SH + W + T)

print("No events:", N)
print("StartHesitation:", SH)
print("Walking:", W)
print("Turning:", T)

In [None]:
# Complementary counts: rows where each event flag equals 0.
SH_ = int((df_train['StartHesitation'] == 0).sum())
W_ = int((df_train['Walking'] == 0).sum())
T_ = int((df_train['Turn'] == 0).sum())

print("Not StartHesitation:", SH_)
print("Not Walking:", W_)
print("Not Turning:", T_)

In [None]:
from matplotlib import pyplot as plt

# Bar chart of how many rows fall into each event class (and the no-event remainder).
event_names = ["StartHesitation", "Walking", "Turning", "None"]
event_counts = [SH, W, T, N]
plt.bar(event_names, event_counts)
plt.title("Number of occurrences per FOG event")
plt.show()

In [None]:
# Encode the two string columns as 0/1 integers: Medication 'on' -> 1, Sex 'M' -> 1,
# every other value -> 0.
# NOTE: this overwrites the columns in place, so re-running the cell after it has
# already run maps every row to 0 (the strings 'on'/'M' are no longer present).
df_train['Medication'] = (df_train['Medication'] == 'on').astype(int)
df_train['Sex'] = (df_train['Sex'] == 'M').astype(int)
df_train.head()

# Split train data for 3 classes

In [None]:
# Split the training rows by event type.
df_train_turn = df_train[df_train['Turn'] == 1]
df_train_walking = df_train[df_train['Walking'] == 1]
df_train_start = df_train[df_train['StartHesitation'] == 1]

# "No event" rows are those where all three flags are 0. The three conditions
# must be AND-ed: np.where(a, b, c) is an element-wise select (b where a, else c),
# which would let Turn rows and StartHesitation rows leak into this frame.
no_event_mask = (
    (df_train['Turn'] == 0)
    & (df_train['Walking'] == 0)
    & (df_train['StartHesitation'] == 0)
)
df_train_none = df_train[no_event_mask]

# Integer-coded class labels used for resampling: Turn -> 0, no event -> 1.
df_train_turn_y = pd.DataFrame(np.zeros(len(df_train_turn)))
df_train_none_y = pd.DataFrame(np.ones(len(df_train_none)))


In [None]:
# Integer-coded class labels used for resampling: Walking -> 2, StartHesitation -> 3.
n_walking_rows = len(df_train_walking)
n_start_rows = len(df_train_start)
df_train_walk_y = pd.DataFrame(np.full(shape=n_walking_rows, fill_value=2))
df_train_start_y = pd.DataFrame(np.full(shape=n_start_rows, fill_value=3))

In [None]:
# Turn rows followed by no-event rows, with their label frames stacked in the same order.
df_train_turn_none_undersampling_X = pd.concat(
    [df_train_turn, df_train_none],
    axis=0,
)
df_train_turn_none_undersampling_y = pd.concat(
    [df_train_turn_y, df_train_none_y],
    axis=0,
)

In [None]:
# # split data into features and target.
# y_turn = df_train['Turn']                       # target
# X_turn = df_train.drop(['StartHesitation', 'Turn', 'Walking', 'Time'], axis=1)  # feature

# y_walking = df_train['Walking']                       # target
# X_walking = df_train.drop(['StartHesitation', 'Turn', 'Walking', 'Time'], axis=1)  # feature

# y_start = df_train['StartHesitation']                       # target
# X_start = df_train.drop(['StartHesitation', 'Turn', 'Walking', 'Time'], axis=1)  # feature

In [None]:
# df_train['label'] = np.where(df_train['Turn'] == 1, 1,
#                     np.where(df_train['Walking'] == 1, 2,
#                     np.where(df_train['StartHesitation'] == 1, 3, 0)))

# df_train_sampling
# print(np.where(df_train['Turn'] + df_train['Walking'] + df_train['StartHesitation'] >1))

In [None]:
# X_data = df_train.drop(['StartHesitation', 'Turn', 'Walking', 'Time', 'label'], axis=1)  # feature
# y_data = df_train['label']

# Undersampling majority class

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the Turn / no-event pair with a fixed seed.
# NOTE(review): 'not minority' is expected to shrink every class except the
# smallest one down to the smallest class's size - confirm against imblearn docs.
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=0)
X_data_undersampled, y_data_undersampled = rus.fit_resample(
    df_train_turn_none_undersampling_X,
    df_train_turn_none_undersampling_y,
)

# Row counts of the resampled features and labels, one per line.
print(len(X_data_undersampled), len(y_data_undersampled), sep='\n')

In [None]:
# Debug check: show which container type fit_resample handed back for the features.
print(type(X_data_undersampled))

# SMOTE

The undersampled set combines all 4 classes, where the "none" class has been undersampled from 8M to 2M rows (the size of the "turn" class). As a result, SMOTE brings every class up to 2M rows.

In [None]:
# Append the Walking and StartHesitation rows (and their label frames, in the
# same order) to the undersampled Turn / no-event set.
feature_parts = [X_data_undersampled, df_train_walking, df_train_start]
label_parts = [y_data_undersampled, df_train_walk_y, df_train_start_y]
X_undersampled = pd.concat(feature_parts)
y_undersampled = pd.concat(label_parts)

In [None]:
# Oversample with SMOTE (fixed seed, 100 nearest neighbours) over the 4-class set.
smote = SMOTE(k_neighbors=100, random_state=4)
X_syn, y_syn = smote.fit_resample(
    X_undersampled,
    y_undersampled,
)

In [None]:
# Rows per class code after SMOTE, printed in the order 0, 1, 2, 3.
for class_code in (0, 1, 2, 3):
    print(len(np.where(y_syn == class_code)[0]))

In [None]:
# Release the large intermediates and the count scalars that are no longer needed.
# After this cell, the cells above that use these names cannot be re-run on their own.
del (
    df_train,
    y_undersampled,
    X_undersampled,
    X_data_undersampled,
    y_data_undersampled,
    df_train_walk_y,
    df_train_start_y,
    df_train_turn,
    df_train_walking,
    df_train_start,
    df_train_none,
    df_train_turn_y,
    df_train_none_y,
    df_train_turn_none_undersampling_X,
    df_train_turn_none_undersampling_y,
    SH,
    W,
    T,
    N,
)

In [None]:
# Preview the resampled features; the three event-label columns are still present at this point.
X_syn.head()

In [None]:
# Remove the three event-label columns so only feature columns remain.
label_columns = ["StartHesitation", "Turn", "Walking"]
X_syn_fixed = X_syn.drop(columns=label_columns)

In [None]:
# Preview the feature frame after the event-label columns were dropped.
X_syn_fixed.head()

In [None]:
# Turn the single integer-coded label vector back into three 0/1 target columns.
y_syn = np.array(y_syn)               # rebinds y_syn to a NumPy array
labels = y_syn.flatten().astype(int)  # 1-D integer class codes

# Class code carried by each event column; code 1 (no event) maps to all zeros.
event_codes = {
    "StartHesitation": 3,
    "Turn": 0,
    "Walking": 2,
}

# One indicator column per event: 1 where the row's code matches, otherwise 0.
y_syn_fixed = pd.DataFrame(
    {event: (labels == code).astype(int) for event, code in event_codes.items()}
)

y_syn_fixed.head()

# Hyperparameter search (Grid Search CV)

In [None]:
# Candidate hyperparameter values for the random-forest search.
# (numpy is already imported in the import cell at the top, so it is not re-imported here.)
# NOTE(review): every np.linspace call below uses num=1, which yields only the
# `start` value, so each list holds a single candidate and `stop` is never
# reached. Increase `num` to actually sweep the range - confirm whether the
# single-value setting is intentional.

# Number of trees in random forest
n_estimators = np.linspace(start=1, stop=40, num=1).astype(int).tolist()

# Maximum number of levels in tree
max_depth = np.linspace(start=3, stop=5, num=1).astype(int).tolist()

# Minimum number of samples required to split a node
min_samples_split = np.linspace(start=2, stop=500, num=1).astype(int).tolist()

# Minimum number of samples required at each leaf node
min_samples_leaf = np.linspace(start=1, stop=500, num=1).astype(int).tolist()

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Not searched yet (notes kept from the original cell):
#   max_features = ['auto', 'sqrt']   # number of features to consider at every split
#   oob_score = [True, False]         # out-of-bag scoring; only available if bootstrap=True
#   class_weight = ...                # TODO: build a dictionary with the weight distributions

In [None]:
# Assemble the search grid from the candidate lists defined above.
# max_features and class_weight are intentionally left out for now.
param_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(param_grid)

# Train RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# One binary random forest per event type, each wrapped in a 10-fold grid search
# over param_grid. The grid's n_estimators entry takes precedence over the
# n_estimators=10 given here while the search is running.
# NOTE(review): the class_weight values look like raw row counts, which would
# weight class 0 far more heavily than class 1, and they are applied to data that
# has already been resampled - confirm this is the intended weighting.
forest_options = dict(random_state=0, n_estimators=10, n_jobs=-1)
rfSH = RandomForestClassifier(class_weight={0: 4648007, 1: 5281}, **forest_options)
rfW = RandomForestClassifier(class_weight={0: 4553008, 1: 100280}, **forest_options)
rfT = RandomForestClassifier(class_weight={0: 4010847, 1: 642441}, **forest_options)

search_options = dict(param_grid=param_grid, cv=10, verbose=2, n_jobs=-1, return_train_score=True)
rfSH_Grid = GridSearchCV(estimator=rfSH, **search_options)
rfW_Grid = GridSearchCV(estimator=rfW, **search_options)
rfT_Grid = GridSearchCV(estimator=rfT, **search_options)

# Fit in the order StartHesitation, Walking, Turn, reporting progress after each.
fit_plan = (
    (rfSH_Grid, 'StartHesitation', "SH model fitted"),
    (rfW_Grid, 'Walking', "Walking model fitted"),
    (rfT_Grid, 'Turn', "Turn model fitted"),
)
for grid_search, target_column, done_message in fit_plan:
    grid_search.fit(X_syn_fixed, y_syn_fixed[target_column])
    print(done_message)

In [None]:
# List the result fields recorded by each fitted grid search (SH, Walking, Turn).
print("Grid search results:")
for fitted_search in (rfSH_Grid, rfW_Grid, rfT_Grid):
    print(fitted_search.cv_results_.keys())

# Prepare test data

## Process the test data in the same way as the training data

In [None]:
# Paths of all tdcsfog test CSV files.
tdcs_test_pattern = os.path.join(pdir, 'test', 'tdcsfog', '*.csv')
tdcs_test_file_path = glob.glob(tdcs_test_pattern, recursive=True)
print(f'the number of files to be read: {len(tdcs_test_file_path)}')

In [None]:
# Load every tdcsfog test recording, attach the same metadata columns as for the
# training data (in the same order), and add the per-row submission Id.
# Frames are collected in a list and concatenated once; concatenating inside the
# loop re-copies all previously loaded rows on every iteration.
tdcs_test_frames = []

for fp in tqdm.tqdm(tdcs_test_file_path):

    # load one recording.
    tmp = pd.read_csv(fp)

    # get file Id from csv file name.
    file_id = os.path.basename(fp).replace(".csv", "")

    # row selector for this recording in the metadata table (computed once).
    meta_mask = df_tdcs_meta['Id'] == file_id

    # get subject Id.
    subject = df_tdcs_meta.loc[meta_mask, 'Subject'].iloc[0]

    # add metadata; the first matching row is used when several rows match.
    tmp['Medication'] = df_tdcs_meta.loc[meta_mask, 'Medication'].iloc[0]
    subject_mask = df_subjects['Subject'] == subject
    for col in ('Age', 'Sex', 'YearsSinceDx', 'NFOGQ'):
        tmp[col] = df_subjects.loc[subject_mask, col].iloc[0]

    # add Id data to submit: "<file id>_<Time>".
    tmp['Id'] = file_id + '_' + tmp['Time'].astype(str)

    tdcs_test_frames.append(tmp)

# A single concat with a fresh 0..n-1 index; fall back to an empty frame when
# no files were found (pd.concat raises on an empty list).
df_tdcs_test = pd.concat(tdcs_test_frames, ignore_index=True) if tdcs_test_frames else pd.DataFrame()
print(df_tdcs_test[0:4])

In [None]:
# Preview the first rows of the combined tdcsfog test frame (metadata and submission Id attached).
df_tdcs_test.head()

In [None]:
# Paths of all defog test CSV files.
defog_test_pattern = os.path.join(pdir, 'test', 'defog', '*.csv')
defog_test_file_path = glob.glob(defog_test_pattern, recursive=True)
print(f'the number of files to be read: {len(defog_test_file_path)}')

In [None]:
# Load every defog test recording, attach the same metadata columns as for the
# training data (in the same order), and add the per-row submission Id.
# Frames are collected in a list and concatenated once; concatenating inside the
# loop re-copies all previously loaded rows on every iteration.
defog_test_frames = []

for fp in tqdm.tqdm(defog_test_file_path):
    # load one recording.
    tmp = pd.read_csv(fp)

    # get file Id from csv file name.
    file_id = os.path.basename(fp).replace(".csv", "")

    # row selector for this recording in the metadata table (computed once).
    meta_mask = df_defog_meta['Id'] == file_id

    # get subject Id.
    subject = df_defog_meta.loc[meta_mask, 'Subject'].iloc[0]

    # add metadata; the first matching row is used when several rows match.
    tmp['Medication'] = df_defog_meta.loc[meta_mask, 'Medication'].iloc[0]
    subject_mask = df_subjects['Subject'] == subject
    for col in ('Age', 'Sex', 'YearsSinceDx', 'NFOGQ'):
        tmp[col] = df_subjects.loc[subject_mask, col].iloc[0]

    # add Id data to submit: "<file id>_<Time>".
    tmp['Id'] = file_id + '_' + tmp['Time'].astype(str)

    defog_test_frames.append(tmp)

# A single concat with a fresh 0..n-1 index; fall back to an empty frame when
# no files were found (pd.concat raises on an empty list).
df_defog_test = pd.concat(defog_test_frames, ignore_index=True) if defog_test_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined defog test frame (metadata and submission Id attached).
df_defog_test.head()

In [None]:
# Stack the tdcsfog and defog test frames into one frame with a fresh 0..n-1 index.
df_test = pd.concat([df_tdcs_test, df_defog_test], ignore_index=True)

# Same 0/1 encoding as the training data: Medication 'on' -> 1, Sex 'M' -> 1, anything else -> 0.
df_test['Medication'] = (df_test['Medication'] == 'on').astype(int)
df_test['Sex'] = (df_test['Sex'] == 'M').astype(int)
display(df_test)

In [None]:
# Separate the submission Id from the model inputs.
Id = df_test['Id']  # Id for submission data

# Only 'Id' is removed; 'Time' stays in as a feature, as it does in the training
# features. (An earlier variant dropped ['Time', 'Id'] instead.)
X_test = df_test.drop(columns=['Id'])
X_test.head()

# Predict and submit

In [None]:
# Class-probability predictions from each fitted grid search, evaluated in the
# order StartHesitation, Walking, Turn.
predictionSH, predictionW, predictionT = (
    fitted_search.predict_proba(X_test)
    for fitted_search in (rfSH_Grid, rfW_Grid, rfT_Grid)
)

In [None]:
# Alias the prediction arrays under the names the submission cell uses, then print each one.
SHprediction, Wprediction, Tprediction = predictionSH, predictionW, predictionT

for heading, probabilities in (
    ("Start hesitation predictions:", SHprediction),
    ("Walking predictions:", Wprediction),
    ("Turn predictions:", Tprediction),
):
    print(heading)
    print(probabilities)

In [None]:
# Prepare submit data: one row per test sample, one probability column per event.
# predict_proba returns an array of shape (n_samples, n_classes) with one column
# per class. Each model was fitted on a 0/1 target, so column 1 holds the
# probability of the event for every sample. Indexing [0, :] would instead take
# the two class probabilities of the first sample only, which does not match the
# number of rows in the submission.
submit = pd.DataFrame({'Id': Id})
submit['StartHesitation'] = SHprediction[:, 1]
submit['Turn'] = Tprediction[:, 1]
submit['Walking'] = Wprediction[:, 1]

In [None]:
# Render the assembled submission frame for a visual check before saving.
display(submit)

In [None]:
# Write the submission to 'submission.csv' in the working directory, without the index column.
submission_filename = 'submission.csv'
submit.to_csv(submission_filename, index=False)

In [None]:
# Final look at the submission frame that was just written to disk.
submit

<p style="font-size: 24px; font-weight: bold;">Congratulations!</p>

<p style="font-size: 16px;">You're now ready to submit your work on Kaggle!</p>

<p style="font-size: 16px;">Enjoy your experience on Kaggle!</p>