In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns   
from scipy.signal import butter, filtfilt
# import seglearn as sglearn        # For windowing and sequence modeling
import tsfresh     
import os
 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import polars as pl
import dask.dataframe as dd
from pathlib import Path

# Data Exploration

In [3]:
# File paths for three training datasets
defog = Path('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog')
notype = Path('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/notype')
tdcsfog = Path('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog')

In [4]:
defog_files = [f for f in os.listdir(defog) if f.endswith('.csv')]

# List to store individual DataFrames
defog_list = []

for path in defog.glob("*.csv"):
    patient_id = path.stem  # removes .csv

    df = pl.read_csv(path)
    df = df.with_columns([
        pl.lit(patient_id).alias("patient_id")
    ])
    
    defog_list.append(df)

defog_df = pl.concat(defog_list)
# for f in defog_files:
#     file_path = os.path.join(defog, f)
#     df = pl.read_csv(file_path)
#     df = df.with_columns([
#         pl.lit(f).alias('file')  # Add filename as identifier
#     ])
#     defog_list.append(df)

# # Concatenate into one large DataFrame
# defog_df = pl.concat(defog_list)

In [5]:
defog_df.head()

Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,Valid,Task,patient_id
i64,f64,f64,f64,i64,i64,i64,bool,bool,str
0,-1.002697,0.022371,0.068304,0,0,0,False,False,"""be9d33541d"""
1,-1.002641,0.019173,0.066162,0,0,0,False,False,"""be9d33541d"""
2,-0.99982,0.019142,0.067536,0,0,0,False,False,"""be9d33541d"""
3,-0.998023,0.018378,0.068409,0,0,0,False,False,"""be9d33541d"""
4,-0.998359,0.016726,0.066448,0,0,0,False,False,"""be9d33541d"""


Load dataset

In [6]:
notype_files = [f for f in os.listdir(notype) if f.endswith('.csv')]

# List to store individual DataFrames
notype_list = []

for path in notype.glob("*.csv"):
    patient_id = path.stem  # removes .csv

    df = pl.read_csv(path)
    df = df.with_columns([
        pl.lit(patient_id).alias("patient_id")
    ])
    
    notype_list.append(df)

notype_df = pl.concat(notype_list)

In [7]:
notype_df.head()

Time,AccV,AccML,AccAP,Event,Valid,Task,patient_id
i64,f64,f64,f64,i64,bool,bool,str
0,-0.991926,-0.119916,0.050087,0,False,False,"""1e8d55d48d"""
1,-0.994243,-0.118624,0.049909,0,False,False,"""1e8d55d48d"""
2,-0.99584,-0.118602,0.048774,0,False,False,"""1e8d55d48d"""
3,-0.995865,-0.121627,0.04809,0,False,False,"""1e8d55d48d"""
4,-0.99233,-0.122146,0.048878,0,False,False,"""1e8d55d48d"""


In [8]:
tdcsfog_files = [f for f in os.listdir(tdcsfog) if f.endswith('.csv')]

# List to store individual DataFrames
tdcsfog_list = []

for path in tdcsfog.glob("*.csv"):
    patient_id = path.stem  # removes .csv

    df = pl.read_csv(path)
    df = df.with_columns([
        pl.lit(patient_id).alias("patient_id")
    ])
    
    tdcsfog_list.append(df)

tdcsfog_df = pl.concat(tdcsfog_list)

In [9]:
tdcsfog_df.head()

Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,patient_id
i64,f64,f64,f64,i64,i64,i64,str
0,-9.66589,0.04255,0.184744,0,0,0,"""a171e61840"""
1,-9.672969,0.049217,0.184644,0,0,0,"""a171e61840"""
2,-9.67026,0.03362,0.19379,0,0,0,"""a171e61840"""
3,-9.673356,0.035159,0.184369,0,0,0,"""a171e61840"""
4,-9.671458,0.043913,0.197814,0,0,0,"""a171e61840"""


Inspect Structure
* What are the columns/features?
* What do they represent (accelerometer, gyroscope, frequency, time)?
* Are there multiple subjects? Sessions? Devices?
* What is the sampling frequency?
* How many rows per subject/session?
* Distribution of target labels (e.g., freezing events)

In [10]:
print(defog_df.head())
# print(defog_df.info())
print(defog_df.describe())
print(defog_df.shape)     # (rows, columns)
print(defog_df.columns)   # list of column names
print(defog_df.dtypes)    # list of column types

shape: (5, 10)
┌──────┬───────────┬──────────┬──────────┬───┬─────────┬───────┬───────┬────────────┐
│ Time ┆ AccV      ┆ AccML    ┆ AccAP    ┆ … ┆ Walking ┆ Valid ┆ Task  ┆ patient_id │
│ ---  ┆ ---       ┆ ---      ┆ ---      ┆   ┆ ---     ┆ ---   ┆ ---   ┆ ---        │
│ i64  ┆ f64       ┆ f64      ┆ f64      ┆   ┆ i64     ┆ bool  ┆ bool  ┆ str        │
╞══════╪═══════════╪══════════╪══════════╪═══╪═════════╪═══════╪═══════╪════════════╡
│ 0    ┆ -1.002697 ┆ 0.022371 ┆ 0.068304 ┆ … ┆ 0       ┆ false ┆ false ┆ be9d33541d │
│ 1    ┆ -1.002641 ┆ 0.019173 ┆ 0.066162 ┆ … ┆ 0       ┆ false ┆ false ┆ be9d33541d │
│ 2    ┆ -0.99982  ┆ 0.019142 ┆ 0.067536 ┆ … ┆ 0       ┆ false ┆ false ┆ be9d33541d │
│ 3    ┆ -0.998023 ┆ 0.018378 ┆ 0.068409 ┆ … ┆ 0       ┆ false ┆ false ┆ be9d33541d │
│ 4    ┆ -0.998359 ┆ 0.016726 ┆ 0.066448 ┆ … ┆ 0       ┆ false ┆ false ┆ be9d33541d │
└──────┴───────────┴──────────┴──────────┴───┴─────────┴───────┴───────┴────────────┘
shape: (9, 11)
┌───────────┬───────────

In [11]:
print(notype_df.head())
# print(notype_df.info())
print(notype_df.shape)     # (rows, columns)
print(notype_df.columns)   # list of column names
print(notype_df.dtypes) 
print(notype_df.describe())

shape: (5, 8)
┌──────┬───────────┬───────────┬──────────┬───────┬───────┬───────┬────────────┐
│ Time ┆ AccV      ┆ AccML     ┆ AccAP    ┆ Event ┆ Valid ┆ Task  ┆ patient_id │
│ ---  ┆ ---       ┆ ---       ┆ ---      ┆ ---   ┆ ---   ┆ ---   ┆ ---        │
│ i64  ┆ f64       ┆ f64       ┆ f64      ┆ i64   ┆ bool  ┆ bool  ┆ str        │
╞══════╪═══════════╪═══════════╪══════════╪═══════╪═══════╪═══════╪════════════╡
│ 0    ┆ -0.991926 ┆ -0.119916 ┆ 0.050087 ┆ 0     ┆ false ┆ false ┆ 1e8d55d48d │
│ 1    ┆ -0.994243 ┆ -0.118624 ┆ 0.049909 ┆ 0     ┆ false ┆ false ┆ 1e8d55d48d │
│ 2    ┆ -0.99584  ┆ -0.118602 ┆ 0.048774 ┆ 0     ┆ false ┆ false ┆ 1e8d55d48d │
│ 3    ┆ -0.995865 ┆ -0.121627 ┆ 0.04809  ┆ 0     ┆ false ┆ false ┆ 1e8d55d48d │
│ 4    ┆ -0.99233  ┆ -0.122146 ┆ 0.048878 ┆ 0     ┆ false ┆ false ┆ 1e8d55d48d │
└──────┴───────────┴───────────┴──────────┴───────┴───────┴───────┴────────────┘
(10251114, 8)
['Time', 'AccV', 'AccML', 'AccAP', 'Event', 'Valid', 'Task', 'patient_id']
[Int64

In [12]:
print(tdcsfog_df.head())
# print(tdcsfog_df.info())
print(tdcsfog_df.shape)     # (rows, columns)
print(tdcsfog_df.columns)   # list of column names
print(tdcsfog_df.dtypes) 
print(tdcsfog_df.describe())

shape: (5, 8)
┌──────┬───────────┬──────────┬──────────┬─────────────────┬──────┬─────────┬────────────┐
│ Time ┆ AccV      ┆ AccML    ┆ AccAP    ┆ StartHesitation ┆ Turn ┆ Walking ┆ patient_id │
│ ---  ┆ ---       ┆ ---      ┆ ---      ┆ ---             ┆ ---  ┆ ---     ┆ ---        │
│ i64  ┆ f64       ┆ f64      ┆ f64      ┆ i64             ┆ i64  ┆ i64     ┆ str        │
╞══════╪═══════════╪══════════╪══════════╪═════════════════╪══════╪═════════╪════════════╡
│ 0    ┆ -9.66589  ┆ 0.04255  ┆ 0.184744 ┆ 0               ┆ 0    ┆ 0       ┆ a171e61840 │
│ 1    ┆ -9.672969 ┆ 0.049217 ┆ 0.184644 ┆ 0               ┆ 0    ┆ 0       ┆ a171e61840 │
│ 2    ┆ -9.67026  ┆ 0.03362  ┆ 0.19379  ┆ 0               ┆ 0    ┆ 0       ┆ a171e61840 │
│ 3    ┆ -9.673356 ┆ 0.035159 ┆ 0.184369 ┆ 0               ┆ 0    ┆ 0       ┆ a171e61840 │
│ 4    ┆ -9.671458 ┆ 0.043913 ┆ 0.197814 ┆ 0               ┆ 0    ┆ 0       ┆ a171e61840 │
└──────┴───────────┴──────────┴──────────┴─────────────────┴──────┴─────────

Columns/features: Time, Vertical acceleration, Mediolateral acceleration, Anterposterior accerleration, StartHesitation, Turn, Walking, Valid Task

Visualize sample data

Recommended libraries: 
* matplotlib, seaborn
* plotly
* tsfel or tsfresh (time series visualization)

Useful visualizations:
* Distribution plots (sns.histplot)
* Box plots for outliers
* Heatmaps for correlation
* Missing value heatmaps (sns.heatmap(df.isnull()))

In [13]:
# plt.plot(df_defog['Time'], df_defog['AccV'])
# plt.title("AccV over Time")
# plt.xlabel("Time")
# plt.ylabel("AccV")

In [14]:
# plt.plot(df_notype['Time'], df_notype['AccV'])
# plt.title("AccV over Time")
# plt.xlabel("Time")
# plt.ylabel("AccV")

In [15]:
# plt.plot(df_tdcsfog['Time'], df_tdcsfog['AccV'])
# plt.title("AccV over Time")
# plt.xlabel("Time")
# plt.ylabel("AccV")

In [16]:
# plt.figure(figsize=(10, 6))
# sns.heatmap(df_defog.corr(), annot=True, cmap='coolwarm')
# plt.title('Correlation Between Features and Task Labels')
# plt.show()

# Data Cleaning

Handle Missing Values
* Check missingness
* decide: drop columns or rows of too many missing values or impute (forward fill, backward fill, or interpolate)

Detect and correct erroneous values
* Spikes or dropouts in sensor readings
* Duplicated timestamps
* Values outside physical limits (e.g., g-force beyond ±16g if using IMUs)

Check for:
* Outliers: Z-score, IQR method
* Signal gaps
* Negative time values or duplicates

Normalize or standardize data using StandardScaler or MinMaxScaler

In [17]:
print(defog_df.null_count())
# missing_rows = defog_df[defog_df.isnull().any(axis=1)]
# print(missing_rows)


shape: (1, 10)
┌──────┬──────┬───────┬───────┬───┬─────────┬───────┬──────┬────────────┐
│ Time ┆ AccV ┆ AccML ┆ AccAP ┆ … ┆ Walking ┆ Valid ┆ Task ┆ patient_id │
│ ---  ┆ ---  ┆ ---   ┆ ---   ┆   ┆ ---     ┆ ---   ┆ ---  ┆ ---        │
│ u32  ┆ u32  ┆ u32   ┆ u32   ┆   ┆ u32     ┆ u32   ┆ u32  ┆ u32        │
╞══════╪══════╪═══════╪═══════╪═══╪═════════╪═══════╪══════╪════════════╡
│ 0    ┆ 0    ┆ 0     ┆ 0     ┆ … ┆ 0       ┆ 0     ┆ 0    ┆ 0          │
└──────┴──────┴───────┴───────┴───┴─────────┴───────┴──────┴────────────┘


In [18]:
print(notype_df.null_count())
# print(notype_df.isnull().sum())
# missing_rows = notype_df[notype_df.isnull().any(axis=1)]
# print(missing_rows)

shape: (1, 8)
┌──────┬──────┬───────┬───────┬───────┬───────┬──────┬────────────┐
│ Time ┆ AccV ┆ AccML ┆ AccAP ┆ Event ┆ Valid ┆ Task ┆ patient_id │
│ ---  ┆ ---  ┆ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---  ┆ ---        │
│ u32  ┆ u32  ┆ u32   ┆ u32   ┆ u32   ┆ u32   ┆ u32  ┆ u32        │
╞══════╪══════╪═══════╪═══════╪═══════╪═══════╪══════╪════════════╡
│ 0    ┆ 0    ┆ 0     ┆ 0     ┆ 0     ┆ 0     ┆ 0    ┆ 0          │
└──────┴──────┴───────┴───────┴───────┴───────┴──────┴────────────┘


In [19]:
print(tdcsfog_df.null_count())
# print(tdcsfog_df.isnull().sum())
# missing_rows = tdcsfog_df[tdcsfog_df.isnull().any(axis=1)]
# print(missing_rows)

shape: (1, 8)
┌──────┬──────┬───────┬───────┬─────────────────┬──────┬─────────┬────────────┐
│ Time ┆ AccV ┆ AccML ┆ AccAP ┆ StartHesitation ┆ Turn ┆ Walking ┆ patient_id │
│ ---  ┆ ---  ┆ ---   ┆ ---   ┆ ---             ┆ ---  ┆ ---     ┆ ---        │
│ u32  ┆ u32  ┆ u32   ┆ u32   ┆ u32             ┆ u32  ┆ u32     ┆ u32        │
╞══════╪══════╪═══════╪═══════╪═════════════════╪══════╪═════════╪════════════╡
│ 0    ┆ 0    ┆ 0     ┆ 0     ┆ 0               ┆ 0    ┆ 0       ┆ 0          │
└──────┴──────┴───────┴───────┴─────────────────┴──────┴─────────┴────────────┘


In [20]:
G_CONVERSION = 9.80665

# Convert Acceleration columns to 'g' units
df_converted = tdcsfog_df.with_columns([
    (pl.col("AccV") / G_CONVERSION).alias("AccV_g"),
    (pl.col("AccML") / G_CONVERSION).alias("AccML_g"),
    (pl.col("AccAP") / G_CONVERSION).alias("AccAP_g")
])

print(df_converted)


shape: (7_062_672, 11)
┌──────┬───────────┬───────────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ Time ┆ AccV      ┆ AccML     ┆ AccAP     ┆ … ┆ patient_id ┆ AccV_g    ┆ AccML_g   ┆ AccAP_g   │
│ ---  ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│ i64  ┆ f64       ┆ f64       ┆ f64       ┆   ┆ str        ┆ f64       ┆ f64       ┆ f64       │
╞══════╪═══════════╪═══════════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 0    ┆ -9.66589  ┆ 0.04255   ┆ 0.184744  ┆ … ┆ a171e61840 ┆ -0.985646 ┆ 0.004339  ┆ 0.018839  │
│ 1    ┆ -9.672969 ┆ 0.049217  ┆ 0.184644  ┆ … ┆ a171e61840 ┆ -0.986368 ┆ 0.005019  ┆ 0.018828  │
│ 2    ┆ -9.67026  ┆ 0.03362   ┆ 0.19379   ┆ … ┆ a171e61840 ┆ -0.986092 ┆ 0.003428  ┆ 0.019761  │
│ 3    ┆ -9.673356 ┆ 0.035159  ┆ 0.184369  ┆ … ┆ a171e61840 ┆ -0.986408 ┆ 0.003585  ┆ 0.0188    │
│ 4    ┆ -9.671458 ┆ 0.043913  ┆ 0.197814  ┆ … ┆ a171e61840 ┆ -0.986214 ┆ 0.004478  ┆ 0.020171 