In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns   
from scipy.signal import butter, filtfilt
# import seglearn as sglearn        # For windowing and sequence modeling
import tsfresh     
import os
from sklearn.preprocessing import StandardScaler

 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import polars as pl
import dask.dataframe as dd
from pathlib import Path

# Data Exploration

In [None]:
# File paths for the three training datasets.
# The shared root lives in one constant so the notebook can be pointed at a
# different copy of the data by editing a single line.
TRAIN_DIR = Path('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train')

defog = TRAIN_DIR / 'defog'
notype = TRAIN_DIR / 'notype'
tdcsfog = TRAIN_DIR / 'tdcsfog'

In [None]:
# Collect the CSV files in sorted order: directory globbing yields entries in
# an arbitrary, filesystem-dependent order, which would make the row order of
# the combined frame differ between runs.
defog_paths = sorted(defog.glob("*.csv"))
defog_files = [path.name for path in defog_paths]
if not defog_paths:
    # Fail here with a clear message instead of inside pl.concat([]) below.
    raise FileNotFoundError(f"No CSV files found in {defog}")

# List to store individual DataFrames (one per CSV file)
defog_list = []

for path in defog_paths:
    # File name without ".csv"; stored on every row so rows can be traced back
    # to their source file.
    # NOTE(review): this is a per-file id, which may be a recording id rather
    # than a subject id - confirm against the dataset description.
    patient_id = path.stem

    df = pl.read_csv(path)
    df = df.with_columns([
        pl.lit(patient_id).alias("patient_id")
    ])

    defog_list.append(df)

# Concatenate into one large DataFrame
defog_df = pl.concat(defog_list)

In [None]:
# Preview the first rows of the combined defog frame
defog_df.head()

In [None]:
# Collect the CSV files in sorted order: directory globbing yields entries in
# an arbitrary, filesystem-dependent order, which would make the row order of
# the combined frame differ between runs.
tdcsfog_paths = sorted(tdcsfog.glob("*.csv"))
tdcsfog_files = [path.name for path in tdcsfog_paths]
if not tdcsfog_paths:
    # Fail here with a clear message instead of inside pl.concat([]) below.
    raise FileNotFoundError(f"No CSV files found in {tdcsfog}")

# List to store individual DataFrames (one per CSV file)
tdcsfog_list = []

for path in tdcsfog_paths:
    # File name without ".csv"; stored on every row so rows can be traced back
    # to their source file.
    # NOTE(review): this is a per-file id, which may be a recording id rather
    # than a subject id - confirm against the dataset description.
    patient_id = path.stem

    df = pl.read_csv(path)
    df = df.with_columns([
        pl.lit(patient_id).alias("patient_id")
    ])

    tdcsfog_list.append(df)

# Concatenate into one large DataFrame
tdcsfog_df = pl.concat(tdcsfog_list)

In [None]:
# Preview the first rows of the combined tdcsfog frame
tdcsfog_df.head()

In [None]:
# Overview of the combined defog frame, printed in this order:
# preview, summary statistics, shape as (rows, columns), column names, dtypes.
for summary in (
    defog_df.head(),
    defog_df.describe(),
    defog_df.shape,
    defog_df.columns,
    defog_df.dtypes,
):
    print(summary)

In [None]:
# Overview of the combined tdcsfog frame, printed in this order:
# preview, shape as (rows, columns), column names, dtypes, summary statistics.
for summary in (
    tdcsfog_df.head(),
    tdcsfog_df.shape,
    tdcsfog_df.columns,
    tdcsfog_df.dtypes,
    tdcsfog_df.describe(),
):
    print(summary)

In [None]:
# Load the events table with pandas and print an overview in this order:
# preview, shape, column names, dtypes, summary statistics.
events_df = pd.read_csv('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/events.csv')
for summary in (
    events_df.head(),
    events_df.shape,
    events_df.columns,
    events_df.dtypes,
    events_df.describe(),
):
    print(summary)

In [None]:
# Distinct values of the "patient_id" column (the source-file stem attached
# to each row when the defog CSVs were loaded).
unique_defog_patients = defog_df.get_column("patient_id").unique()
print(unique_defog_patients)

In [None]:
# 1. Pick one recording, filter the Polars DF down to it and convert to pandas.
#    The id is bound to `patient_id` so the title below names the recording
#    that is actually plotted, not whatever value the loading loops left in
#    that variable.
patient_id = 'be9d33541d'
df = defog_df.filter(pl.col("patient_id") == patient_id).to_pandas()

# 2. Plot
plt.figure(figsize=(15, 6))

# Plot the three acceleration axes
for axis in ('AccV', 'AccML', 'AccAP'):
    plt.plot(df['Time'], df[axis], label=axis, alpha=0.7)

# 3. Plot the event label columns on the same axes
for event in ('StartHesitation', 'Turn', 'Walking'):
    plt.plot(df['Time'], df[event], label=event, alpha=0.7)

# 4. Final touches
plt.xlabel("Time")
plt.ylabel("Acceleration (g)")
plt.title(f"Patient: {patient_id} - Acceleration + FOG Events")
plt.legend(loc="upper right")
plt.grid(True)
plt.tight_layout()
plt.show()

# Data Cleaning

In [None]:
# Data types of features: print the column dtypes of both training frames so
# they can be compared before cleaning.
print(f'DEFOG DATA TYPES:\n{defog_df.dtypes}\n')
print(f'TDCSFOG DATA TYPES:\n{tdcsfog_df.dtypes}\n')

In [None]:
# Number of null values per column in the tdcsfog frame.
# NOTE(review): defog_df is not checked for nulls in this cell.
print(tdcsfog_df.null_count())

In [None]:
# Convert accelerations in defog from g to m/s^2.
# assumes the defog accelerations are recorded in g - TODO confirm against the
# dataset description.
G_CONVERSION = 9.80665  # m/s^2 per g (standard gravity)

# Use the expression API rather than `defog_df[[...]] *= k`: with_columns
# returns a new frame and leaves every other column untouched.
# NOTE: re-running this cell scales the columns again; run it once per load.
defog_df = defog_df.with_columns(
    pl.col("AccV", "AccML", "AccAP") * G_CONVERSION
)
print(defog_df)

In [None]:
# Convert the Valid and Task Columns to Integer Columns
def convert_valid_and_t(df):
    """Return `df` with its "Valid" and "Task" columns cast to Int8.

    The columns keep their names; all other columns are left unchanged.
    """
    as_int8 = [pl.col(name).cast(pl.Int8) for name in ("Valid", "Task")]
    return df.with_columns(as_int8)
# Cast Valid/Task to integers in the defog frame.
defog_df = convert_valid_and_t(defog_df)
# NOTE(review): the tdcsfog call is left disabled - presumably because that
# frame has no Valid/Task columns; confirm before enabling.
# tdcsfog_df = convert_valid_and_t(tdcsfog_df)


print(defog_df)

In [None]:
# Create a new column that contains the acceleration magnitude
def acc_magnitude(df):
    """Return `df` with an added "Acc_MAGNITUDE" column.

    The new column is the Euclidean norm of the three acceleration axes:
    sqrt(AccV^2 + AccML^2 + AccAP^2), computed row by row.
    """
    squared = [pl.col(axis) ** 2 for axis in ("AccV", "AccML", "AccAP")]
    sum_of_squares = squared[0] + squared[1] + squared[2]
    return df.with_columns(sum_of_squares.sqrt().alias("Acc_MAGNITUDE"))

# Add the magnitude column to both training frames, then display defog.
tdcsfog_df = acc_magnitude(tdcsfog_df)
defog_df = acc_magnitude(defog_df)
defog_df

In [None]:
# Standardize acceleration per patient for each training dataframe
def standardize_acc_by_patient(df: pl.DataFrame):
    """Z-score AccV, AccML and AccAP within each "patient_id" group.

    Each value becomes (value - group mean) / group standard deviation, and
    the result overwrites the original column. There is no guard for groups
    whose standard deviation is zero.
    """
    def _zscore(name):
        signal = pl.col(name)
        centred = signal - signal.mean().over("patient_id")
        return (centred / signal.std().over("patient_id")).alias(name)

    return df.with_columns([_zscore(name) for name in ('AccV', 'AccML', 'AccAP')])

# Standardize both training frames in place of the originals, then display
# defog. After this cell AccV/AccML/AccAP hold z-scores, not physical units.
tdcsfog_df = standardize_acc_by_patient(tdcsfog_df)
defog_df = standardize_acc_by_patient(defog_df)
defog_df

In [None]:
# Band-pass Filter 
def infer_fs(time_seconds: np.ndarray) -> float:
    """Estimate the sampling frequency as 1 / median(positive time step).

    Non-finite and non-positive steps are ignored. The result is in samples
    per unit of the input, so it is only in Hz when `time_seconds` really is
    in seconds; a plain sample index (0, 1, 2, ...) yields 1.0.

    Raises:
        ValueError: if no usable time step remains.
    """
    steps = np.diff(np.asarray(time_seconds, dtype=float))
    usable = steps[np.isfinite(steps) & (steps > 0)]
    if usable.size > 0:
        return 1.0 / np.median(usable)
    raise ValueError("Cannot infer sampling frequency from Time column.")

def design_bandpass(low_hz: float, high_hz: float, fs: float, order: int = 4):
    """Design a Butterworth band-pass filter and return its (b, a) coefficients.

    The cut-offs are normalised by the Nyquist frequency (fs / 2) and clamped
    to [1e-6, 0.999999]; note that a `high_hz` at or above Nyquist is
    therefore silently lowered rather than rejected.

    Raises:
        ValueError: if the clamped band does not satisfy 0 < low < high < 1.
    """
    nyquist = fs / 2.0
    low = max(1e-6, low_hz / nyquist)
    high = min(0.999999, high_hz / nyquist)
    band_is_valid = 0 < low < high < 1
    if not band_is_valid:
        raise ValueError(f"Invalid band for fs={fs:.3f}Hz: low={low_hz}Hz, high={high_hz}Hz")
    return butter(order, [low, high], btype="band")

def bandpass_series(y: pd.Series, b, a) -> np.ndarray:
    """Apply the filter (b, a) forwards and backwards to one signal.

    Values are coerced to numeric (unparseable entries become NaN) and gaps
    are filled by interpolation in both directions before filtering, because
    the filter cannot handle missing samples.
    """
    numeric = pd.to_numeric(y, errors="coerce")
    gap_free = numeric.interpolate(limit_direction="both")
    samples = gap_free.to_numpy(float)
    return filtfilt(b, a, samples, method="pad")

def bandpass_dataframe(df: pd.DataFrame, cols=('AccV','AccML','AccAP'),
                       low_hz=0.1, high_hz=30.0, order=4, fs=None) -> pd.DataFrame:
    """Return a copy of `df` with a band-passed `<col>_bp` column per signal.

    Args:
        df: frame holding the signal columns (and a 'Time' column when `fs`
            is not given).
        cols: signal columns to filter; names missing from `df` are skipped.
        low_hz, high_hz: pass-band edges, in the same unit as `fs`.
        order: Butterworth filter order.
        fs: sampling frequency. When None (the default, original behaviour)
            it is inferred from the spacing of 'Time'. That inference is only
            correct when 'Time' is in seconds: for an integer sample index it
            yields fs = 1, and the requested band is then clamped to the
            Nyquist limit. Pass the true rate explicitly in that case.

    Returns:
        A new DataFrame; `df` itself is not modified. If none of `cols` is
        present the copy is returned unchanged.
    """
    out = df.copy()
    # Only keep columns that exist
    cols = tuple(c for c in cols if c in out.columns)
    if len(cols) == 0:
        return out

    if fs is None:
        fs = infer_fs(out['Time'].to_numpy())
    b, a = design_bandpass(low_hz, high_hz, fs, order)
    for col in cols:
        out[f"{col}_bp"] = bandpass_series(out[col], b, a)
    return out



In [None]:
# Apply Band-pass to all patients
def _bandpass_group(g, cols, low_hz, high_hz, order, fs):
    """Return a copy of one group's pandas frame with `<col>_bp` columns added."""
    if fs is None:
        # No rate supplied: fall back to inferring it from the Time column.
        return bandpass_dataframe(g, cols=cols, low_hz=low_hz, high_hz=high_hz, order=order)
    out = g.copy()
    b, a = design_bandpass(low_hz, high_hz, fs, order)
    for col in cols:
        if col in out.columns:
            out[f"{col}_bp"] = bandpass_series(out[col], b, a)
    return out


def _unfiltered_group(g, cols):
    """Return a copy of `g` whose `<col>_bp` columns are all-NaN placeholders."""
    out = g.copy()
    for col in cols:
        if col in out.columns:
            out[f"{col}_bp"] = np.nan
    return out


def add_bandpass_to_all_patients(pl_df: pl.DataFrame,
                                 cols=('AccV','AccML','AccAP'),
                                 low_hz=0.1, high_hz=30.0, order=4,
                                 fs=None) -> pl.DataFrame:
    """Band-pass filter the signal columns separately for every "patient_id".

    Args:
        pl_df: Polars frame with a "patient_id" column.
        cols: signal columns to filter; a `<col>_bp` column is added for each.
        low_hz, high_hz, order: band-pass design parameters.
        fs: sampling frequency in Hz. When None the rate is inferred from the
            'Time' column, which is only valid if 'Time' is in seconds.

    Returns:
        A new Polars frame with the groups stacked in order of first
        appearance. Groups that are too short, lack 'Time', or fail to filter
        keep their rows and get empty `<col>_bp` values, so that every chunk
        has the same columns and the final concat cannot fail on a schema
        mismatch.

    Raises:
        ValueError: if `pl_df` has no "patient_id" column.
    """
    if "patient_id" not in pl_df.columns:
        raise ValueError("Expected a 'patient_id' column.")

    out_chunks = []
    # One pass over the frame instead of one full filter per id.
    for group in pl_df.partition_by("patient_id", maintain_order=True):
        pid = group["patient_id"][0]
        g = group.to_pandas()

        # Skip tiny or malformed groups
        if "Time" not in g.columns or len(g) < 5:
            out_chunks.append(pl.from_pandas(_unfiltered_group(g, cols)))
            continue

        try:
            g_bp = _bandpass_group(g, cols, low_hz, high_hz, order, fs)
        except Exception as e:
            # Best effort: keep the rows, report the reason, leave *_bp empty.
            print(f"[WARN] Skipping bandpass for patient {pid}: {e}")
            g_bp = _unfiltered_group(g, cols)

        out_chunks.append(pl.from_pandas(g_bp))

    if not out_chunks:
        # Empty input: nothing to filter.
        return pl_df
    return pl.concat(out_chunks, how="vertical_relaxed")

# The Time column is an integer sample index (it is divided by 100 / 128 further
# down to obtain seconds), so the sampling rate must be given explicitly here:
# inferring it from Time would yield 1 "Hz" and a wrong pass-band.
defog_df_bp   = add_bandpass_to_all_patients(defog_df,   cols=('AccV','AccML','AccAP'),
                                             low_hz=0.1, high_hz=30.0, order=4, fs=100.0)
tdcsfog_df_bp = add_bandpass_to_all_patients(tdcsfog_df, cols=('AccV','AccML','AccAP'),
                                             low_hz=0.1, high_hz=30.0, order=4, fs=128.0)

print("DEFOG with band-pass columns:", [c for c in defog_df_bp.columns if c.endswith("_bp")][:6], "...")
print("TDCSFOG with band-pass columns:", [c for c in tdcsfog_df_bp.columns if c.endswith("_bp")][:6], "...")

In [None]:
# Create a new column that contains Time as seconds
def time_to_seconds(df, hertz):
    """Return `df` with an added "Time (seconds)" column equal to Time / hertz.

    `hertz` is the sampling rate used as the divisor; the original "Time"
    column is kept as is.
    """
    seconds = (pl.col("Time") / hertz).alias("Time (seconds)")
    return df.with_columns(seconds)

# Add the seconds column to each frame using its own sampling rate:
# 128 for tdcsfog, 100 for defog.
tdcsfog_df = time_to_seconds(tdcsfog_df, 128)
# The defog conversion must start from defog_df; starting from tdcsfog_df
# would replace the defog data with a copy of the tdcsfog data.
defog_df = time_to_seconds(defog_df, 100)
defog_df

In [None]:
# Check for outliers from acceleration
def detect_outliers(df: pl.DataFrame, threshold: float = 3.0):
    """Return the rows where any acceleration axis exceeds `threshold` in magnitude.

    A row is kept when |AccV|, |AccML| or |AccAP| is greater than `threshold`.
    The default of 3.0 is meant for columns that already hold z-scores.

    Args:
        df: frame with AccV, AccML and AccAP columns.
        threshold: absolute value above which a sample counts as an outlier.

    Returns:
        A DataFrame with the same columns as `df`, containing only the
        outlier rows.
    """
    acc_cols = ['AccV', 'AccML', 'AccAP']
    # OR the per-axis conditions together so that every axis contributes;
    # filtering inside the loop would keep only the last axis's result.
    is_outlier = None
    for col in acc_cols:
        exceeds = pl.col(col).abs() > threshold
        is_outlier = exceeds if is_outlier is None else (is_outlier | exceeds)
    return df.filter(is_outlier)
# Show the rows flagged as outliers in each training frame
print(detect_outliers(defog_df))
print(detect_outliers(tdcsfog_df))

## Visualize Acceleration Signals During FoG Events

In [None]:
# Get unique patient IDs with a StartHesitation, Turn, and Walking event
# Take a subset of 3 patients for each event
def first_patients_with_event(df, event, limit=3):
    """Return the first `limit` distinct patient_ids that have `event` == 1.

    Ids are taken in order of first appearance in `df`
    (`maintain_order=True`), so the selection is the same on every run;
    a plain `unique()` gives no ordering guarantee.
    """
    return (
        df.filter(pl.col(event) == 1)
          .select("patient_id")
          .unique(maintain_order=True)
          .to_series()[:limit]
    )

StartHesPatients = first_patients_with_event(defog_df, "StartHesitation")
print(f"Patients with Start Hesitation: {StartHesPatients.to_list()}")

TurnPatients = first_patients_with_event(defog_df, "Turn")
print(f"Patients with Turn: {TurnPatients.to_list()}")

WalkingPatients = first_patients_with_event(defog_df, "Walking")
print(f"Patients with Walking: {WalkingPatients.to_list()}")

In [None]:
# Get unique patient IDs with a StartHesitation, Turn, and Walking event
# (including band-pass)
# Only runs when all three event columns exist in the band-passed frame;
# otherwise the three variables keep whatever value they had before.
# maintain_order=True keeps ids in order of first appearance, so "first 3"
# selects the same patients on every run.
if {"StartHesitation","Turn","Walking"}.issubset(set(defog_df_bp.columns)):
    StartHesPatients = (
        defog_df_bp.filter(pl.col("StartHesitation") == 1)
                   .select("patient_id").unique(maintain_order=True).to_series()[:3]
    )
    TurnPatients = (
        defog_df_bp.filter(pl.col("Turn") == 1)
                   .select("patient_id").unique(maintain_order=True).to_series()[:3]
    )
    WalkingPatients = (
        defog_df_bp.filter(pl.col("Walking") == 1)
                   .select("patient_id").unique(maintain_order=True).to_series()[:3]
    )
    print(f"Patients with Start Hesitation: {StartHesPatients.to_list()}")
    print(f"Patients with Turn: {TurnPatients.to_list()}")
    print(f"Patients with Walking: {WalkingPatients.to_list()}")

In [None]:
# Start Hesitation
# Draw one figure per recording: the three acceleration axes plus the
# StartHesitation label. Looping binds `patient_id` to the recording being
# plotted, so each title names the data it shows.
for patient_id in ('81262644e7', '3ba3590a08', 'd98358a75f'):
    # 1. Filter the Polars DF for a single patient and convert to pandas
    df = defog_df.filter(pl.col("patient_id") == patient_id).to_pandas()

    # 2. Plot
    plt.figure(figsize=(15, 6))

    # Plot acceleration
    for axis in ('AccV', 'AccML', 'AccAP'):
        plt.plot(df['Time'], df[axis], label=axis, alpha=0.7)

    # 3. Plot events
    plt.plot(df['Time'], df['StartHesitation'], label='StartHesitation', alpha=0.7)

    # 4. Final touches
    plt.xlabel("Time")
    # The acceleration columns were standardized per patient_id in the
    # cleaning section, so they are z-scores here rather than values in g.
    plt.ylabel("Acceleration (z-score)")
    plt.title(f"Patient: {patient_id} - Acceleration + FOG Events")
    plt.legend(loc="upper right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Raw vs band-passed signals for one defog recording per event type
# (Start Hesitation, Turn, Walking). One figure is drawn per (id, event) pair.
for patient_id, event_name in (
    ("e069a57511", "Start Hesitation"),
    ("771d69d829", "Turn"),
    ("4c3aa8ea6e", "Walking"),
):
    dfp = defog_df_bp.filter(pl.col("patient_id") == patient_id).to_pandas()

    plt.figure(figsize=(16,7))

    # Raw (only the columns that exist: plotting an empty placeholder against
    # the full Time column would fail on the length mismatch)
    for axis in ("AccV", "AccML", "AccAP"):
        if axis in dfp.columns:
            plt.plot(dfp["Time"], dfp[axis], label=f"{axis} (raw)", alpha=0.35)

    # Filtered (if present)
    for axis in ("AccV", "AccML", "AccAP"):
        bp_col = f"{axis}_bp"
        if bp_col in dfp.columns:
            plt.plot(dfp["Time"], dfp[bp_col], label=f"{axis} (0.1–30 Hz)", linewidth=1.5)

    # Event overlays
    for ev in ["StartHesitation", "Turn", "Walking"]:
        if ev in dfp.columns:
            plt.plot(dfp["Time"], dfp[ev], label=ev, alpha=0.6)

    # "Time" is the raw sample index here; the seconds column is only added
    # to defog_df / tdcsfog_df, not to the band-passed frames.
    plt.xlabel("Time (samples)")
    plt.ylabel("Acceleration (m/s²)  (or z-score if standardized)")
    plt.title(f"Patient {patient_id} – Raw vs Band-pass ({event_name})")
    plt.legend(ncol=3)
    plt.grid(True)
    plt.tight_layout()
    plt.show()