In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os, sys, math, re
from pathlib import Path
from typing import Optional, Tuple, Dict, List

import os
# Echo the full path of every file found anywhere below the Kaggle input folder.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Record the interpreter and pandas versions in the notebook output.
print("Python:", sys.version)
print("Pandas:", pd.__version__)

# Widen the DataFrame rendering so previews are not truncated too early.
for _option, _value in (("display.max_columns", 120), ("display.width", 160)):
    pd.set_option(_option, _value)

# -------------------------------
# Locate the competition directory
# -------------------------------
DEFAULT_SLUG = "tlvmc-parkinsons-freezing-gait-prediction"

def guess_competition_dir(base: Path = Path("/kaggle/input"), slug: str = DEFAULT_SLUG) -> Path:
    """Locate the competition data folder below ``base``.

    The folder ``base / slug`` is returned when it exists. Otherwise the
    sub-directories of ``base`` are scanned for a name containing one of the
    keywords "parkin", "freez", "gait" or "fog" (case-insensitive), and the
    alphabetically first match is returned after printing it.

    Args:
        base: Directory that holds the attached datasets.
        slug: Expected folder name of the competition dataset.

    Returns:
        Path of the competition folder.

    Raises:
        FileNotFoundError: If ``base`` is not a directory, or neither the
            exact slug nor a keyword match is found below it.
    """
    base = Path(base)
    exact = base / slug
    if exact.exists():
        return exact

    # Fallback: keyword guess if the slug differs.
    keywords = ("parkin", "freez", "gait", "fog")
    cands: List[Path] = []
    if base.is_dir():
        # iterdir() yields entries in arbitrary order; sort so the same
        # folder is picked on every run when several names match.
        cands = sorted(
            p for p in base.iterdir()
            if p.is_dir() and any(k in p.name.lower() for k in keywords)
        )
    if cands:
        print("Guessed competition dir:", cands[0])
        return cands[0]
    raise FileNotFoundError(
        f"Cannot find the competition folder under {base}. "
        "Make sure you've added the dataset via 'Add data -> Competitions'."
    )

# Resolve the competition folder once and show its top-level entries.
COMP_DIR = guess_competition_dir()
print("Competition dir:", COMP_DIR)
root_entries = sorted(os.listdir(COMP_DIR))
print("Root files/folders:", root_entries)

# -----------------------------------
# Task 1: Load and inspect root CSVs
# -----------------------------------
# Every CSV sitting directly in the competition folder is read into
# meta_tables (keyed by file stem) and its first rows are displayed.
root_csvs = sorted(COMP_DIR.glob("*.csv"))
root_csv_names = [csv_path.name for csv_path in root_csvs]
print("\nRoot CSV files detected:", root_csv_names)

meta_tables: Dict[str, pd.DataFrame] = {}
for csv_path in root_csvs:
    try:
        table = pd.read_csv(csv_path, low_memory=False)
        meta_tables[csv_path.stem] = table
        print(f"[LOAD] {csv_path.name} -> shape={table.shape}")
        display(table.head(5))
    except Exception as err:
        # Best effort: a file that cannot be loaded or shown is reported
        # and skipped so the remaining files are still processed.
        print(f"[WARN] Failed to read {csv_path.name}: {err}")

# Show up to ten entries of each of the train/test/unlabeled folders that exist.
for sub in ("train", "test", "unlabeled"):
    sub_dir = COMP_DIR / sub
    if not sub_dir.exists():
        continue
    print(f"\n{sub.upper()} folder contents:", sorted(os.listdir(sub_dir))[:10], "...")

# ------------------------------------------------
# Enumerate time-series CSVs for the two train sets
# ------------------------------------------------
# Training-series folders of the "defog" and "tdcsfog" sets below COMP_DIR/train.
train_defog_dir = COMP_DIR.joinpath("train", "defog")
train_tdcs_dir = COMP_DIR.joinpath("train", "tdcsfog")

def list_csvs(d: Path) -> List[Path]:
    """Return the sorted ``*.csv`` paths directly inside ``d``.

    An empty list is returned when ``d`` does not exist.
    """
    if not d.exists():
        return []
    found = list(d.glob("*.csv"))
    found.sort()
    return found

defog_files = list_csvs(train_defog_dir)
tdcs_files = list_csvs(train_tdcs_dir)
print(f"\n#defog files: {len(defog_files)} | #tdcsfog files: {len(tdcs_files)}")

# Preview the first available series file: defog is preferred, tdcsfog is
# used only when no defog file was found.
example_path = next(iter(defog_files or tdcs_files), None)
if example_path is None:
    print("[WARN] No series CSV found under train/defog or train/tdcsfog")
else:
    # Only the first five rows are read; enough to inspect the columns.
    ex = pd.read_csv(example_path, nrows=5)
    print("Example series file:", example_path.name, ex.shape)
    print("Columns:", list(ex.columns))
    display(ex.head())

# ----------------------------------------------------------
# Task 2: Missing values & outliers handling for time-series
# ----------------------------------------------------------
# Strategy
# - Detect a time column (common names: Time, time, Timestamp, timestamp). If found:
#     * coerce to numeric, drop NaNs in time, sort, drop duplicate timestamps.
# - If a 'Valid' column exists (0/1), drop rows where Valid == 0.
# - For numeric columns (excluding the time column):
#     * Robust z-score using median & MAD; mark |z| > threshold as outliers -> set NaN.
#     * Linear interpolation (limit consecutive gap length), then ffill/bfill as fallback.
# - Save cleaned CSVs to /kaggle/working/cleaned/{defog|tdcsfog}/
# - Produce a per-file summary CSV with counts.

# Output root for cleaned data; one sub-folder per training set is created up front.
CLEAN_ROOT = Path("/kaggle/working/cleaned")
for _subset in ("defog", "tdcsfog"):
    CLEAN_ROOT.joinpath(_subset).mkdir(parents=True, exist_ok=True)

# Robust z-score magnitude above which a value is treated as an outlier
# (a smaller value removes more points).
OUTLIER_Z_THRESHOLD = 8.0
# Passed as `limit` to Series.interpolate: the maximum number of consecutive
# NaNs filled by interpolation; anything left is handled by ffill/bfill.
INTERP_LIMIT = 10

def robust_zscore(x: pd.Series) -> pd.Series:
    """Compute median/MAD based z-scores for ``x``.

    Values are coerced to numeric first (unparsable entries become NaN and
    are ignored by the median). The score is
    ``(x - median) / (1.4826 * MAD)``. When the MAD is zero or not finite,
    a series of zeros with the same index is returned instead.
    """
    values = pd.to_numeric(x, errors="coerce")
    center = np.nanmedian(values)
    deviations = values - center
    spread = np.nanmedian(deviations.abs())

    # No usable spread: report every point as "not an outlier".
    degenerate = (spread == 0) or (not np.isfinite(spread))
    if degenerate:
        return pd.Series(np.zeros(len(values)), index=values.index)

    scale = 1.4826 * spread
    return deviations / scale

def detect_time_col(df: pd.DataFrame) -> Optional[str]:
    """Return the label of the column to treat as time, or None.

    A column named "Time", "time", "Timestamp" or "timestamp" wins, checked
    in that order. Otherwise the first column is used if, after numeric
    coercion, its values are monotonically non-decreasing.

    Args:
        df: Table to inspect; it is not modified.

    Returns:
        The column label, or None when no candidate qualifies (including
        when ``df`` has no columns at all).
    """
    for c in ("Time", "time", "Timestamp", "timestamp"):
        if c in df.columns:
            return c

    # A frame without columns has no "first column" to fall back to.
    if len(df.columns) == 0:
        return None

    # Fallback: if first column is numeric and non-decreasing, treat it as time.
    # Values that cannot be parsed become NaN, which makes the monotonic
    # check fail, so non-numeric columns are rejected here as well.
    first = df.columns[0]
    s = pd.to_numeric(df[first], errors="coerce")
    if s.is_monotonic_increasing:
        return first
    return None

def clean_timeseries_df(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """Clean one time-series table and report what was changed.

    Steps, in order:
      1. If ``detect_time_col`` finds a time column: coerce it to numeric,
         drop rows whose time is NaN, sort by time and keep only the first
         row of each duplicated timestamp.
      2. If a ``Valid`` column exists: drop rows whose value coerces to 0.
         Values that cannot be coerced become NaN and are kept.
      3. For every numeric-dtype column except the time column: replace
         values whose robust z-score magnitude exceeds
         ``OUTLIER_Z_THRESHOLD`` with NaN, interpolate linearly
         (``limit=INTERP_LIMIT``, both directions), then forward- and
         backward-fill whatever is still missing.

    Args:
        df: Raw table. It is copied, so the caller's frame is not modified.

    Returns:
        ``(cleaned, report)`` where ``report`` holds the integer counters
        ``dropped_bad_time_rows``, ``dropped_valid0``, ``outliers_total``
        and ``final_nans`` (NaNs remaining across all columns).
    """
    report: Dict[str, int] = {}
    d = df.copy()

    # 1) Time column checks.
    tcol = detect_time_col(d)
    # Compare against None explicitly: a falsy label such as 0 or "" is
    # still a valid column label.
    has_time = tcol is not None
    if has_time:
        d[tcol] = pd.to_numeric(d[tcol], errors="coerce")
        before = len(d)
        d = d.dropna(subset=[tcol])
        d = d.sort_values(tcol)
        d = d.loc[~d[tcol].duplicated(keep="first")]
        report["dropped_bad_time_rows"] = int(before - len(d))
    else:
        report["dropped_bad_time_rows"] = 0

    # 2) Drop invalid rows if 'Valid' exists.
    if "Valid" in d.columns:
        before = len(d)
        d = d.loc[pd.to_numeric(d["Valid"], errors="coerce") != 0]
        report["dropped_valid0"] = int(before - len(d))
    else:
        report["dropped_valid0"] = 0

    # 3) Outliers & missing values for numeric columns (excluding time).
    num_cols = d.select_dtypes(include=[np.number]).columns.tolist()
    if has_time and tcol in num_cols:
        num_cols.remove(tcol)

    outlier_total = 0
    if has_time:
        d = d.set_index(tcol)

    for c in num_cols:
        outliers = robust_zscore(d[c]).abs() > OUTLIER_Z_THRESHOLD
        outlier_total += int(outliers.sum())
        # Series.mask returns a new (upcast if needed) series. Assigning NaN
        # in place via .loc into an integer column relies on the setitem
        # upcasting that pandas has deprecated.
        col = d[c].mask(outliers)
        # Interpolate small gaps, then ffill/bfill whatever remains.
        col = col.interpolate(method="linear", limit=INTERP_LIMIT, limit_direction="both")
        d[c] = col.ffill().bfill()

    if has_time:
        d = d.reset_index()

    report["outliers_total"] = outlier_total
    report["final_nans"] = int(d.isna().sum().sum())
    return d, report

def process_folder(src_dir: Path, dst_dir: Path, limit: Optional[int] = None) -> pd.DataFrame:
    """Clean every CSV in ``src_dir`` and write the results to ``dst_dir``.

    Each file is read, passed through ``clean_timeseries_df`` and saved under
    the same file name in ``dst_dir``. Processing is best effort: a file that
    fails is reported and logged, and the batch continues.

    Args:
        src_dir: Folder whose ``*.csv`` files are processed (via ``list_csvs``).
        dst_dir: Folder receiving the cleaned files; created if missing.
        limit: If given, only the first ``limit`` files are processed.

    Returns:
        One row per file. Successful rows carry ``file``, ``rows_in``,
        ``rows_out`` and the counters from the cleaning report; failed rows
        carry ``file`` and ``error``.
    """
    files = list_csvs(src_dir)
    if limit is not None:
        files = files[:limit]

    # Create the destination up front; otherwise every to_csv below would
    # fail and each file would end up as an error row.
    dst_dir.mkdir(parents=True, exist_ok=True)

    logs = []
    total = len(files)
    for i, p in enumerate(files, 1):
        try:
            raw = pd.read_csv(p)
            cleaned, rep = clean_timeseries_df(raw)
            cleaned.to_csv(dst_dir / p.name, index=False)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            # The failure is surfaced immediately and kept in the log frame.
            print(f"[WARN] Failed to clean {p.name}: {e}")
            logs.append({"file": p.name, "error": str(e)})
        else:
            logs.append({
                "file": p.name,
                "rows_in": int(len(raw)),
                "rows_out": int(len(cleaned)),
                **rep,
            })

        if i % 50 == 0:
            print(f"...processed {i}/{total}")

    log_df = pd.DataFrame(logs)
    # NOTE(review): `display` is not defined in this file; presumably the
    # IPython/Jupyter builtin. Confirm before running as a plain script.
    display(log_df.head(10))
    return log_df

# Clean each training set that has files and store a per-file summary CSV
# next to the cleaned folders. Entries: key, files found, source folder,
# banner text, summary file name.
summaries = {}
cleaning_jobs = (
    ("defog", defog_files, train_defog_dir, "\n[Clean] DEFoG series", "clean_summary_defog.csv"),
    ("tdcsfog", tdcs_files, train_tdcs_dir, "\n[Clean] tDCSFoG series", "clean_summary_tdcsfog.csv"),
)
for job_key, job_files, job_src, job_banner, job_summary_name in cleaning_jobs:
    if not job_files:
        # Nothing was found for this set earlier; skip it entirely.
        continue
    print(job_banner)
    summaries[job_key] = process_folder(job_src, CLEAN_ROOT / job_key)
    job_summary_path = CLEAN_ROOT / job_summary_name
    summaries[job_key].to_csv(job_summary_path, index=False)
    print("Saved:", job_summary_path)

# ---------------------------
# Light cleaning for meta CSV
# ---------------------------
# Each root table loaded earlier is de-duplicated, passed through
# convert_dtypes() and written to CLEAN_ROOT/meta under its original stem.
meta_out = CLEAN_ROOT.joinpath("meta")
meta_out.mkdir(parents=True, exist_ok=True)
for table_name, table in meta_tables.items():
    deduped = table.drop_duplicates()
    typed = deduped.convert_dtypes()
    typed.to_csv(meta_out / f"{table_name}.csv", index=False)
    print(f"[META SAVED] {table_name}.csv -> {typed.shape}")
    display(typed.head(5))

print("\n=== DONE ===")
print("Cleaned data root:", CLEAN_ROOT)

/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/subjects.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tasks.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/daily_metadata.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/events.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/unlabeled/48b636e0f5.parquet
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/unlabeled/9fb7805d99.parquet
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/unlabeled/e658b0aa3d.parquet
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/unlabeled/1c3719ea59.parquet
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/unlabeled/164adaed7b.parquet
/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/unlabeled/24

Unnamed: 0,Id,Subject,Visit,Beginning of recording [00:00-23:59]
0,00c4c9313d,fba3a3,1,10:19
1,07a96f89ec,7da72f,1,07:30
2,0d1bc672a8,056372,2,08:30
3,0e333c9833,b4bd22,1,11:30
4,164adaed7b,9f72eb,1,13:00


[LOAD] defog_metadata.csv -> shape=(137, 4)


Unnamed: 0,Id,Subject,Visit,Medication
0,02ab235146,e1f62e,2,on
1,02ea782681,ae2d35,2,on
2,06414383cf,8c1f5e,2,off
3,092b4c1819,2874c5,1,off
4,0a900ed8a2,0e3d49,2,on


[LOAD] events.csv -> shape=(3544, 5)


Unnamed: 0,Id,Init,Completion,Type,Kinetic
0,003f117e14,8.61312,14.7731,Turn,1.0
1,009ee11563,11.3847,41.1847,Turn,1.0
2,009ee11563,54.6647,58.7847,Turn,1.0
3,011322847a,28.0966,30.2966,Turn,1.0
4,01d0fe7266,30.3184,31.8784,Turn,1.0


[LOAD] sample_submission.csv -> shape=(286370, 4)


Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0,0,0
1,003f117e14_1,0,0,0
2,003f117e14_2,0,0,0
3,003f117e14_3,0,0,0
4,003f117e14_4,0,0,0


[LOAD] subjects.csv -> shape=(173, 8)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Subject,Visit,Age,Sex,YearsSinceDx,UPDRSIII_On,UPDRSIII_Off,NFOGQ
0,00f674,2.0,63,M,27.0,43.0,49.0,24
1,00f674,1.0,63,M,27.0,31.0,30.0,26
2,02bc69,,69,M,4.0,21.0,,22
3,040587,2.0,75,M,26.0,52.0,69.0,21
4,040587,1.0,75,M,26.0,47.0,75.0,24


[LOAD] tasks.csv -> shape=(2817, 4)


Unnamed: 0,Id,Begin,End,Task
0,02ab235146,10.0,190.48,Rest1
1,02ab235146,211.24,271.56,Rest2
2,02ab235146,505.88,522.4,4MW
3,02ab235146,577.96,594.64,4MW-C
4,02ab235146,701.32,715.28,MB1


[LOAD] tdcsfog_metadata.csv -> shape=(833, 5)


Unnamed: 0,Id,Subject,Visit,Test,Medication
0,003f117e14,4dc2f8,3,2,on
1,009ee11563,f62eec,4,2,on
2,011322847a,231c3b,2,2,on
3,01d0fe7266,231c3b,2,1,off
4,024418ba39,fa8764,19,3,on



TRAIN folder contents: ['defog', 'notype', 'tdcsfog'] ...

TEST folder contents: ['defog', 'tdcsfog'] ...

UNLABELED folder contents: ['00c4c9313d.parquet', '07a96f89ec.parquet', '0d1bc672a8.parquet', '0e333c9833.parquet', '164adaed7b.parquet', '17e0c0dc86.parquet', '1c3719ea59.parquet', '1cf80df2d6.parquet', '24016102f2.parquet', '276630050d.parquet'] ...

#defog files: 91 | #tdcsfog files: 833
Example series file: 02ea782681.csv (5, 9)
Columns: ['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn', 'Walking', 'Valid', 'Task']


Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,Valid,Task
0,0,-1.0,0.044129,-0.25,0,0,0,False,False
1,1,-1.0,0.034431,-0.25,0,0,0,False,False
2,2,-1.0,0.03125,-0.25,0,0,0,False,False
3,3,-1.0,0.03125,-0.25,0,0,0,False,False
4,4,-1.0,0.03125,-0.25,0,0,0,False,False



[Clean] DEFoG series
...processed 50/91


Unnamed: 0,file,rows_in,rows_out,dropped_bad_time_rows,dropped_valid0,outliers_total,final_nans
0,02ea782681.csv,162907,36873,0,126034,1919,0
1,06414383cf.csv,109125,39442,0,69683,67,0
2,092b4c1819.csv,182660,20176,0,162484,6,0
3,0c55be4384.csv,122506,25949,0,96557,14,0
4,0d7ab3a9f9.csv,95685,37727,0,57958,221,0
5,0eaac04f17.csv,197621,47292,0,150329,1348,0
6,0ec76d2d8e.csv,159153,33231,0,125922,740,0
7,139f60d29b.csv,93762,25505,0,68257,178,0
8,13a4fe5159.csv,189216,75466,0,113750,3128,0
9,15508c7f41.csv,127373,30084,0,97289,293,0


Saved: /kaggle/working/cleaned/clean_summary_defog.csv

[Clean] tDCSFoG series
...processed 50/833
...processed 100/833
...processed 150/833
...processed 200/833
...processed 250/833
...processed 300/833
...processed 350/833
...processed 400/833
...processed 450/833
...processed 500/833
...processed 550/833
...processed 600/833
...processed 650/833
...processed 700/833
...processed 750/833
...processed 800/833


Unnamed: 0,file,rows_in,rows_out,dropped_bad_time_rows,dropped_valid0,outliers_total,final_nans
0,003f117e14.csv,4682,4682,0,0,15,0
1,009ee11563.csv,9920,9920,0,0,20,0
2,011322847a.csv,5187,5187,0,0,8,0
3,01d0fe7266.csv,7120,7120,0,0,100,0
4,024418ba39.csv,5254,5254,0,0,89,0
5,024ba3ebd5.csv,3275,3275,0,0,4,0
6,02e8454f57.csv,3749,3749,0,0,3,0
7,02edc527c0.csv,5661,5661,0,0,79,0
8,0330ea6680.csv,5507,5507,0,0,2,0
9,0383be6b43.csv,9215,9215,0,0,28,0


Saved: /kaggle/working/cleaned/clean_summary_tdcsfog.csv
[META SAVED] daily_metadata.csv -> (65, 4)


Unnamed: 0,Id,Subject,Visit,Beginning of recording [00:00-23:59]
0,00c4c9313d,fba3a3,1,10:19
1,07a96f89ec,7da72f,1,07:30
2,0d1bc672a8,056372,2,08:30
3,0e333c9833,b4bd22,1,11:30
4,164adaed7b,9f72eb,1,13:00


[META SAVED] defog_metadata.csv -> (137, 4)


Unnamed: 0,Id,Subject,Visit,Medication
0,02ab235146,e1f62e,2,on
1,02ea782681,ae2d35,2,on
2,06414383cf,8c1f5e,2,off
3,092b4c1819,2874c5,1,off
4,0a900ed8a2,0e3d49,2,on


[META SAVED] events.csv -> (3544, 5)


Unnamed: 0,Id,Init,Completion,Type,Kinetic
0,003f117e14,8.61312,14.7731,Turn,1
1,009ee11563,11.3847,41.1847,Turn,1
2,009ee11563,54.6647,58.7847,Turn,1
3,011322847a,28.0966,30.2966,Turn,1
4,01d0fe7266,30.3184,31.8784,Turn,1


[META SAVED] sample_submission.csv -> (286370, 4)


Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0,0,0
1,003f117e14_1,0,0,0
2,003f117e14_2,0,0,0
3,003f117e14_3,0,0,0
4,003f117e14_4,0,0,0


[META SAVED] subjects.csv -> (173, 8)


Unnamed: 0,Subject,Visit,Age,Sex,YearsSinceDx,UPDRSIII_On,UPDRSIII_Off,NFOGQ
0,00f674,2.0,63,M,27.0,43,49.0,24
1,00f674,1.0,63,M,27.0,31,30.0,26
2,02bc69,,69,M,4.0,21,,22
3,040587,2.0,75,M,26.0,52,69.0,21
4,040587,1.0,75,M,26.0,47,75.0,24


[META SAVED] tasks.csv -> (2817, 4)


Unnamed: 0,Id,Begin,End,Task
0,02ab235146,10.0,190.48,Rest1
1,02ab235146,211.24,271.56,Rest2
2,02ab235146,505.88,522.4,4MW
3,02ab235146,577.96,594.64,4MW-C
4,02ab235146,701.32,715.28,MB1


[META SAVED] tdcsfog_metadata.csv -> (833, 5)


Unnamed: 0,Id,Subject,Visit,Test,Medication
0,003f117e14,4dc2f8,3,2,on
1,009ee11563,f62eec,4,2,on
2,011322847a,231c3b,2,2,on
3,01d0fe7266,231c3b,2,1,off
4,024418ba39,fa8764,19,3,on



=== DONE ===
Cleaned data root: /kaggle/working/cleaned
