In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [8]:
# Directories holding the per-recording training CSVs of each dataset.
train_dir_tdcsfog = "./data/train/tdcsfog/"
train_dir_defog = "./data/train/defog/"

# Load both metadata tables; "fold" starts out empty and is filled in once the
# cross-validation split has been computed.
metadata_tdcsfog = pd.read_csv("./data/tdcsfog_metadata.csv")
metadata_defog = pd.read_csv("./data/defog_metadata.csv")
metadata_tdcsfog["fold"] = None
metadata_defog["fold"] = None

# One 5-fold splitter shared by both datasets; shuffling with a fixed
# random_state keeps the fold assignment repeatable across runs.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# tdcsfog
# Every row is given the same dummy label, so there is nothing to stratify on;
# what matters is that all recordings of one "Subject" land in the same fold.
tdcsfog_dummy_labels = [1] * len(metadata_tdcsfog)
tdcsfog_splits = sgkf.split(
    X=metadata_tdcsfog["Id"],
    y=tdcsfog_dummy_labels,
    groups=metadata_tdcsfog["Subject"],
)
for fold_id, (_train_rows, holdout_rows) in enumerate(tdcsfog_splits):
    # The splitter yields row positions; .loc accepts them here because the
    # frame still carries the default 0..n-1 index it was loaded with.
    metadata_tdcsfog.loc[holdout_rows, "fold"] = fold_id

# Path of each recording's CSV, built from its Id.
metadata_tdcsfog["fpath"] = [
    f"{train_dir_tdcsfog}{recording_id}.csv" for recording_id in metadata_tdcsfog["Id"]
]

# defog
# Gather per-recording label totals and row counts, then keep only the
# recordings whose CSV exists under train_dir_defog and has at least one row.
# NOTE: the n*_sum totals are recorded for inspection only; the filter below
# looks at "count", so recordings without any events are still kept.
def _defog_file_stats(fpath):
    """Summarise one defog recording.

    Args:
        fpath: Path to the recording's CSV file.

    Returns:
        Tuple ``(n1_sum, n2_sum, n3_sum, count)``: the column sums of
        "StartHesitation", "Turn" and "Walking", followed by the number of
        rows in the file. All four values are 0 when the file does not exist.
    """
    if not os.path.exists(fpath):
        return 0, 0, 0, 0

    df = pd.read_csv(fpath)
    return (
        np.sum(df["StartHesitation"]),
        np.sum(df["Turn"]),
        np.sum(df["Walking"]),
        len(df),
    )


# Single pass over the Ids: each row gets the stats of its own file, so no
# boolean Id mask has to be rebuilt for every value written.
defog_stat_columns = ["n1_sum", "n2_sum", "n3_sum", "count"]
defog_stats = [_defog_file_stats(f"{train_dir_defog}{f}.csv") for f in metadata_defog["Id"]]
for position, column in enumerate(defog_stat_columns):
    metadata_defog[column] = [stats[position] for stats in defog_stats]

# reset_index() gives the surviving rows a fresh 0..n-1 index (the previous
# labels are kept in an "index" column); the fold assignment further down
# relies on that, since it writes row positions through .loc.
metadata_defog = metadata_defog[metadata_defog["count"] > 0].reset_index()

# Subject-grouped fold assignment for the filtered defog rows; the dummy label
# is identical for every row, so only the "Subject" grouping shapes the split.
defog_dummy_labels = [1] * len(metadata_defog)
defog_splits = sgkf.split(
    X=metadata_defog["Id"],
    y=defog_dummy_labels,
    groups=metadata_defog["Subject"],
)
for fold_id, (_train_rows, holdout_rows) in enumerate(defog_splits):
    # Row positions from the splitter double as .loc labels because the frame
    # was re-indexed 0..n-1 after filtering.
    metadata_defog.loc[holdout_rows, "fold"] = fold_id

# Path of each recording's CSV, built from its Id.
metadata_defog["fpath"] = [
    f"{train_dir_defog}{recording_id}.csv" for recording_id in metadata_defog["Id"]
]

# Tag every row with the dataset it came from.
metadata_defog["type"] = "defog"
metadata_tdcsfog["type"] = "tdcs"

# Stack the two tables (defog rows first) with a fresh index, keeping only the
# columns that end up in the exported metadata file.
export_columns = ["fpath", "type", "fold"]
metadata = pd.concat(
    [metadata_defog[export_columns], metadata_tdcsfog[export_columns]],
    ignore_index=True,
)

In [9]:
# Preview the first five rows of the combined table.
metadata.head(n=5)

Unnamed: 0,fpath,type,fold
0,./data/train/defog/02ea782681.csv,defog,4
1,./data/train/defog/06414383cf.csv,defog,3
2,./data/train/defog/092b4c1819.csv,defog,0
3,./data/train/defog/0c55be4384.csv,defog,4
4,./data/train/defog/0d7ab3a9f9.csv,defog,3


In [10]:
# Make sure the output directory exists, then write the combined table without
# its index column. makedirs(exist_ok=True) replaces the separate
# exists-check + mkdir: it also creates missing parent directories and does
# not fail if the directory appears between the check and the creation.
os.makedirs("data/preprocessed", exist_ok=True)
metadata.to_csv("data/preprocessed/metadata.csv", index=False)