Create a merged CSV for the late-fusion model:

In [2]:
import pandas as pd

In [None]:
# Input locations of the MRI metadata table and the clinical table.
# NOTE(review): the directory prefix looks like a placeholder — presumably it
# must be replaced with the real location before running; confirm.
mri_csv, clinical_csv = (
    "PATH-on-Charité-HPC/MRI_DIAGNOSIS.csv",
    "PATH-on-Charité-HPC/clinical-5.csv",
)

In [9]:
# Load both source tables. RID is forced to text in the MRI table so pandas
# does not infer a numeric dtype for that identifier column.
mri_df = pd.read_csv(filepath_or_buffer=mri_csv, dtype={"RID": str})
clinical_df = pd.read_csv(filepath_or_buffer=clinical_csv)

Keep only CN (code 0) and AD (code 2) patients in the MRI metadata, then recode AD to 1 so that the final labels are CN=0 and AD=1:

In [14]:
# Keep only rows whose diagnosis code is 0 or 2, parse the exam date, and
# recode 2 -> 1 so the remaining codes are 0 and 1.
# NOTE: this cell is not re-runnable on its own output — once 2 has been
# recoded to 1, the isin([0, 2]) filter would discard those rows.
mri_df = mri_df.loc[mri_df["Diagnosis_Code"].isin([0, 2])].assign(
    EXAMDATE=lambda frame: pd.to_datetime(frame["EXAMDATE"]),
    Diagnosis_Code=lambda frame: frame["Diagnosis_Code"].replace({2: 1}),
)

In [15]:
# Sanity check: list the distinct diagnosis codes left after filtering/recoding.
mri_df["Diagnosis_Code"].unique()

array([0, 1])

Remove unnecessary columns:

In [25]:
# Columns of the MRI table that the merged file needs.
mri_cols_to_keep = ["PTID", "EXAMDATE", "Diagnosis_Code"]
# Select by boolean column mask: keeps the table's existing column order and
# silently skips any listed name that is not present.
wanted_mask = mri_df.columns.isin(mri_cols_to_keep)
mri_df = mri_df.loc[:, wanted_mask]

In [26]:
# Confirm which MRI columns remain after the selection.
mri_df.columns

Index(['PTID', 'EXAMDATE', 'Diagnosis_Code'], dtype='object')

Keep only CN (label 0) and AD (label 2) patients in the clinical data, then recode AD to 1 so that the final labels are CN=0 and AD=1:

In [16]:
# Keep only rows labelled 0 or 2 and recode 2 -> 1, mirroring the MRI table.
# NOTE: not re-runnable on its own output — after recoding, the isin([0, 2])
# filter would discard the rows now labelled 1.
clinical_df = clinical_df.loc[clinical_df["label"].isin([0, 2])].assign(
    label=lambda frame: frame["label"].replace({2: 1})
)

In [6]:
# List the clinical columns before the genotype encoding.
clinical_df.columns

Index(['PTID', 'RID', 'label', 'NXVISUAL', 'NXAUDITO', 'NXTREMOR', 'NXCONSCI',
       'NXNERVE', 'NXMOTOR', 'NXFINGER', 'NXHEEL', 'NXSENSOR', 'NXTENDON',
       'NXPLANTA', 'NXGAIT', 'NXABNORM', 'TOTAL11', 'TOTALMOD', 'MMSCORE',
       'GDTOTAL', 'HMSCORE', 'MH4CARD', 'MH5RESP', 'NPISCORE', 'GENOTYPE'],
      dtype='object')

One-Hot encoding of the Genotype column:

In [17]:
# Replace the categorical GENOTYPE column with one indicator column per
# observed genotype value (named GENO_<value>), appended after the other columns.
genotype_col = "GENOTYPE"
genotype_dummies = pd.get_dummies(clinical_df[genotype_col], prefix="GENO")
clinical_df = pd.concat(
    [clinical_df.drop(columns=[genotype_col]), genotype_dummies],
    axis="columns",
)

In [18]:
# List the clinical columns after encoding; GENOTYPE is replaced by GENO_* columns.
clinical_df.columns

Index(['PTID', 'RID', 'label', 'NXVISUAL', 'NXAUDITO', 'NXTREMOR', 'NXCONSCI',
       'NXNERVE', 'NXMOTOR', 'NXFINGER', 'NXHEEL', 'NXSENSOR', 'NXTENDON',
       'NXPLANTA', 'NXGAIT', 'NXABNORM', 'TOTAL11', 'TOTALMOD', 'MMSCORE',
       'GDTOTAL', 'HMSCORE', 'MH4CARD', 'MH5RESP', 'NPISCORE', 'GENO_2/2',
       'GENO_2/3', 'GENO_2/4', 'GENO_3/3', 'GENO_3/4', 'GENO_4/4'],
      dtype='object')

Drop RID (equivalent to PTID):

In [21]:
# Remove the RID column; errors="ignore" makes this a no-op when RID is
# already gone, so the cell can be re-run safely.
clinical_df = clinical_df.drop(labels="RID", axis="columns", errors="ignore")

Merge clinical and MRI metadata:

In [27]:
# Inner join on PTID: only patients present in both tables are kept.
# NOTE(review): if clinical_df holds more than one row per PTID, each MRI row
# is duplicated once per clinical row — confirm one clinical row per patient.
merged = mri_df.merge(clinical_df, how="inner", on="PTID")

In [30]:
# Inspect the merged columns.
# NOTE(review): on a top-to-bottom run 'label' (from clinical_df) is still
# present at this point; the saved output lacks it because the cell that drops
# 'label' was executed first (In [29] before In [30]).
merged.columns

Index(['PTID', 'EXAMDATE', 'Diagnosis_Code', 'NXVISUAL', 'NXAUDITO',
       'NXTREMOR', 'NXCONSCI', 'NXNERVE', 'NXMOTOR', 'NXFINGER', 'NXHEEL',
       'NXSENSOR', 'NXTENDON', 'NXPLANTA', 'NXGAIT', 'NXABNORM', 'TOTAL11',
       'TOTALMOD', 'MMSCORE', 'GDTOTAL', 'HMSCORE', 'MH4CARD', 'MH5RESP',
       'NPISCORE', 'GENO_2/2', 'GENO_2/3', 'GENO_2/4', 'GENO_3/3', 'GENO_3/4',
       'GENO_4/4'],
      dtype='object')

Keep only one diagnosis column: drop the clinical `label` and keep the MRI `Diagnosis_Code`:

In [29]:
# Diagnosis_Code is the diagnosis column that stays; the clinical 'label' is
# removed. errors="ignore" keeps the cell re-runnable.
# NOTE(review): nothing here checks that 'label' and Diagnosis_Code agree for
# each row before 'label' is discarded — confirm that is intended.
merged = merged.drop(labels="label", axis="columns", errors="ignore")

Check if there are patients without diagnoses:

In [34]:
# Distinct values of the missing-diagnosis mask; a lone False means no row
# lacks a diagnosis.
merged["Diagnosis_Code"].isna().unique()

array([False])

Final version:

In [None]:
# Persist the merged table without the pandas index column.
output_csv = "PATH-on-Charité-HPC/merged_multimodal.csv"
merged.to_csv(path_or_buf=output_csv, index=False)

Check the input dimension for the clinical model:

In [None]:
# Re-load the file that was just written. Reading from `output_csv` (instead
# of the bare filename "merged_multimodal.csv") guarantees this check inspects
# the same file the previous cell produced, regardless of the kernel's
# working directory.
df = pd.read_csv(output_csv)
df.head()

In [4]:
# Every column except the identifier, exam date and target counts as a
# clinical input feature; the count is the clinical model's input dimension.
non_feature_cols = ("PTID", "EXAMDATE", "Diagnosis_Code")
clinical_cols = [name for name in df.columns if name not in non_feature_cols]
print("Input dim:", len(clinical_cols))

Input dim: 27
