## Create "meta_csv" from the metadata and diagnosis files:

In [2]:
import pandas as pd

In [8]:
# Single place to edit when the ADNI MRI clinical tables are moved.
ADNI_MRI_DIR = "/sc-projects/sc-proj-ukb-cvd/projects/theses/data/adni/Clinical_Data/MRI"

# meta_df: MRIMETA export; diag_df: DXSUM export (both dated 13 Apr 2025 per file name).
meta_df = pd.read_csv(f"{ADNI_MRI_DIR}/MRI_MRIMETA_13Apr2025.csv")
diag_df = pd.read_csv(f"{ADNI_MRI_DIR}/MRI_DXSUM_13Apr2025.csv")

Duplicate entries for PTIDs:

In [None]:
# Count rows per PTID and show only the PTIDs that occur more than once.
rows_per_ptid = meta_df["PTID"].value_counts()
print("Duplicate Patients: ", rows_per_ptid[rows_per_ptid > 1])

Sort by patient ID and exam date:

In [None]:
# Parse exam dates, then discard rows whose date is missing.
exam_dates = pd.to_datetime(meta_df["EXAMDATE"])
meta_df = meta_df.assign(EXAMDATE=exam_dates).loc[exam_dates.notna()]

# Within each RID (ascending), order rows from newest to oldest exam date.
sort_keys = ["RID", "EXAMDATE"]
meta_df_sorted = meta_df.sort_values(by=sort_keys, ascending=[True, False])
meta_df_sorted

Assign each scan the diagnosis whose exam date is closest to the scan's exam date:

In [12]:
# Convert the diagnosis exam dates to datetimes so they can be subtracted from scan dates.
diag_df = diag_df.assign(EXAMDATE=pd.to_datetime(diag_df["EXAMDATE"]))

In [15]:
def find_closest_diagnosis(row, diagnoses=None):
    """Return the diagnosis recorded closest in time to one scan.

    Parameters
    ----------
    row : pandas.Series or mapping
        A single scan record providing "RID" and a datetime "EXAMDATE".
    diagnoses : pandas.DataFrame, optional
        Table with "RID", "EXAMDATE" (datetime) and "DIAGNOSIS" columns.
        When omitted, the notebook-level ``diag_df`` is used, so the function
        can still be passed directly to ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The "DIAGNOSIS" value of the same-RID row with the smallest absolute
    difference in exam date (first such row on ties), or ``None`` when the
    subject has no row with both a diagnosis and a date, or when the scan
    itself has no exam date.
    """
    if diagnoses is None:
        diagnoses = diag_df

    # Keep only this subject's rows and only the two columns that are needed;
    # rows missing either value cannot take part in the match.
    same_subject = diagnoses["RID"] == row["RID"]
    candidates = diagnoses.loc[same_subject, ["EXAMDATE", "DIAGNOSIS"]].dropna()
    if candidates.empty:
        return None

    gaps = (candidates["EXAMDATE"] - row["EXAMDATE"]).abs()
    # A missing scan date turns every gap into NaT; there is nothing to match.
    if gaps.isna().all():
        return None

    # Select by position rather than by label so that duplicated index labels
    # in `diagnoses` cannot make the lookup return several rows.
    nearest_pos = gaps.to_numpy().argmin()
    return candidates["DIAGNOSIS"].iloc[nearest_pos]

# Row-wise lookup: every scan gets the diagnosis returned by find_closest_diagnosis.
matched_diagnosis = meta_df.apply(find_closest_diagnosis, axis=1)
meta_df = meta_df.assign(DIAGNOSIS=matched_diagnosis)

In [16]:
# Keep only the scans for which a diagnosis was found.
has_diagnosis = meta_df["DIAGNOSIS"].notna()
meta_clean = meta_df.loc[has_diagnosis]

Map diagnosis numbers: 1 → 0 (CN), 2 → 1 (MCI), 3 → 2 (AD)

In [17]:
# DIAGNOSIS value -> zero-based class code.
diagnosis_map = {
    1.0: 0,  # CN
    2.0: 1,  # MCI
    3.0: 2,  # AD
}

In [18]:
# Translate DIAGNOSIS through diagnosis_map into a new Diagnosis_Code column.
diagnosis_codes = meta_clean["DIAGNOSIS"].map(diagnosis_map)
meta_clean = meta_clean.assign(Diagnosis_Code=diagnosis_codes)

In [20]:
# Persist the labelled scan table (without the index) to the working directory.
meta_clean.to_csv(path_or_buf="MRI_DIAGNOSIS.csv", index=False)

Check for class imbalance:

In [3]:
# Reload the labelled scan table written above.
df = pd.read_csv(filepath_or_buffer="MRI_DIAGNOSIS.csv")

In [23]:
# Number of scans per diagnosis code, most frequent code first.
label_counts = df["Diagnosis_Code"].value_counts()
total_scans = label_counts.sum()
print(label_counts)
print(f"Total: {total_scans} scans")

Diagnosis_Code
1    1452
0    1122
2    1033
Name: count, dtype: int64
Total: 3607 scans


In [4]:
# Keep codes 0 and 2 only, then recode them to the binary labels 0 and 1.
is_cn_or_ad = df["Diagnosis_Code"].isin([0, 2])
df = df.loc[is_cn_or_ad].assign(
    Diagnosis_Code=lambda scans: scans["Diagnosis_Code"].map({0: 0, 2: 1})
)

In [5]:
# One group of scans per PTID.
grouped = df.groupby(by="PTID")

In [None]:
# Number of distinct binary labels seen for each PTID.
codes_by_ptid = grouped["Diagnosis_Code"]
label_counts = codes_by_ptid.nunique()
label_counts

In [9]:
# PTIDs whose scans carry more than one distinct label.
has_multiple_labels = label_counts > 1
mixed_label_ptids = label_counts[has_multiple_labels]
print(f"PTIDs with more than one label: {len(mixed_label_ptids)}")

PTIDs with more than one label: 11


In [None]:
# PTIDs whose scans all share exactly one label.
has_one_label = label_counts == 1
single_label_ptids = label_counts[has_one_label]
single_label_ptids

In [13]:
# PTIDs that have more than one scan ...
scans_per_ptid = df["PTID"].value_counts()
duplicated_ptids = scans_per_ptid[scans_per_ptid > 1].index
duplicated_df = df[df["PTID"].isin(duplicated_ptids)]

# ... and, among those, the ones whose scans all carry the same label.
duplicated_grouped = duplicated_df.groupby("PTID")["Diagnosis_Code"].nunique()
is_single_label = duplicated_grouped == 1
pure_duplicated_ptids = duplicated_grouped[is_single_label].index

In [14]:
# Take one label per multi-scan, single-label PTID and count PTIDs per label.
pure_dup_df = df[df["PTID"].isin(pure_duplicated_ptids)]
label_per_ptid = pure_dup_df.groupby("PTID")["Diagnosis_Code"].first()
pure_dup_counts = label_per_ptid.value_counts()

only_label_0 = pure_dup_counts.get(0, 0)
only_label_1 = pure_dup_counts.get(1, 0)
print(f"Duplicated PTIDs with only label 0: {only_label_0}")
print(f"Duplicated PTIDs with only label 1: {only_label_1}")

Duplicated PTIDs with only label 0: 178
Duplicated PTIDs with only label 1: 259


In [15]:
# Scans of the PTIDs that appear exactly once in df.
ptid_counts = df["PTID"].value_counts()
appears_once = ptid_counts == 1
single_scan_ptids = ptid_counts[appears_once].index
single_scan_df = df[df["PTID"].isin(single_scan_ptids)]

In [16]:
# Label split among the PTIDs that have a single scan.
single_label_distribution = single_scan_df["Diagnosis_Code"].value_counts()
single_scan_label_0 = single_label_distribution.get(0, 0)
single_scan_label_1 = single_label_distribution.get(1, 0)
print(f"Non-duplicated PTIDs with label 0: {single_scan_label_0}")
print(f"Non-duplicated PTIDs with label 1: {single_scan_label_1}")

Non-duplicated PTIDs with label 0: 8
Non-duplicated PTIDs with label 1: 53


In [17]:
# One row per PTID: the first non-null value of each column in df's current row order.
first_per_ptid = df.groupby("PTID").first()
ptid_label_df = first_per_ptid.reset_index()

In [19]:
# Count PTIDs per label using the one-row-per-PTID table.
ptid_counts = ptid_label_df["Diagnosis_Code"].value_counts()
cn_ptids = ptid_counts.get(0, 0)
ad_ptids = ptid_counts.get(1, 0)
print("Unique PTIDs per label:")
print(f"CN (0): {cn_ptids}")
print(f"AD (1): {ad_ptids}")

Unique PTIDs per label:
CN (0): 197
AD (1): 312


In [20]:
# Reload only the needed columns, keep codes 0 and 2, recode them to 0/1,
# and parse the exam dates so scans can be ordered in time.
df = (
    pd.read_csv("MRI_DIAGNOSIS.csv", usecols=["PTID", "EXAMDATE", "Diagnosis_Code"])
    .loc[lambda scans: scans["Diagnosis_Code"].isin([0, 2])]
    .assign(
        Diagnosis_Code=lambda scans: scans["Diagnosis_Code"].map({0: 0, 2: 1}),
        EXAMDATE=lambda scans: pd.to_datetime(scans["EXAMDATE"]),
    )
)

# After sorting by date, the last row of each PTID group is its most recent scan.
scans_by_date = df.sort_values("EXAMDATE")
latest_scans = scans_by_date.groupby("PTID").tail(1)
print("Label distribution for latest scan per PTID:")
print(latest_scans["Diagnosis_Code"].value_counts())

Label distribution for latest scan per PTID:
Diagnosis_Code
1    323
0    186
Name: count, dtype: int64
