In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Input CSV locations. All three are relative paths, so they resolve against
# the directory the notebook kernel is started from.
# Clinical table; filtered to baseline visits when it is loaded.
file_clinical = 'ADNIMERGE/ADNIMERGE_08Jan2026.csv'
# Plasma proteomics table; keyed by RID when it is loaded.
file_proteomics = 'RBM/adni_proteomicsv2/adni_plasma_qc_multiplex_11Nov2010.csv'
# Gene expression table; stored transposed (genes as rows) and reshaped on load.
file_expression = 'ADNI_Gene_Expression_Profile/ADNI_Gene_Expression_Profile.csv'

In [3]:
# Folder that receives the exported cohort CSVs (relative to the working directory).
output_dir = "Processed_Data"
# exist_ok=True makes this cell safe to re-run: an existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

In [5]:
print("Loading Clinical Data (ADNIMERGE)...")
df_clin = pd.read_csv(file_clinical, low_memory=False)

# FILTER: Keeping only Baseline ('bl') visits
# We anchor to baseline to match the timing of the omics samples.
df_clin_bl = df_clin[df_clin['VISCODE'] == 'bl'].copy()

# CLEANUP: Selecting essential columns
# We keep Diagnosis (DX_bl), Scores (MMSE, CDRSB, ADAS13), and Demographics
cols_to_keep = ['RID', 'PTID', 'VISCODE', 'EXAMDATE', 'AGE', 'PTGENDER', 
                'PTEDUCAT', 'PTETHCAT', 'PTRACCAT', 'APOE4', 'DX_bl', 
                'MMSE', 'CDRSB', 'ADAS13']

# Robust selection: only keep columns that actually exist in the file,
# but say so when something expected is absent instead of dropping it silently.
actual_cols = [c for c in cols_to_keep if c in df_clin_bl.columns]
missing_cols = [c for c in cols_to_keep if c not in df_clin_bl.columns]
if missing_cols:
    print(f"   ! WARNING: Clinical file is missing expected columns: {missing_cols}")

# Fix RIDs and guarantee one row per subject:
#   1. coerce RID to numeric (unparseable values become NaN),
#   2. drop rows without a usable RID,
#   3. cast RID to int so it matches the merge key used by the omics tables,
#   4. keep the first baseline row per RID, mirroring the de-duplication applied
#      to the proteomics and gene-expression tables. Without this, a repeated
#      baseline row would multiply rows in the inner merges and inflate the
#      "Subjects" counts.
df_clin_clean = (
    df_clin_bl[actual_cols]
    .assign(RID=lambda frame: pd.to_numeric(frame['RID'], errors='coerce'))
    .dropna(subset=['RID'])
    .astype({'RID': int})
    .drop_duplicates(subset=['RID'])
)

print(f"Baseline Clinical Subjects: {len(df_clin_clean)}")

Loading Clinical Data (ADNIMERGE)...
Baseline Clinical Subjects: 2430


In [6]:
# PREPARE PROTEOMICS (TIER 1)

print("Loading Raw Proteomics (RBM Plasma)...")

# Load and normalise the merge key in one pass:
#   - RID is coerced to numeric (bad values become NaN),
#   - rows without a usable RID are discarded,
#   - RID is cast to int so it lines up with the clinical table.
df_prot = (
    pd.read_csv(file_proteomics)
    .assign(RID=lambda frame: pd.to_numeric(frame['RID'], errors='coerce'))
    .dropna(subset=['RID'])
    .astype({'RID': int})
)

# One row per subject: when a RID appears more than once, the first row in
# file order wins.
# NOTE(review): "first" is file order, not necessarily the baseline sample --
# if the file carries several visits per RID, confirm this picks the intended one.
df_prot_clean = df_prot.drop_duplicates(subset=['RID'], keep='first')

print(f"Total Subjects with Proteomics: {len(df_prot_clean)}")

Loading Raw Proteomics (RBM Plasma)...
Total Subjects with Proteomics: 566


In [7]:
# PREPARE GENOMICS (TIER 2)
print("Loading Raw Gene Expression ...")
# This file is complex because it is transposed (Genes = Rows).
# We read it without a header first to locate the data.
df_gene_raw = pd.read_csv(file_expression, header=None, low_memory=False)

# Logic to find the row containing Subject IDs (like '011_S_1234').
# Only the first 10 rows are scanned; the bound is clamped to the file length
# so a short (or empty) file falls through to the error branch instead of
# raising IndexError.
id_row_idx = -1
for i in range(min(10, len(df_gene_raw))):
    row_str = df_gene_raw.iloc[i].astype(str)
    # A row counts as the ID row when more than 5 of its cells contain '_S_'
    if row_str.str.contains('_S_').sum() > 5:
        id_row_idx = i
        break

if id_row_idx != -1:
    # Extract Data
    # NOTE(review): the offsets below assume column 1 holds the gene symbols,
    # sample values start at column 2, and every row below the ID row is a
    # gene -- confirm against the file's actual header layout.
    subject_ids = df_gene_raw.iloc[id_row_idx, 2:].values # Skip probe cols
    gene_data = df_gene_raw.iloc[id_row_idx+1:, 2:].values
    gene_names = df_gene_raw.iloc[id_row_idx+1:, 1].values # Gene Symbols
    
    # Transpose: Create DataFrame where Rows = Subjects, Cols = Genes
    df_gene_clean = pd.DataFrame(gene_data.T, columns=gene_names)
    df_gene_clean.insert(0, 'PTID', subject_ids)
    
    # Extract RID from PTID ('011_S_1234' -> 1234) for merging.
    # A regex extract yields NaN for anything that does not end in '_S_<digits>',
    # so a malformed ID is dropped like any other non-subject column instead of
    # aborting the cell with a ValueError from int().
    rid_parsed = pd.to_numeric(
        df_gene_clean['PTID'].astype(str).str.extract(r'_S_(\d+)\s*$', expand=False),
        errors='coerce',
    )
    
    # Keep rows with a parsed RID, and only the first row per RID
    # (handles duplicate RIDs in the gene file, if any).
    keep_mask = rid_parsed.notna() & ~rid_parsed.duplicated()
    df_gene_clean = df_gene_clean.loc[keep_mask].copy()
    df_gene_clean['RID'] = rid_parsed[keep_mask].astype(int)
    
    print(f"Total Subjects with Transcriptomics: {len(df_gene_clean)}")
else:
    # Best-effort fallback: report the problem and leave an empty frame that
    # still has the 'RID' merge key, so later cells can run.
    print("   ! ERROR: Could not parse Gene file headers. Check file format.")
    df_gene_clean = pd.DataFrame(columns=['RID']) 

Loading Raw Gene Expression ...
Total Subjects with Transcriptomics: 744


In [10]:
# MERGE AND EXPORT COHORTS
print("Generating Tiered Cohorts...")

# TIER 1: PROTEOMIC COHORT (Aim 1 Training)
# Clinical joined to proteomics on RID. The inner join keeps only subjects
# present in both tables; clashing proteomics column names get a '_prot' suffix.
tier1_df = df_clin_clean.merge(
    df_prot_clean,
    on='RID',
    how='inner',
    suffixes=('', '_prot'),
)

# TIER 2: MULTI-OMICS COHORT (Aim 1 Refinement & Aim 2 Fusion)
# Tier 1 joined to gene expression on RID, so this is the subset of Tier 1
# that also has transcriptomics; clashing gene column names get a '_gene' suffix.
tier2_df = tier1_df.merge(
    df_gene_clean,
    on='RID',
    how='inner',
    suffixes=('', '_gene'),
)

# Report cohort sizes (row counts of the merged frames)
print(f"TIER 1 (Proteomics + Clinical): {len(tier1_df)} Subjects")
print(f"TIER 2 (Multi-Omics + Clinical): {len(tier2_df)} Subjects")

# Export to CSV (row index omitted from the files)
tier1_path = os.path.join(output_dir, 'Tier1_Proteomics_Cohort.csv')
tier2_path = os.path.join(output_dir, 'Tier2_MultiOmics_Cohort.csv')

for cohort_frame, cohort_path in ((tier1_df, tier1_path), (tier2_df, tier2_path)):
    cohort_frame.to_csv(cohort_path, index=False)


Generating Tiered Cohorts...
TIER 1 (Proteomics + Clinical): 566 Subjects
TIER 2 (Multi-Omics + Clinical): 170 Subjects
