In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
# Clinical master file (read for its RID column).
file_clinical = "DXSUM_12Jan2026.csv"
# Plasma proteomics / inflammatory-marker table (read for its RID column).
file_proteomics = "RBM/adni_proteomicsv2/adni_plasma_qc_multiplex_11Nov2010.csv"
# Gene expression profile; participant IDs sit in one of its top rows.
file_expression = "ADNI_Gene_Expression_Profile/ADNI_Gene_Expression_Profile.csv"

In [3]:
# --- Clinical master file ---------------------------------------------------
# Collect the distinct participant RIDs, cast to int, into a set.
df_clin = pd.read_csv(file_clinical, low_memory=False)
clinical_rid_values = df_clin['RID'].astype(int).unique()
rids_clinical = set(clinical_rid_values)
print(f"   -> Found {len(rids_clinical)} patients in Clinical Master File")

# --- Proteomics -------------------------------------------------------------
# Same RID extraction as for the clinical file.
df_prot = pd.read_csv(file_proteomics)
prot_rid_values = df_prot['RID'].astype(int).unique()
rids_prot = set(prot_rid_values)
print(f"   -> Found {len(rids_prot)} patients with Inflammatory Markers")

# --- Transcriptomics --------------------------------------------------------
# Peek at the first 10 lines with header=None so every line, including any
# header-like ones, is kept as an ordinary data row for inspection.
df_check = pd.read_csv(file_expression, header=None, nrows=10, low_memory=False)

# Accumulators filled in by the ID-row scan in the following cell.
found_row = False
rids_rna = set()

   -> Found 3788 patients in Clinical Master File
   -> Found 566 patients with Inflammatory Markers


In [4]:
# Scan the headerless preview of the expression file (df_check) for the row
# that holds participant IDs such as '116_S_1249', and collect the numeric
# trailing part of each ID (the RID) into rids_rna.
#
# Reads:  df_check  - preview rows loaded with header=None
# Writes: rids_rna  - set of int RIDs parsed from the ID row
#         found_row - True once a row with enough '_S_' cells was seen

# A row is accepted as the ID row only when MORE than this many of its cells
# contain '_S_'; this avoids latching onto a row with a stray match or two.
MIN_ID_MATCHES = 5

# Start from a clean state so re-running this cell never reuses stale results.
rids_rna = set()
found_row = False

# Iterate over the rows actually loaded: the preview can be shorter than
# 10 rows, in which case a fixed range(10) would raise IndexError.
for i in range(len(df_check)):
    row_values = df_check.iloc[i].values.astype(str)
    # Participant IDs are recognised by the '_S_' marker inside the cell text.
    matches = [x for x in row_values if '_S_' in x]

    if len(matches) > MIN_ID_MATCHES:  # Found it!
        print(f"   -> Detected Patient IDs in Row Index {i}")
        found_row = True

        # Process this specific row
        for ptid in matches:
            try:
                # Convert '116_S_1249' -> 1249
                rid_part = int(ptid.split('_')[-1])
            except ValueError:
                # Trailing segment is not numeric, so this cell is not a
                # participant ID; skip it. Only ValueError is caught so that
                # interrupts and genuine bugs still surface.
                continue
            rids_rna.add(rid_part)
        break

if not found_row:
    print("   ! ERROR: Could not locate a row with '_S_' IDs in the first 10 lines.")
else:
    print(f"   -> Found {len(rids_rna)} patients with Gene Expression Data")


   -> Detected Patient IDs in Row Index 2
   -> Found 744 patients with Gene Expression Data


In [5]:
# COHORT OVERLAP
# Keep only the RIDs that appear in the clinical, proteomics AND
# gene-expression sets.
golden_cohort = rids_clinical & rids_prot & rids_rna
count = len(golden_cohort)

print(f"FINAL RESULT: {count} PATIENTS HAVE ALL 3 DATA TYPES")

FINAL RESULT: 170 PATIENTS HAVE ALL 3 DATA TYPES
