In this notebook, I test whether my scripts for extracting the ADNI parameters are correct.

In [87]:
import pandas as pd 
import numpy as np 
from typing import List, Dict, Tuple, Optional
from kde_ebm.mixture_model import fit_all_gmm_models
from collections import Counter
import json 

In [88]:
import pandas as pd 
import numpy as np 
from typing import List, Dict, Tuple, Optional

def _print_distribution(values) -> None:
    """Print every distinct value with its count and percentage share.

    Values are listed in order of first appearance (Counter insertion order).
    """
    counts = Counter(values)
    total = sum(counts.values())

    for k, v in counts.items():
        perc = 100 * v / total
        print(f"{k}: {v} ({perc:.1f}%)")


def get_adni_filtered(
        raw:str, 
        meta_data:List[str], 
        select_biomarkers:List[str], 
        diagnosis_list:List[str]
    ) -> pd.DataFrame:
    """Load the raw CSV and keep complete baseline rows with a known diagnosis.

    Steps: read only the requested columns, keep rows whose VISCODE is 'bl'
    and whose DX_bl is in ``diagnosis_list``, coerce the biomarker columns to
    numeric, drop rows with any missing biomarker, and drop exact duplicate
    rows. A short summary (row count, PTID uniqueness, DX_bl and COLPROT
    distributions) is printed as a side effect.

    Parameters
    ----------
    raw : path of the CSV file to read.
    meta_data : non-biomarker columns to load. Must contain 'PTID', 'DX_bl',
        'VISCODE' and 'COLPROT', because the function reads all four, e.g.
        ['PTID', 'DX_bl', 'VISCODE', 'COLPROT'].
    select_biomarkers : biomarker columns to load and convert to numeric, e.g.
        ['MMSE_bl', 'Ventricles_bl', 'WholeBrain_bl',
         'MidTemp_bl', 'Fusiform_bl', 'Entorhinal_bl',
         'Hippocampus_bl', 'ADAS13_bl', 'PTAU_bl',
         'TAU_bl', 'ABETA_bl', 'RAVLT_immediate_bl', 'ICV_bl'].
    diagnosis_list : DX_bl values to keep, e.g. ['CN', 'EMCI', 'LMCI', 'AD'].

    Returns
    -------
    pd.DataFrame with a fresh 0..n-1 index containing the filtered rows.
    """
    # 1. Load only the columns we need
    df = pd.read_csv(raw, usecols=meta_data + select_biomarkers, low_memory=False)

    # 2. Filter to baseline and known diagnoses. The explicit copy makes the
    #    column assignments below operate on an independent frame instead of
    #    a slice of the loaded one (avoids SettingWithCopyWarning).
    keep = (df['VISCODE'] == 'bl') & df['DX_bl'].isin(diagnosis_list)
    df = df.loc[keep].copy()

    # 3. Convert biomarker columns to numeric (handles garbage strings like '--')
    for col in select_biomarkers:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # 4. Drop rows with any NaN in biomarkers, then exact duplicate rows
    df = (
        df.dropna(subset=select_biomarkers)
          .drop_duplicates()
          .reset_index(drop=True)
    )

    print(len(df))
    # Exact duplicate rows are already gone; this checks that no participant
    # ID appears on more than one remaining row.
    if len(df.PTID.unique()) == len(df):
        print('No duplicates!')
    else:
        print('Data has duplicates!')

    # Print DX distribution
    _print_distribution(df['DX_bl'])

    print('----------------------------------------------------')

    # Print Cohort distribution
    _print_distribution(df['COLPROT'])

    return df 

In [89]:
# Non-biomarker columns: participant ID, baseline diagnosis, visit code, cohort.
meta_data = ['PTID', 'DX_bl', 'VISCODE', 'COLPROT']

# Baseline biomarker columns to load. ICV_bl is included because the
# volumetric measures are divided by it in a later cell.
select_biomarkers = [
    'MMSE_bl',
    'Ventricles_bl',
    'WholeBrain_bl',
    'MidTemp_bl',
    'Fusiform_bl',
    'Entorhinal_bl',
    'Hippocampus_bl',
    'ADAS13_bl',
    'PTAU_bl',
    'TAU_bl',
    'ABETA_bl',
    'RAVLT_immediate_bl',
    'ICV_bl',
]

# Baseline diagnoses to keep; rows with any other DX_bl value are filtered out.
diagnosis_list = ['CN', 'EMCI', 'LMCI', 'AD']

# Location of the raw CSV, relative to this notebook.
raw = '../../mlhc_sub/ADNIMERGE.csv'

adni_filtered = get_adni_filtered(
    raw=raw,
    meta_data=meta_data,
    select_biomarkers=select_biomarkers,
    diagnosis_list=diagnosis_list,
)

726
No duplicates!
AD: 153 (21.1%)
LMCI: 236 (32.5%)
CN: 155 (21.3%)
EMCI: 182 (25.1%)
----------------------------------------------------
ADNI1: 275 (37.9%)
ADNI2: 375 (51.7%)
ADNIGO: 76 (10.5%)


In [90]:
# Work on an independent (deep) copy so adni_filtered itself stays untouched.
df = adni_filtered.copy(deep=True)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,COLPROT,PTID,VISCODE,DX_bl,ADAS13_bl,MMSE_bl,RAVLT_immediate_bl,Ventricles_bl,Hippocampus_bl,WholeBrain_bl,Entorhinal_bl,Fusiform_bl,MidTemp_bl,ICV_bl,ABETA_bl,TAU_bl,PTAU_bl
0,ADNI1,011_S_0003,bl,AD,31.0,20.0,22.0,84599.0,5319.0,1129830.0,1791.0,15506.0,18422.0,1920690.0,741.5,239.7,22.83
1,ADNI1,022_S_0004,bl,LMCI,21.33,27.0,37.0,39605.0,6869.0,1154980.0,3983.0,19036.0,19615.0,1679440.0,1501.0,153.1,13.29
2,ADNI1,011_S_0005,bl,CN,14.67,29.0,37.0,34062.0,7075.0,1116630.0,4433.0,24788.0,21614.0,1640770.0,547.3,337.0,33.43
3,ADNI1,011_S_0010,bl,AD,24.33,24.0,20.0,26820.0,5485.0,1033540.0,2676.0,16761.0,19741.0,1471180.0,357.4,329.9,31.26
4,ADNI1,022_S_0014,bl,CN,8.33,29.0,45.0,46279.0,6730.0,861749.0,3581.0,13779.0,17798.0,1269540.0,1582.0,203.6,16.68


In [91]:
# Rebuild `df` from adni_filtered on every run. This cell removes columns, so
# operating on an already-processed `df` would raise KeyError when the cell is
# executed a second time; starting from the filtered frame keeps it idempotent.
df = adni_filtered.copy()
df.columns = df.columns.str.replace('_bl', '', regex=False)

# ICV normalization because brain sizes vary a lot.
# Guard the division: a non-positive ICV would silently produce inf/negative ratios.
assert (df['ICV'] > 0).all(), 'ICV must be positive to normalize volumes'

# Raw volume column -> name of its ICV-normalized counterpart. The insertion
# order fixes the order of the new columns (and therefore of the biomarkers).
volume_to_norm = {
    'Ventricles': 'VentricleNorm',
    'Hippocampus': 'HippocampusNorm',
    'WholeBrain': 'WholeBrainNorm',
    'Entorhinal': 'EntorhinalNorm',
    'Fusiform': 'FusiformNorm',
    'MidTemp': 'MidTempNorm',
}
for volume_col, norm_col in volume_to_norm.items():
    df[norm_col] = df[volume_col] / df['ICV']

# Drop identifiers, ICV and the raw volumes that were replaced by their ratios.
df = df.drop(columns=['VISCODE', 'COLPROT', 'PTID', 'ICV', *volume_to_norm])

# Binary label: 0 for CN, 1 for every other diagnosis. It is appended last so
# that it ends up as the final column of the matrix.
df['diseased'] = (df['DX'] != 'CN').astype(int)
df = df.drop(columns=['DX'])

# Ordered biomarkers, to match the ordering outputs later
# (every column except the trailing 'diseased' label).
ordered_biomarkers = df.columns[:-1].to_numpy()
# for ucl: biomarker columns followed by the label column
data_matrix = df.to_numpy()

In [92]:
# Split the matrix: every column but the last holds biomarker values, the last
# column holds the 0/1 `diseased` label.
data = data_matrix[:, :-1].astype(np.float64)
target = data_matrix[:, -1].astype(np.int64)

# One fitted mixture model per biomarker column.
mixtures = fit_all_gmm_models(data, target)

# Extract likelihoods for each biomarker.
# NOTE(review): pdf() returns two arrays; the first is stored as L_no and the
# second as L_yes -- presumably the control and diseased component densities;
# confirm against the kde_ebm implementation.
L_yes = np.zeros(data.shape)
L_no = np.zeros(data.shape)
for col in range(data.shape[1]):
    no_pdf, yes_pdf = mixtures[col].pdf(None, data[:, col])
    L_no[:, col] = no_pdf
    L_yes[:, col] = yes_pdf

# Sanity checks on the likelihood matrices.
print(L_yes.shape, L_no.shape)  # Should both be (num_subjects, num_biomarkers)
print("NaN in L_yes:", np.isnan(L_yes).any())
print("NaN in L_no:", np.isnan(L_no).any())
print("Min L_yes:", L_yes.min())
print("Min L_no:", L_no.min())

# Collect the fitted parameters per biomarker: theta_* come from the model's
# ad_comp component, phi_* from its cn_comp component.
gmm_params = {
    name: {
        "theta_mean": float(mixtures[pos].ad_comp.mu),
        "theta_std": float(mixtures[pos].ad_comp.sigma),
        "phi_mean": float(mixtures[pos].cn_comp.mu),
        "phi_std": float(mixtures[pos].cn_comp.sigma),
    }
    for pos, name in enumerate(ordered_biomarkers)
}

(726, 12) (726, 12)
NaN in L_yes: False
NaN in L_no: False
Min L_yes: 1.1745101119173378e-09
Min L_no: 5.635078332563648e-62


  fx = wrapped_fun(x)


In [93]:
# Persist the per-biomarker GMM parameters as indented JSON next to the
# notebook's parent directory.
with open('../adni_params_ucl_gmm.json', 'w') as out_file:
    out_file.write(json.dumps(gmm_params, indent=4))