In [1]:
import pandas as pd

df = pd.read_parquet('NACC_cleaned.parquet', engine='fastparquet')

# Exposure columns that are analysed one at a time further down.
lancet_covs = ['DEPD', 'HYPERTEN', 'SMOKYRS', 'ALCOHOL', 'HEARING', 'EDUC', 'NACCBMI']

# Working table: identifier, the exposures above, adjustment variables and the outcome.
# An explicit copy makes `covariates` independent of `df`, so later column
# assignments do not raise SettingWithCopyWarning or depend on view/copy semantics.
covariates = df[['NACCID', *lancet_covs, 'NACCAGE', 'SEX', 'RACE', 'NACCAPOE', 'ad_dx']].copy()

In [5]:
# Distinct values of the NACCID column in the loaded table.
df.loc[:, 'NACCID'].unique()

array(['NACC002909', 'NACC003487', 'NACC004352', ..., 'NACC998475',
       'NACC999391', 'NACC999420'], shape=(54025,), dtype=object)

In [2]:
# Quick look at the EDUC column (values and dtype).
df.loc[:, 'EDUC']

0         16.0
1         16.0
2         16.0
3         16.0
4         12.0
          ... 
201347    13.0
201348    13.0
201349    13.0
201350    13.0
201351    18.0
Name: EDUC, Length: 201352, dtype: float64

In [None]:
# handle missing data
# All fills are expressed as one column -> value mapping and applied in a single
# `fillna`, which returns a new frame instead of assigning column-by-column into
# a frame that was selected from `df`.
fill_values = {
    'DEPD': 0,                               # if missing depression, assume no depression
    'HYPERTEN': 0,                           # if missing hypertension, assume no hypertension
    'SMOKYRS': covariates['SMOKYRS'].mean(),  # if missing smoking years, assume mean
    'ALCOHOL': 0,                            # if missing alcohol, assume no alcohol
    'HEARING': 0,                            # if missing hearing loss, assume no hearing loss
    'NACCBMI': covariates['NACCBMI'].mean(),  # if missing BMI, assume mean
}
covariates = covariates.fillna(value=fill_values)

# Rows missing NACCAPOE, EDUC or RACE are dropped rather than imputed.
covariates = covariates.dropna(subset=['NACCAPOE', 'EDUC', 'RACE'])

In [None]:
import sys
sys.path.append('../ukb')  # adjust the path as needed
from doubleml_utils import encode

# Re-bind `covariates` to the result of the project helper `encode`, called on the
# 'NACCAPOE' column with the label 'apoe'.
# NOTE(review): presumably this expands NACCAPOE into indicator columns named
# 'apoe_<genotype>' (the DML cell selects e.g. 'apoe_e3/e4') — confirm against
# doubleml_utils.encode. Re-running this cell feeds the already-encoded frame back in.
covariates = encode(covariates, 'NACCAPOE', 'apoe')


In [None]:
# Preview the first rows only; rendering the whole frame adds noise to the notebook.
covariates.head()

In [None]:
import sys

# Make the helpers in ../ukb importable. The membership check keeps re-runs of
# this cell from stacking duplicate entries on sys.path.
if '../ukb' not in sys.path:
    sys.path.append('../ukb')  # adjust the path as needed
from doubleml_utils import run_dml

# Exposure columns to analyse, one DML model each.
lancet_covs = ['DEPD', 'HYPERTEN', 'SMOKYRS', 'ALCOHOL', 'HEARING', 'EDUC', 'NACCBMI']

def assess_lancets_dml(features):
    """Run one DML model per exposure and collect each model's summary.

    Reads the notebook-level ``covariates`` frame. For every name in
    ``features`` that column is passed to ``run_dml`` as the exposure, together
    with a fixed adjustment set (NACCAGE, SEX and the ``apoe_*`` columns) and
    the ``ad_dx`` outcome.

    Parameters
    ----------
    features : iterable of str
        Column names in ``covariates`` to use, one at a time, as the exposure.

    Returns
    -------
    dict
        Maps each feature name to the ``summary`` attribute of the object
        returned by ``run_dml`` for that feature. Empty when ``features`` is empty.

    Raises
    ------
    KeyError
        If any exposure, adjustment or outcome column is absent from
        ``covariates``. Raised before any model is fitted.
    """
    features = list(features)
    if not features:
        return {}

    adjustment_cols = [
        'NACCAGE', 'SEX',
        'apoe_e2/e2', 'apoe_e2/e3', 'apoe_e2/e4',
        'apoe_e3/e3', 'apoe_e3/e4', 'apoe_e4/e4',
    ]
    outcome_col = 'ad_dx'

    # Fail fast: a missing column would otherwise surface as a KeyError only
    # after the models for the preceding features had already been fitted.
    required_cols = adjustment_cols + [outcome_col] + features
    missing_cols = [col for col in required_cols if col not in covariates.columns]
    if missing_cols:
        raise KeyError(f"Columns missing from covariates: {missing_cols}")

    feature_summaries = {}
    for feature in features:
        print(f"Running DML for feature: {feature}")
        # Select fresh objects per run so each run_dml call gets its own inputs.
        covariate = covariates[adjustment_cols]
        outcome = covariates[outcome_col]
        exposure = covariates[feature]

        dml_model = run_dml(covariate, outcome, exposure)
        feature_summaries[feature] = dml_model.summary

    return feature_summaries

# One DML summary per exposure column, keyed by column name.
results = assess_lancets_dml(features=lancet_covs)

In [21]:
rows = []

# The loop variable is deliberately NOT named `df`: that would overwrite the
# notebook-level DataFrame loaded from the parquet file and break earlier cells
# on re-run.
for test_id, feature_summary in results.items():
    # Pull the row labelled 'd' as a dictionary and tag it with the test_id
    row = feature_summary.loc['d'].to_dict()
    row['test_id'] = test_id
    rows.append(row)

# Convert to a DataFrame (one row per exposure)
summary_df = pd.DataFrame(rows)

# Move 'test_id' to the front
summary_df = summary_df[['test_id'] + [col for col in summary_df.columns if col != 'test_id']]

In [23]:
from pathlib import Path

# Write the combined summary table as CSV. The output folder is created first so
# the export does not fail when ./double_ml does not exist yet.
output_path = Path('./double_ml/nacc_lancet_meta.txt')
output_path.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(output_path)