NACC - Education, Imaging Data, AD Risk

In [None]:
import pandas as pd
nacc_df = pd.read_parquet('../../../randy/rfb/raw_data/NACC/investigator_nacc69.parquet', engine = 'fastparquet')

In [None]:
import pandas as pd
covariates_df = pd.read_parquet('doubleML_dep_AD_covariates.parquet', engine = 'fastparquet')
covariates_df


UKB - Education, Imaging Data, AD Risk

In [None]:
import pandas as pd
covariates_df = pd.read_parquet('doubleML_covariates.parquet', engine = 'fastparquet')
imaging_df = pd.read_parquet('../../../randy/rfb/tidy_data/UKBiobank/dementia/neuroimaging/X.parquet', engine = 'fastparquet')

covariates_df.rename(columns = {'IID': 'eid'}, inplace = True)
imaging_only = imaging_df[['eid'] + [col for col in imaging_df.columns if (col.startswith('25') or col.startswith('27') or col.startswith('26'))]]


In [None]:
imaging_only

In [None]:
covariates_df = covariates_df.merge(imaging_only, on='eid', how='inner')

In [None]:
# overlapping only
covariates_df = covariates_df.merge(imaging_only[['eid'] + overlap_image], on='eid', how='inner')
covariates_df = covariates_df.merge(ct_df[['eid'] + overlap_cog], on='eid', how='inner')

In [None]:
imaging_only.shape[0]

In [None]:
from doubleml_utils import run_dml_instrument, run_dml 

results = {}

fold = 0

# Impute education years ONCE
covariates_df.dropna(subset = ['education_years'], inplace=True)

startidx = int(imaging_only.shape[0] / 100 * fold)
endidx = int(imaging_only.shape[0] / 100 * (fold + 1))

print(f"Processing fold {fold}: indices {startidx} to {endidx}")

for i in imaging_only.drop(columns=['eid']).columns.tolist()[:1]:
    # Create boolean mask for non-missing values
    valid_mask = ~covariates_df[i].isna()
    
    # Check sample size before proceeding
    n_valid = valid_mask.sum()
    if n_valid < 100:
        print(f"Skipping {i} due to insufficient data ({n_valid} samples)")
        continue

    # Use boolean indexing - no copying!
    covariates_df = covariates_df[valid_mask]

    outcome = covariates_df.loc[valid_mask, i]
    exposure = covariates_df.loc[valid_mask, 'education_years']
    covariates = covariates_df.loc[valid_mask, ['curr_age', '31-0.0', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4']]
    #instrument = covariates_df.loc[valid_mask, '31-0.0']

    try:
        result = run_dml(covariates=covariates, outcome=outcome, exposure=exposure)
        results[i] = result.summary
        print(f"Completed {i}: {n_valid} samples")
    except Exception as e:
        print(f"Error processing {i}: {str(e)}")
        continue

In [None]:
import pandas as pd
image_to_AD = pd.read_parquet('double_ml/imaging/imaging_image_to_AD.parquet', engine = 'fastparquet')
image_to_AD.sort_values(by='P>|t|', inplace=True)
image_to_AD = image_to_AD[image_to_AD['P>|t|'] < 0.05]

ed_to_image = pd.read_parquet('double_ml/imaging/imaging_ed_to_image.parquet', engine = 'fastparquet')
ed_to_image.sort_values(by='P>|t|', inplace=True)
ed_to_image = ed_to_image[ed_to_image['P>|t|'] < 0.05]
ed_to_image.sort_values(by='coef', key=abs, ascending=False, inplace=True)

overlap = ed_to_image.merge(image_to_AD, on='test_id', how = 'inner')

In [None]:
overlap_image = list(overlap['test_id'])

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
imaging_df_scaled = pd.DataFrame(
    scaler.fit_transform(imaging_only[overlap_image]),
    columns=overlap_image,
    index=imaging_only.index
)
imaging_df_scaled['eid'] = imaging_only['eid'].values


Cognitive Data

In [None]:
covariates_df = pd.read_parquet('doubleML_dep_AD_covariates.parquet', engine = 'fastparquet').rename(columns = {'IID': 'eid'})
#ct_df = pd.read_parquet('cognitive_test_results.parquet', engine = 'fastparquet')

In [None]:
import pandas as pd
cogtest_ids = [f"{x}-2.0" for x in [4282, 20023, 6348, 6349, 6350, 6351, 6333, 20197, 20018, 20016]]
ct_df = pd.read_csv('../../../uk_biobank/project_52887_669338/ukb669338.csv', usecols=['eid'] + cogtest_ids)

In [None]:
ct_df.to_parquet('cognitive_test_results_2.parquet', engine = 'fastparquet', index=False)

In [None]:
ct_df

In [None]:
covariates_df = covariates_df.merge(ct_df, on='eid', how='inner')

In [None]:
from doubleml_utils import run_dml
results = {}
covariates_df.dropna(subset=['curr_age', '31-0.0', 'groups', 'education_years'], inplace=True)

for test in ct_df.drop(columns=['eid']).columns:
    # Create boolean mask for non-missing values
    valid_mask = ~covariates_df[test].isna()
    
    # Check sample size before proceeding
    n_valid = valid_mask.sum()
    if n_valid < 100:
        print(f"Skipping {test} due to insufficient data ({n_valid} samples)")
        continue

    # Use boolean indexing - no copying!
    outcome = covariates_df.loc[valid_mask, 'groups']
    exposure = covariates_df.loc[valid_mask, test]
    covariates = covariates_df.loc[valid_mask, ['curr_age', '31-0.0', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4']]

    try:
        result = run_dml(covariates, outcome, exposure)
        results[test] = result.summary
        print(f"Completed {test}: {n_valid} samples")
    except Exception as e:
        print(f"Error processing {test}: {str(e)}")
        continue


In [None]:
results

In [None]:
from doubleml_utils import summarize_results
results_df = summarize_results(results)
results_df = results_df[results_df['P>|t|'] < 0.05]


In [None]:
results_df

In [None]:
results_df.to_csv('double_ml/ed_to_cogtest2.csv', index=False)

In [None]:
ed_to_cogtest = pd.read_csv('double_ml/ed_to_cogtest2.csv')
cogtest_to_AD = pd.read_csv('double_ml/cogtest2_to_AD.csv')

overlap = set(ed_to_cogtest['test_id']).intersection(set(cogtest_to_AD['test_id']))
overlap_cog = list(overlap)
overlap_cog

In [None]:
df = pd.read_csv('../../../uk_biobank/project_52887_669338/ukb669338.csv', usecols=['eid', '20132-0.0'])

In [None]:
df = pd.read_parquet('imaging_ed_cog_brain_FOR_R.parquet', engine = 'fastparquet')
# X = df.drop(columns=overlap_cog + ['education_years'])
df['education_years'].isna().sum()

Imaging to Cognitive Level - Explaining Variance with Education?

In [None]:
import pandas as pd
overlap_cog = ['4282-2.0', '20016-2.0', '20023-2.0', '20197-2.0']

covariates_df = pd.read_parquet('doubleML_covariates.parquet', engine = 'fastparquet', columns=['IID', 'education_years', 'curr_age', '31-0.0', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4', 'groups'])
imaging_df = pd.read_parquet('../../../randy/rfb/tidy_data/UKBiobank/dementia/neuroimaging/X.parquet', engine = 'fastparquet', columns=['eid'] + overlap_image)
ct_df = pd.read_parquet('cognitive_test_results_2.parquet', engine = 'fastparquet', columns=['eid'] + overlap_cog)

covariates_df.rename(columns = {'IID': 'eid'}, inplace = True)
covariates_df = covariates_df.merge(imaging_df, on='eid', how='inner').merge(ct_df, on='eid', how='inner')
covariates_df.dropna(subset=['curr_age', '31-0.0', 'groups', 'education_years'], inplace=True)
covariates_df

In [None]:
from doubleml_utils import run_dml_instrument, run_dml 
import pandas as pd

overlap_cog = ['4282-2.0', '20016-2.0', '20023-2.0', '20197-2.0']
covariates_df = pd.read_parquet('doubleML_covariates.parquet', engine = 'fastparquet', columns=['IID', 'bmi', 'mdi', 'education_years', 'curr_age', '31-0.0', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4', 'groups'])
covariates_df.rename(columns = {'IID': 'eid'}, inplace = True)
ct_df = pd.read_parquet('cognitive_test_results_2.parquet', engine = 'fastparquet', columns=['eid'] + overlap_cog)

imaging_df = pd.read_parquet('../../../randy/rfb/tidy_data/UKBiobank/dementia/neuroimaging/X.parquet', engine = 'fastparquet')
imaging_only = imaging_df[['eid'] + [col for col in imaging_df.columns if (col.startswith('25') or col.startswith('27') or col.startswith('26'))]]

covariates_df = covariates_df.merge(imaging_only, on='eid', how='inner').merge(ct_df, on='eid', how='inner')

results = {}

fold = 0

# Impute education years ONCE
covariates_df.dropna(subset = ['education_years'], inplace=True)

startidx = int(imaging_only.shape[0] / 100 * fold)
endidx = int(imaging_only.shape[0] / 100 * (fold + 1))

print(f"Processing fold {fold}: indices {startidx} to {endidx}")

for i in imaging_only.drop(columns=['eid']).columns.tolist()[0:2]:
    valid_mask = ~covariates_df[i].isna()
    covariates_df = covariates_df[valid_mask]

    results[i] = {}

    for c in ['4282-2.0', '20016-2.0', '20023-2.0', '20197-2.0']:
    # Create boolean mask for non-missing values
        valid_mask = ~covariates_df[c].isna()
        
        # Check sample size before proceeding
        n_valid = valid_mask.sum()
        if n_valid < 100:
            print(f"Skipping {i} due to insufficient data ({n_valid} samples)")
            continue

        # Use boolean indexing - no copying!
        covariates_df = covariates_df[valid_mask]

        outcome = c
        exposure = i
        covariates = ['curr_age', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4']
        instrument = '31-0.0'

        try:
            result = run_dml_instrument(covariates_df, covariates=covariates, outcome=outcome, exposure=exposure, instrument=instrument)
            results[i][c] = result.summary
            print(f"Completed {i}: {n_valid} samples")
        except Exception as e:
            print(f"Error processing {i}: {str(e)}")
            continue





In [38]:
imaging_only.drop(columns=['eid']).columns.tolist()[0]

'25000-2.0'

In [2]:
import pandas as pd
overlap_cog = ['4282-2.0', '20016-2.0', '20023-2.0', '20197-2.0']
covariates_df = pd.read_parquet('doubleML_covariates.parquet', engine = 'fastparquet', columns=['IID', 'bmi', 'mdi', 'education_years', 'curr_age', '31-0.0', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4', 'groups'])
covariates_df.rename(columns = {'IID': 'eid'}, inplace = True)
ct_df = pd.read_parquet('cognitive_test_results_2.parquet', engine = 'fastparquet', columns=['eid'] + overlap_cog)

imaging_df = pd.read_parquet('../../../randy/rfb/tidy_data/UKBiobank/dementia/neuroimaging/X.parquet', engine = 'fastparquet')
imaging_only = imaging_df[['eid'] + [col for col in imaging_df.columns if (col.startswith('25') or col.startswith('27') or col.startswith('26'))]]

covariates_df = covariates_df.merge(imaging_only, on='eid', how='inner').merge(ct_df, on='eid', how='inner')

In [4]:
covariates_df

Unnamed: 0,eid,bmi,mdi,education_years,curr_age,31-0.0,e2/e2,e3/e3,e2/e3,e3/e4,...,25755_2_0_50,25755_2_0_51,25755_2_0_52,25755_2_0_53,25755_2_0_54,25755_2_0_55,4282-2.0,20016-2.0,20023-2.0,20197-2.0
0,1000046,27.0842,25.37000,,54.0,1.0,0,0,0,0,...,0.740887,0.862363,0.878685,0.913898,0.746185,0.557628,7.0,9.0,406.0,7.0
1,1000304,42.3268,23.00000,18.0,76.0,1.0,0,1,0,0,...,0.595000,0.683000,0.743000,0.587000,0.671000,0.480000,,7.0,526.0,
2,1000396,24.4995,7.59000,16.0,73.0,0.0,0,0,1,0,...,0.506706,0.665595,0.636133,0.592260,0.562239,0.462905,7.0,11.0,550.0,7.0
3,1000595,32.3960,4.25000,,73.0,1.0,0,1,0,0,...,0.640476,0.863235,0.638880,0.634676,0.636426,0.601487,9.0,11.0,584.0,4.0
4,1000697,28.3025,35.61000,,75.0,1.0,0,1,0,0,...,0.637000,0.684000,0.799000,0.503000,0.710000,0.476000,,,632.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42627,6024731,27.4406,3.76000,16.0,66.0,1.0,0,1,0,0,...,0.643735,0.859428,0.783996,0.714896,0.729920,0.535485,,5.0,562.0,
42628,6024757,30.3210,6.12541,17.0,72.0,1.0,0,1,0,0,...,0.601686,0.555240,0.542795,0.657828,0.543537,0.395114,6.0,8.0,499.0,3.0
42629,6025079,27.5938,7.61000,15.0,77.0,1.0,0,1,0,0,...,,,,,,,,7.0,530.0,
42630,6025128,25.9281,6.82000,,62.0,1.0,0,1,0,0,...,0.596681,0.561190,0.714659,0.701322,0.576395,0.483037,7.0,10.0,507.0,6.0


In [21]:
from econml.dml import CausalForestDML
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from doubleml_utils import run_dml
from econml.iv.dml import DMLIV

results = {}
cates = {}

covariates_df.dropna(subset = ['bmi', 'curr_age', '31-0.0', 'groups', 'education_years', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4'], inplace=True)

#sig_results['Imaging Metric'].unique().tolist()

for i in imaging_only.drop(columns=['eid']).columns.tolist()[0:1]:
    print(f"Processing imaging metric: {i}")
    valid_mask = ~covariates_df[i].isna()
    covariates_df = covariates_df[valid_mask]

    results[i] = {}
    cates[i] = {}

    for test in ['4282-2.0', '20016-2.0', '20023-2.0', '20197-2.0']:
        print('starting test')

        valid_mask = ~covariates_df[test].isna()
        covariates_df = covariates_df[valid_mask]

        Y = covariates_df[test]
        T = covariates_df[i]
        X = covariates_df[['bmi', 'mdi', 'curr_age', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4', 'education_years']]
        Z = covariates_df['31-0.0']

        print(Y.isna().sum(), T.isna().sum(), X.isna().sum(), Z.isna().sum())

        print(f"Fitting model for {i} and {test} with {len(Y)} samples")
        # DML with forest-based CATE estimator
        model = DMLIV(
            model_y_xw=LGBMRegressor(),
            model_t_xw=LGBMRegressor(),
            model_t_xwz=LGBMRegressor(),
            discrete_treatment=False,
            discrete_instrument=True,
            random_state=42
        )
        model.fit(Y, T, Z=Z, X=X)

        # Estimate overall ATE
        ate = model.ate(X)
        results[i][test] = {"ATE": ate, "model": model, "summary": model.summary}

        # Estimate CATEs
        cate = model.effect(X)
        cates[i][test] = cate

Processing imaging metric: 25000-2.0
starting test
0 0 bmi                0
mdi                0
curr_age           0
e2/e2              0
e3/e3              0
e2/e3              0
e3/e4              0
e2/e4              0
e4/e4              0
education_years    0
dtype: int64 0
Fitting model for 25000-2.0 and 4282-2.0 with 14705 samples
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 579
[LightGBM] [Info] Number of data points in the train set: 7352, number of used features: 10
[LightGBM] [Info] Start training from score 6.445185
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 582
[LightGBM] [Info] Number of data points in the train set: 7353, number of used features: 10
[LightGBM] [Info] Start training from score 6.433701
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 582
[LightGBM] [Info] Number of data points in the train set: 7353, number of used features: 10
[LightGBM] [Info] Start training from score 1.308345
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5



starting test
0 0 bmi                0
mdi                0
curr_age           0
e2/e2              0
e3/e3              0
e2/e3              0
e3/e4              0
e2/e4              0
e4/e4              0
education_years    0
dtype: int64 0
Fitting model for 25000-2.0 and 20016-2.0 with 14505 samples
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 7252, number of used features: 10
[LightGBM] [Info] Start training from score 5.944291
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data 



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 579
[LightGBM] [Info] Number of data points in the train set: 7253, number of used features: 10
[LightGBM] [Info] Start training from score 5.928995
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 579
[LightGBM] [Info] Number of data points in the train set: 7253, number of used features: 10
[LightGBM] [Info] Start training from score 1.307366
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y



starting test
0 0 bmi                0
mdi                0
curr_age           0
e2/e2              0
e3/e3              0
e2/e3              0
e3/e4              0
e2/e4              0
e4/e4              0
education_years    0
dtype: int64 0
Fitting model for 25000-2.0 and 20023-2.0 with 14463 samples
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data points in the train set: 7231, number of used features: 10
[LightGBM] [Info] Start training from score 607.509197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of dat



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 7232, number of used features: 10
[LightGBM] [Info] Start training from score 606.070658
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 7232, number of used features: 10
[LightGBM] [Info] Start training from score 1.307044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,



starting test
0 0 bmi                0
mdi                0
curr_age           0
e2/e2              0
e3/e3              0
e2/e3              0
e3/e4              0
e2/e4              0
e4/e4              0
education_years    0
dtype: int64 0
Fitting model for 25000-2.0 and 20197-2.0 with 14095 samples
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data points in the train set: 7047, number of used features: 10
[LightGBM] [Info] Start training from score 6.498652
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data 



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 7048, number of used features: 10
[LightGBM] [Info] Start training from score 6.418417
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 7048, number of used features: 10
[LightGBM] [Info] Start training from score 1.305806
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5



In [28]:
from doubleml_utils import flatten_imaging_cog_dict
cates

{'25000-2.0': {'4282-2.0': array([-1.45894769, -2.2076063 , -2.2338543 , ..., -1.93178068,
         -1.09500744, -1.48103789], shape=(14705,)),
  '20016-2.0': array([-1.49986346, -0.62521354, -0.33209268, ..., -1.89840956,
         -1.21653565, -1.26566607], shape=(14505,)),
  '20023-2.0': array([ 91.8760311 , 150.76354124, 105.40524106, ..., 129.69078513,
         155.61218934, 161.38172793], shape=(14463,)),
  '20197-2.0': array([4.62697062, 6.97858383, 7.59685536, ..., 4.68711958, 4.46187395,
         4.77039202], shape=(14095,))}}

In [None]:
def cates_to_dataframe(cates_dict):
    """
    Convert nested dict of CATEs to a DataFrame.
    Structure: {imaging_metric: {cognitive_test: array, ...}, ...}
    """
    all_data = []
    
    for imaging_metric, cog_tests in cates_dict.items():
        for cog_test, cate_array in cog_tests.items():
            # Create a DataFrame for this combination
            temp_df = pd.DataFrame({
                'imaging_metric': imaging_metric,
                'cognitive_test': cog_test,
                'cate_value': cate_array,
                'subject_id': range(len(cate_array))  # Add subject identifier
            })
            all_data.append(temp_df)
    
    # Concatenate all DataFrames
    final_df = pd.concat(all_data, ignore_index=True)
    return final_df

# Convert your cates to DataFrame
cates_df = cates_to_dataframe(cates)
print(f"Shape: {cates_df.shape}")
print(cates_df.head(10))

df_fixed = cates_to_dataframe(cates)

Using minimum length: 14095
Shape: (14095, 5)
   subject_id  25000-2.0_4282-2.0  25000-2.0_20016-2.0  25000-2.0_20023-2.0  \
0           0           -1.458948            -1.499863            91.876031   
1           1           -2.207606            -0.625214           150.763541   
2           2           -2.233854            -0.332093           105.405241   
3           3           -1.230764             1.095169           138.599142   
4           4           -1.877043             0.139148           298.275979   

   25000-2.0_20197-2.0  
0             4.626971  
1             6.978584  
2             7.596855  
3             6.842732  
4             4.466952  
Using minimum length: 14095


In [32]:
import pandas as pd
import glob

import pandas as pd
import glob

def merge_large_csvs_to_parquet(pattern, output_file, chunksize=10000):
    """
    Memory-efficient merging for very large CSV files, saving as parquet
    """
    csv_files = sorted(glob.glob(pattern))
    
    all_chunks = []
    
    for file in csv_files:
        fold_num = file.split('_')[-1].replace('.csv', '')
        
        # Read file in chunks
        for chunk in pd.read_csv(file, chunksize=chunksize):
            chunk['fold'] = int(fold_num)
            all_chunks.append(chunk)
    
    # Concatenate all chunks and save as parquet
    final_df = pd.concat(all_chunks, ignore_index=True)
    final_df.to_parquet(output_file, engine='fastparquet', index=False)
    
    print(f"Merged {len(csv_files)} files to {output_file}")
    print(f"Final DataFrame shape: {final_df.shape}")

# Usage
pattern = 'double_ml/imaging/results/imaging_to_cog/cates/cates_imaging_to_cogtest_fold_*.csv'
merge_large_csvs_to_parquet(pattern, 'double_ml/imaging/results/imaging_to_cog/cates/merged_results.parquet', chunksize=10000)


Merged 79 files to double_ml/imaging/results/imaging_to_cog/cates/merged_results.parquet
Final DataFrame shape: (217178940, 5)


Analyzing Cates

In [17]:
import pandas as pd
df = pd.read_csv('double_ml/imaging/results/imaging_to_cog/cates/cates_imaging_to_cogtest_fold_0.csv', nrows = 14705)

In [18]:
df

Unnamed: 0,imaging_metric,cognitive_test,cate_value,subject_id
0,25000-2.0,4282-2.0,-1.458948,0
1,25000-2.0,4282-2.0,-2.207606,1
2,25000-2.0,4282-2.0,-2.233854,2
3,25000-2.0,4282-2.0,-1.230764,3
4,25000-2.0,4282-2.0,-1.877043,4
...,...,...,...,...
14700,25000-2.0,4282-2.0,-0.684127,14700
14701,25000-2.0,4282-2.0,-1.280907,14701
14702,25000-2.0,4282-2.0,-1.931781,14702
14703,25000-2.0,4282-2.0,-1.095007,14703


## LGBM on CATES - obtaining feature importances
Using LGBM to extract feature importances of demographics while predicting on CATES

In [20]:
from cates_utils import load_cates_3d_array
cates_array, imaging_metrics, cognitive_tests, n_patients = load_cates_3d_array('double_ml/imaging/results/imaging_to_cog/cates/cates_3d_array.npz')

Loaded 3D CATE array from double_ml/imaging/results/imaging_to_cog/cates/cates_3d_array.npz
Array shape: (14864, 3945, 4)
Imaging metrics: 3945
Cognitive tests: 4
Patients: 14864


In [23]:
imaging_metrics

['25000-2.0',
 '25001-2.0',
 '25002-2.0',
 '25003-2.0',
 '25004-2.0',
 '25005-2.0',
 '25006-2.0',
 '25007-2.0',
 '25008-2.0',
 '25009-2.0',
 '25010-2.0',
 '25011-2.0',
 '25012-2.0',
 '25013-2.0',
 '25014-2.0',
 '25015-2.0',
 '25016-2.0',
 '25017-2.0',
 '25018-2.0',
 '25019-2.0',
 '25020-2.0',
 '25021-2.0',
 '25022-2.0',
 '25023-2.0',
 '25024-2.0',
 '25025-2.0',
 '25026-2.0',
 '25027-2.0',
 '25028-2.0',
 '25029-2.0',
 '25030-2.0',
 '25031-2.0',
 '25032-2.0',
 '25033-2.0',
 '25034-2.0',
 '25035-2.0',
 '25036-2.0',
 '25037-2.0',
 '25038-2.0',
 '25039-2.0',
 '25040-2.0',
 '25042-2.0',
 '25044-2.0',
 '25046-2.0',
 '25048-2.0',
 '25050-2.0',
 '25052-2.0',
 '25054-2.0',
 '25056-2.0',
 '25057-2.0',
 '25058-2.0',
 '25059-2.0',
 '25060-2.0',
 '25061-2.0',
 '25062-2.0',
 '25063-2.0',
 '25064-2.0',
 '25065-2.0',
 '25066-2.0',
 '25067-2.0',
 '25068-2.0',
 '25069-2.0',
 '25070-2.0',
 '25071-2.0',
 '25072-2.0',
 '25073-2.0',
 '25074-2.0',
 '25075-2.0',
 '25076-2.0',
 '25077-2.0',
 '25078-2.0',
 '2507

In [None]:
overlap_cog = ['4282-2.0', '20016-2.0', '20023-2.0', '20197-2.0']
covariates_df = pd.read_parquet('doubleML_covariates.parquet', engine='fastparquet', 
                               columns=['IID', 'bmi', 'mdi', 'education_years', 'curr_age', '31-0.0', 
                                      'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4', 'groups'])

covariates_df.dropna(subset = ['bmi', 'curr_age', '31-0.0', 'groups', 'education_years', 'e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4'], inplace=True)

covariates_df.rename(columns={'IID': 'eid'}, inplace=True)
ct_df = pd.read_parquet('cognitive_test_results_2.parquet', engine='fastparquet', 
                       columns=['eid'] + overlap_cog)

imaging_df = pd.read_parquet('../../../randy/rfb/tidy_data/UKBiobank/dementia/neuroimaging/X.parquet', engine='fastparquet')
imaging_only = imaging_df[['eid'] + [col for col in imaging_df.columns if (col.startswith('25') or col.startswith('27') or col.startswith('26'))]]

covariates_df = covariates_df.merge(imaging_only, on='eid', how='inner').merge(ct_df, on='eid', how='inner')

In [None]:


# Usage with the fixed function:
test_df = covariates_to_add_cates('4282-2.0', '25000-2.0', covariates_df, cates_array, imaging_metrics, cognitive_tests)
test_df

After filtering for 4282-2.0 and 25000-2.0: 14705 rows
CATE array subset length: 14705
DataFrame length: 14705


Unnamed: 0,eid,bmi,mdi,curr_age,e2/e2,e3/e3,e2/e3,e3/e4,e2/e4,e4/e4,education_years,4282-2.0,25000-2.0,cates_value
0,1000396,24.4995,7.59000,73.0,0,0,1,0,0,0,16.0,7.0,1.32265,-1.458948
1,1000776,29.0941,3.14000,78.0,0,1,0,0,0,0,21.0,5.0,1.49066,-2.207606
2,1000947,28.1909,4.09000,73.0,0,0,1,0,0,0,21.0,9.0,1.20067,-2.233854
3,1001758,33.6916,43.10000,73.0,0,0,0,1,0,0,17.0,8.0,1.18984,-1.230764
4,1002249,36.3119,9.96000,66.0,0,0,0,0,1,0,18.0,5.0,1.31902,-1.877043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14700,6023450,24.7620,39.82000,64.0,0,1,0,0,0,0,17.0,7.0,1.39307,-0.684127
14701,6023530,20.5191,23.00000,63.0,0,0,0,0,1,0,16.0,7.0,1.35110,-1.280907
14702,6023611,22.2816,11.19000,70.0,0,0,0,1,0,0,18.0,7.0,1.30279,-1.931781
14703,6024208,30.8845,14.25000,67.0,0,1,0,0,0,0,16.0,5.0,1.39270,-1.095007


In [68]:

# If you want to analyze multiple imaging metrics and cognitive tests
import pandas as pd
from IPython.display import display

def analyze_cates_by_education_tertiles(cog, img, covariates_df, cates_array, imaging_metrics, cognitive_tests):
    """
    Analyze CATE values by education tertiles for a specific cognitive test and imaging metric.
    """
    # Get the data for this combination
    test_df = covariates_to_add_cates(cog, img, covariates_df, cates_array, imaging_metrics, cognitive_tests)
    
    # Create education tertiles
    test_df['education_tert'] = pd.qcut(test_df['education_years'], q=3, labels=['low', 'mid', 'high'])
    
    # Compute summary statistics
    summary = test_df.groupby('education_tert').agg({
        'cates_value': ['mean', 'std', 'count'],
        'education_years': ['mean', 'min', 'max']
    }).round(4)
    
    # Add metadata
    summary_flat = test_df.groupby('education_tert')['cates_value'].agg(['mean', 'std']).reset_index()
    summary_flat['imaging_metric'] = img
    summary_flat['cognitive_test'] = cog
    
    return summary, summary_flat

# Example usage for your specific case:
cog_test = '4282-2.0'
img_metric = '25000-2.0'

detailed_summary, simple_summary = analyze_cates_by_education_tertiles(
    cog_test, img_metric, covariates_df, cates_array, imaging_metrics, cognitive_tests
)

print(f"Analysis for {img_metric} → {cog_test}")
print("="*50)
display(detailed_summary)
print("\nSimple Summary:")
display(simple_summary)


After filtering for 4282-2.0 and 25000-2.0: 14705 rows
CATE array subset length: 14705
DataFrame length: 14705
Analysis for 25000-2.0 → 4282-2.0


  summary = test_df.groupby('education_tert').agg({
  summary_flat = test_df.groupby('education_tert')['cates_value'].agg(['mean', 'std']).reset_index()


Unnamed: 0_level_0,cates_value,cates_value,cates_value,education_years,education_years,education_years
Unnamed: 0_level_1,mean,std,count,mean,min,max
education_tert,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
low,-1.144,0.358,7630,15.616,-2.0,16.0
mid,-1.4393,0.3353,4367,17.5493,17.0,18.0
high,-1.9765,0.4718,2708,21.2378,19.0,35.0



Simple Summary:


Unnamed: 0,education_tert,mean,std,imaging_metric,cognitive_test
0,low,-1.143977,0.357985,25000-2.0,4282-2.0
1,mid,-1.439279,0.335298,25000-2.0,4282-2.0
2,high,-1.976501,0.471833,25000-2.0,4282-2.0


In [58]:
test_df

Unnamed: 0,eid,bmi,mdi,curr_age,e2/e2,e3/e3,e2/e3,e3/e4,e2/e4,e4/e4,education_years,20197-2.0,25479-2.0,cates_value,Education Level
0,1000396,24.4995,7.59000,73.0,0,0,1,0,0,0,16.0,7.0,0.127578,-38.814930,0
1,1000776,29.0941,3.14000,78.0,0,1,0,0,0,0,21.0,7.0,0.104432,-30.468605,1
2,1000947,28.1909,4.09000,73.0,0,0,1,0,0,0,21.0,6.0,0.285855,-43.155285,1
3,1001758,33.6916,43.10000,73.0,0,0,0,1,0,0,17.0,2.0,0.107248,-35.920963,1
4,1002536,28.4512,35.12000,63.0,0,0,0,1,0,0,15.0,6.0,0.066842,-34.431553,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13879,6023224,31.3291,7.61000,79.0,0,1,0,0,0,0,15.0,0.0,0.073636,,0
13880,6023450,24.7620,39.82000,64.0,0,1,0,0,0,0,17.0,10.0,0.078580,,1
13881,6023611,22.2816,11.19000,70.0,0,0,0,1,0,0,18.0,7.0,0.073165,,1
13882,6024208,30.8845,14.25000,67.0,0,1,0,0,0,0,16.0,7.0,0.076193,,0


In [70]:
from lightgbm import LGBMRegressor
X = test_df.drop(columns=['cates_value', 'eid', '4282-2.0', '25000-2.0'])
y = test_df['cates_value']
LGBM_model = LGBMRegressor()
LGBM_model.fit(X, y)

fi = LGBM_model.feature_importances_
fnames = LGBM_model.feature_name_

feature_importance_df = pd.DataFrame({'feature': fnames, 'importance': fi})
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 587
[LightGBM] [Info] Number of data points in the train set: 14705, number of used features: 11
[LightGBM] [Info] Start training from score -1.384988


In [72]:
feature_importance_df

Unnamed: 0,feature,importance
9,education_years,844
1,mdi,748
2,curr_age,650
0,bmi,177
6,e3/e4,168
7,e2/e4,132
4,e3/e3,124
8,e4/e4,82
3,e2/e2,44
5,e2/e3,31


In [56]:
# implement LGBM 

from lightgbm import LGBMRegressor

feature_importances = {}

for i in imaging_metrics: 
    feature_importances[i] = {}
    for c in cognitive_tests:
        test_df = covariates_to_add_cates(c, i, covariates_df, cates_array, imaging_metrics, cognitive_tests)

        # fit
        X = test_df.drop(columns=['cates_value', 'eid', c, i])
        y = test_df['cates_value']

        LGBM_model = LGBMRegressor()
        LGBM_model.fit(X, y)

        # construct feature importances table
        fi = LGBM_model.feature_importances_
        fnames = LGBM_model.feature_name_

        feature_importance_df = pd.DataFrame({'feature': fnames, 'importance': fi})
        feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)

        feature_importances[i][c] = feature_importance_df


After filtering for 20016-2.0 and 25000-2.0: 19862 rows
CATE array subset length: 14864
DataFrame length: 19862
Truncated both to length: 14864
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 584
[LightGBM] [Info] Number of data points in the train set: 14864, number of used features: 10
[LightGBM] [Info] Start training from score -1.280083
After filtering for 20023-2.0 and 25000-2.0: 20229 rows
CATE array subset length: 14864
DataFrame length: 20229
Truncated both to length: 14864
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 584
[LightGBM] [Info] Number of data points in the 

KeyboardInterrupt: 

In [None]:
from doubleml_utils import merge_folds

df = merge_folds('imaging_to_cog', 'double_ml/imaging/results')

In [None]:
all_results = pd.read_csv('double_ml/imaging/results/imaging_to_cog/all_results.csv')

In [28]:
# Bonferroni-corrected threshold for 4 tests
alpha = 0.05 / 4

# Pivot to wide format: Imaging Metric as index, Cognitive Test columns, P>|t| as values
pvals = all_results.pivot(index='Imaging Metric', columns='Cognitive Test', values='P>|t|')

# Find imaging metrics where all 4 cognitive tests are significant
sig_metrics = pvals[(pvals[overlap_cog] < alpha).all(axis=1)].index.tolist()

# Filter all_results for these imaging metrics
sig_results = all_results[all_results['Imaging Metric'].isin(sig_metrics)]

sig_results

Unnamed: 0,Imaging Metric,Cognitive Test,coef,std err,t,P>|t|,2.5 %,97.5 %
0,25000-2.0,4282-2.0,-1.287195,0.174328,-7.383756,1.538857e-13,-1.628871,-0.945518
1,25000-2.0,20016-2.0,-1.230470,0.205471,-5.988544,2.117281e-09,-1.633185,-0.827755
2,25000-2.0,20023-2.0,140.752507,11.751644,11.977261,4.675181e-33,117.719707,163.785306
3,25000-2.0,20197-2.0,5.404706,0.296206,18.246446,2.208169e-74,4.824153,5.985259
4,25001-2.0,4282-2.0,-0.000011,0.000001,-7.495370,6.611139e-14,-0.000014,-0.000008
...,...,...,...,...,...,...,...,...
15775,25506-2.0,20197-2.0,-101.985230,7.974312,-12.789221,1.883534e-37,-117.614593,-86.355866
15776,25507-2.0,4282-2.0,-42.484014,6.542968,-6.493079,8.409931e-11,-55.307996,-29.660031
15777,25507-2.0,20016-2.0,-36.664368,7.542127,-4.861277,1.166309e-06,-51.446665,-21.882071
15778,25507-2.0,20023-2.0,4233.194934,533.811539,7.930130,2.189165e-15,3186.943542,5279.446326


In [35]:
sig_results['Imaging Metric'].unique().tolist()

['25000-2.0',
 '25001-2.0',
 '25002-2.0',
 '25003-2.0',
 '25004-2.0',
 '25005-2.0',
 '25006-2.0',
 '25007-2.0',
 '25008-2.0',
 '25009-2.0',
 '25010-2.0',
 '25011-2.0',
 '25012-2.0',
 '25013-2.0',
 '25014-2.0',
 '25015-2.0',
 '25016-2.0',
 '25017-2.0',
 '25018-2.0',
 '25019-2.0',
 '25020-2.0',
 '25021-2.0',
 '25022-2.0',
 '25023-2.0',
 '25024-2.0',
 '25025-2.0',
 '25026-2.0',
 '25027-2.0',
 '25028-2.0',
 '25029-2.0',
 '25030-2.0',
 '25031-2.0',
 '25032-2.0',
 '25033-2.0',
 '25034-2.0',
 '25035-2.0',
 '25036-2.0',
 '25037-2.0',
 '25039-2.0',
 '25040-2.0',
 '25042-2.0',
 '25044-2.0',
 '25046-2.0',
 '25048-2.0',
 '25050-2.0',
 '25052-2.0',
 '25054-2.0',
 '25056-2.0',
 '25059-2.0',
 '25061-2.0',
 '25062-2.0',
 '25063-2.0',
 '25064-2.0',
 '25065-2.0',
 '25066-2.0',
 '25068-2.0',
 '25069-2.0',
 '25070-2.0',
 '25071-2.0',
 '25072-2.0',
 '25073-2.0',
 '25074-2.0',
 '25075-2.0',
 '25076-2.0',
 '25078-2.0',
 '25079-2.0',
 '25080-2.0',
 '25082-2.0',
 '25083-2.0',
 '25084-2.0',
 '25085-2.0',
 '2508

In [33]:
# testing the idea of education as protective for cognitive function, using imaging data as covariates
from econml.dml import CausalForestDML
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from doubleml_utils import run_dml

df = covariates_df.fillna(covariates_df.mean())

# treatment as education, outcome as cognitive test scores, x as imaging data
results = {}
cates = {}

for test in overlap_cog:
    nan_count = df[test].isna().sum()
    nan_pct = nan_count / len(df)
    if nan_count > 0 and nan_pct < 0.2:
        df = df[df[test].notna()]

    Y = df[test].values
    T = df['education_years'].values
    X = df.drop(columns=overlap_cog + ['education_years', 'groups', 'eid']).values

    # DML with forest-based CATE estimator
    model = CausalForestDML(
        model_y=GradientBoostingRegressor(),
        model_t=GradientBoostingRegressor(),
        n_estimators=100,
        min_samples_leaf=10,
        max_depth=10,
        discrete_treatment=False,
        random_state=42
    )
    model.fit(Y, T, X=X)

    # Estimate overall ATE
    ate = model.ate(X)
    results[test] = {"ATE": ate, "model": model, "summary": model.summary}

    # Estimate CATEs
    cate = model.effect(X)
    cates[test] = cate


KeyboardInterrupt: 

In [None]:
cates.to_csv('CATES_FOR_LATER.csv')

In [None]:
import pandas as pd

cate_df = pd.DataFrame(cates[test], columns=['cate'])
brain_df = df.drop(columns=overlap_cog + ['education_years', 'groups', 'eid']).reset_index(drop=True)

corr = cate_df['cate'].corr(brain_df['hippocampal_volume'])  # example




In [None]:
results

Path Analysis 

In [None]:
from sklearn.decomposition import PCA

imaging_df_scaled.fillna(imaging_df_scaled.mean(), inplace=True)
image_only = imaging_df_scaled.drop(columns=['eid'])


# Reduce 500 brain variables to 20 components
brain_pca = PCA(n_components=20).fit_transform(image_only[overlap_image])
for i in range(20):
    imaging_df_scaled[f'brain_pc{i+1}'] = brain_pca[:, i]



In [None]:
imaging_df_scaled = imaging_df_scaled.merge(covariates_df, on='eid', how='inner')

In [None]:
final_df = imaging_df_scaled[['eid', 'education_years', 'groups']+overlap_image + overlap_cog]


In [None]:
final_df.to_parquet('imaging_ed_cog_brain_FOR_R.parquet', engine = 'fastparquet', index=False)

In [None]:
hmmm_df = imaging_df_scaled.merge(covariates_df, on='eid', how='inner')
final_df = hmmm_df[['eid', 'education_years', 'groups']+[col for col in hmmm_df.columns if (col.startswith('25') or col.startswith('27') or col.startswith('26'))] + overlap_cog]
final_df

In [None]:
final_df['eid'] = final_df['eid'].astype('int64')

final_df.to_parquet('imaging_ed_cog_brain_FOR_R.parquet', engine = 'fastparquet')

In [None]:
final_df.isna().sum()

In [None]:
brain_pca = PCA(n_components=20)
brain_pca.fit(imaging_df_scaled[overlap_image])
loadings = pd.DataFrame(brain_pca.components_, columns=imaging_df_scaled[overlap_image].columns)
loadings.abs().idxmax(axis=1)


In [None]:
final_df.isna().sum()

In [None]:
from semopy import Model, Optimizer
import pandas as pd

final_df.fillna(final_df.mean(), inplace=True)

# Debug: Check what we're working with
print("overlap_cog:", overlap_cog)
print("Available columns:", final_df.columns.tolist())

# Rename columns in final_df to match what we expect
cog_vars_clean = []
for var in overlap_cog:
    clean_var = var.replace('-', '_').replace('.', '_')
    # Rename the column in the dataframe if original exists
    if var in final_df.columns:
        final_df = final_df.rename(columns={var: clean_var})
        cog_vars_clean.append(clean_var)
        print(f"Renamed {var} -> {clean_var}")
    elif clean_var in final_df.columns:
        # Already renamed
        cog_vars_clean.append(clean_var)
        print(f"Found already cleaned variable: {clean_var}")
    else:
        print(f"WARNING: {var} not found in final_df")

if len(cog_vars_clean) == 0:
    print("ERROR: No cognitive variables found!")
else:
    cog_vars = " + ".join(cog_vars_clean)
    print("Final cog_vars string:", repr(cog_vars))

    desc = f"""
brain_reserve =~ brain_pc1 + brain_pc2 + brain_pc3 + brain_pc4 + brain_pc5 + brain_pc6 + brain_pc7 + brain_pc8 + brain_pc9 + brain_pc10 + brain_pc11 + brain_pc12 + brain_pc13 + brain_pc14 + brain_pc15 + brain_pc16 + brain_pc17 + brain_pc18 + brain_pc19 + brain_pc20
cognitive_reserve =~ {cog_vars}

brain_reserve ~ education_years
cognitive_reserve ~ education_years + brain_reserve
groups ~ education_years + brain_reserve + cognitive_reserve
"""

    print("Model specification:")
    print(desc)

    try:
        model = Model(desc)
        model.fit(final_df)
        report = model.inspect(std_est=True)
        print(report)
    except Exception as e:
        print(f"Model error: {e}")

In [None]:
report.head(20)

In [None]:
report

Mediation - Education, Images, AD

In [None]:
import networkx as nx
from dowhy import CausalModel
from dowhy.causal_estimators import linear_regression_estimator
from dowhy.causal_estimators.regression_estimator import RegressionEstimator
import pandas as pd

mediators = overlap  # List of mediators from the mediation analysis
results = []

covariates_df = covariates_df.dropna(subset=['education_years', 'curr_age', '31-0.0'])

for mediator in mediators:
        # Create a subgraph with only the current mediator
        gml_graph_single = f"""
        graph [
        directed 1
        node [ id 0 label "education_years" ]
        node [ id 1 label "groups" ]
        node [ id 2 label "{mediator}" ]
        node [ id 3 label "e2/e2" ]
        node [ id 4 label "e3/e3" ]
        node [ id 5 label "e2/e3" ]
        node [ id 6 label "e3/e4" ]
        node [ id 7 label "e2/e4" ]
        node [ id 8 label "e4/e4" ]
        node [ id 9 label "curr_age" ]
        node [ id 10 label "31-0.0" ]
        # Covariates to treatment and outcome
        edge [ source 3 target 0 ]
        edge [ source 3 target 1 ]
        edge [ source 4 target 0 ]
        edge [ source 4 target 1 ]
        edge [ source 5 target 0 ]
        edge [ source 5 target 1 ]
        edge [ source 6 target 0 ]
        edge [ source 6 target 1 ]
        edge [ source 7 target 0 ]
        edge [ source 7 target 1 ]
        edge [ source 8 target 0 ]
        edge [ source 8 target 1 ]
        edge [ source 9 target 0 ]
        edge [ source 9 target 1 ]
        edge [ source 10 target 0 ]
        edge [ source 10 target 1 ]

        # Covariates to mediator
        edge [ source 3 target 2 ]
        edge [ source 4 target 2 ]
        edge [ source 5 target 2 ]
        edge [ source 6 target 2 ]
        edge [ source 7 target 2 ]
        edge [ source 8 target 2 ]
        edge [ source 9 target 2 ]
        edge [ source 10 target 2 ]

        # Treatment to mediator
        edge [ source 0 target 2 ]
        # Mediator to outcome
        edge [ source 2 target 1 ]
        ]

        edge [ source 0 target 1 ]
        """
        with open("causal_graph_single.gml", "w") as f:
            f.write(gml_graph_single)
        gml_graph = nx.read_gml("causal_graph_single.gml")

        covariates_df = covariates_df.dropna(subset=mediator)
    
        model = CausalModel(
            data=covariates_df,
            treatment='education_years',
            outcome='groups',
            graph=gml_graph,
        )

        identified_estimand_indirect = model.identify_effect(estimand_type='nonparametric-nie', proceed_when_unidentifiable=True)
        #print(identified_estimand_indirect)

        causal_estimate_nie = model.estimate_effect(
            identified_estimand_indirect,
            method_name="mediation.two_stage_regression",
            control_value = 0,
            treatment_value = 1,
            effect_modifiers = ['e2/e2', 'e3/e3', 'e2/e3', 'e3/e4', 'e2/e4', 'e4/e4'],
            confidence_intervals=False,
            test_significance=False,
            method_params={
                'first_stage_model': linear_regression_estimator.LinearRegressionEstimator,
                'second_stage_model': linear_regression_estimator.LinearRegressionEstimator,
            }
        )
        results.append({
            'mediator': mediator,
            'nie': causal_estimate_nie.value,
            #'CATE': causal_estimate_nie.cate_estimates, 
        })


In [None]:
from semopy import Model, Optimizer
import pandas as pd

desc = """
brain_reserve =~ hippocampal_volume + icv + cortical_thickness
cognitive_reserve =~ memory_score + executive_function + processing_speed

brain_reserve ~ education_years
cognitive_reserve ~ education_years + brain_reserve
ad_dx ~ education_years + brain_reserve + cognitive_reserve
"""

model = Model(desc)
model.fit(df)  # df = pandas DataFrame
report = model.inspect(std_est=True)


In [None]:
results = pd.DataFrame(results).sort_values(by='nie', key=abs, ascending=False)

In [None]:
results.to_csv('./double_ml/imaging_ed_to_AD_mediation_results.csv', index=False)