In [14]:
import os
os.environ['QT_API'] = 'PyQt6'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as pn
import pyarrow as pa
import pyarrow.parquet as pq
from great_tables import GT as gt
from janitor import clean_names
from matplotlib import rcParams
from pyhere import here
# import seaborn as sns

# Notebook-wide options
pd.options.display.max_columns = None   # never truncate the column list when a frame is displayed
pd.options.mode.copy_on_write = True    # opt in to pandas copy-on-write behaviour
rcParams['savefig.bbox'] = 'tight'  # Keeps plotnine legend from being cut off

from functions import nhanes_link_doc, link_convert, read_data_list, reverse_binary, remove_and_clean

NHANES cycle suffixes used in the data file names:

* G = 2011-2012
* H = 2013-2014

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'CFQ') for year in [2011, 2013]]

# NHANES CFQ variable code -> readable column name.
# Insertion order here is also the column order of cog_df.
cog_column_names = {
    'SEQN': 'id',
    'CFDCCS': 'cerad_complete_status',  # must be 1 to be involved
    'CFDCST1': 'cerad_score_trial1_recall',
    'CFDCST2': 'cerad_score_trial2_recall',
    'CFDCST3': 'cerad_score_trial3_recall',
    'CFDCSR': 'cerad_score_delay_recall',
    'CFDCIT1': 'cerad_intrusion_wordcount_trial1',
    'CFDCIT2': 'cerad_intrusion_wordcount_trial2',
    'CFDCIT3': 'cerad_intrusion_wordcount_trial3',
    'CFDCIR': 'cerad_intrusion_wordcount_recall',
    'CFDAPP': 'animal_fluency_sample_test',  # must be 1 to be involved
    'CFDAST': 'animal_fluency_score',
    'CFDDPP': 'digit_symbol_sample_test',
    'CFDDS': 'digit_symbol_score',
}

cog = read_data_list('CFQ', [2011, 2013])

# Stack the first two frames returned by read_data_list, apply the readable
# names, and keep only the mapped columns.
cog_df = (
    pd.concat([cog[0], cog[1]])
    .rename(columns=cog_column_names)
    .loc[:, list(cog_column_names.values())]
)

cog_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'HDL') for year in [2011, 2013]]

hdl = read_data_list('HDL', [2011, 2013])

# NHANES HDL variable code -> readable column name.
# The HDL file carries the same measurement twice: LBDHDD in mg/dL and
# LBDHDDSI in mmol/L. The target column is named (and later compared with the
# other lipid columns) in mg/dL, so the mg/dL variable is the one to map.
hdl_column_names = {'SEQN': 'id',
                    'LBDHDD': 'hdl_mg_dl'}

# Stack the first two frames returned by read_data_list, apply the readable
# names, and keep only the mapped columns.
hdl_df = (
    pd.concat([hdl[0], hdl[1]])
    .rename(columns=hdl_column_names)
    .loc[:, list(hdl_column_names.values())]
)

hdl_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'TRIGLY') for year in [2011, 2013]]

# NHANES TRIGLY variable code -> readable column name (dict order == column order).
ldl_column_names = {
    'SEQN': 'id',
    'LBXTR': 'trigly_mg_dl',
    'LBDLDL': 'ldl_mg_dl',
}

ldl = read_data_list('TRIGLY', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
ldl_df = (
    pd.concat([ldl[0], ldl[1]])
    .rename(columns=ldl_column_names)
    .loc[:, list(ldl_column_names.values())]
)

ldl_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'BIOPRO') for year in [2011, 2013]]

# NHANES BIOPRO variable code -> readable column name (dict order == column order).
# NOTE(review): the NHANES codebook appears to list LBXSBU in mg/dL, so the
# 'bun_g_dl' name may be a unit misnomer — confirm before renaming.
bio_column_names = {
    # direct links to cardiovascular issues
    'SEQN': 'id',
    'LBXSASSI': 'ast_u_l',
    'LBXSCH': 'cholesterol_mg_dl',
    'LBXSCK': 'cpk_iu_l',
    'LBXSGL': 'glucose_mg_dl',
    'LBXSLDSI': 'ldh_u_l',
    'LBXSTR': 'tri_mg_dl',
    # indirect links to cardiovascular issues
    'LBXSBU': 'bun_g_dl',
    'LBXSCLSI': 'chloride_mmol_l',
    'LBXSKSI': 'potassium_mmol_l',
    'LBXSNASI': 'sodium_mmol_l',
    'LBXSUA': 'uric_acid_mg_dl',
    'LBXSC3SI': 'bicarbonate_mmol_l',
    # indirect link to cognitive issues
    'LBXSAL': 'albumin_g_dl',
}

bio = read_data_list('BIOPRO', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
bio_df = (
    pd.concat([bio[0], bio[1]])
    .rename(columns=bio_column_names)
    .loc[:, list(bio_column_names.values())]
)

bio_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'DEMO') for year in [2011, 2013]]

# NHANES DEMO variable code -> readable column name (dict order == column order).
demo_columns = {
    'SEQN': 'id',
    'RIAGENDR': 'sex',
    'RIDAGEYR': 'age',
    'RIDRETH3': 'race_ethnic',
    'DMDBORN4': 'birth_country',
    'DMDCITZN': 'citizen',
    'DMDYRSUS': 'length_us',
    'DMDEDUC2': 'ed',
    'DMDMARTL': 'marital',
    'INDHHIN2': 'annual_house_income',
    'INDFMPIR': 'fam_income_pov_ratio',
    'DMDHHSIZ': 'total_num_house',
    'DMDFMSIZ': 'total_num_fam',
}

demo = read_data_list('DEMO', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
demo_df = (
    pd.concat([demo[0], demo[1]])
    .rename(columns=demo_columns)
    .loc[:, list(demo_columns.values())]
)

demo_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'ALQ') for year in [2011, 2013]]

# NHANES ALQ variable code -> readable column name (dict order == column order).
alq_columns = {
    'SEQN': 'id',
    'ALQ101': 'drinks_12_yr',
    'ALQ151': 'ever_45_drink_everyday',
}

alq = read_data_list('ALQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
alq_df = (
    pd.concat([alq[0], alq[1]])
    .rename(columns=alq_columns)
    .loc[:, list(alq_columns.values())]
)

alq_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'DPQ') for year in [2011, 2013]]

depress = read_data_list('DPQ', [2011, 2013])

# NHANES DPQ variable code -> readable column name (dict order == column order).
# 'feel_bad_self' was previously misspelled 'feed_bad_self'.
depress_columns = {'SEQN': 'id',
    'DPQ010': 'little_interest_things',
    'DPQ020': 'down_depress_hopeless',
    'DPQ030': 'sleep_issues',
    'DPQ040': 'tired',
    'DPQ050': 'diet_issues',
    'DPQ060': 'feel_bad_self',
    'DPQ070': 'concentrate_issues',
    'DPQ080': 'move_speak_issues',
    'DPQ090': 'better_off_dead',
    'DPQ100': 'diff_prob_cause'}

# Stack the first two frames, rename, and keep only the mapped columns —
# the explicit selection mirrors every other component cell in this notebook.
depress_df = (
    pd.concat([depress[0], depress[1]])
    .rename(columns=depress_columns)
    .loc[:, list(depress_columns.values())]
)

depress_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'BPQ') for year in [2011, 2013]]

# NHANES BPQ variable code -> readable column name (dict order == column order).
bp_columns = {
    'SEQN': 'id',
    'BPQ020': 'told_high_bp',
    'BPQ030': 'told_high_bp_2plus',
    'BPQ080': 'dr_told_high_chol',
}

bp = read_data_list('BPQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
bp_df = (
    pd.concat([bp[0], bp[1]])
    .rename(columns=bp_columns)
    .loc[:, list(bp_columns.values())]
)

bp_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'DIQ') for year in [2011, 2013]]

# NHANES DIQ variable code -> readable column name (dict order == column order).
diabetes_columns = {
    'SEQN': 'id',
    'DIQ010': 'dr_told_diabetes',
    'DIQ160': 'told_prediabetes',
    'DIQ170': 'told_risk_diabetes',
    'DIQ172': 'feel_risk_diabetes',
}

diabetes = read_data_list('DIQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
dia_df = (
    pd.concat([diabetes[0], diabetes[1]])
    .rename(columns=diabetes_columns)
    .loc[:, list(diabetes_columns.values())]
)

dia_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'DBQ') for year in [2011, 2013]]

# NHANES DBQ variable code -> readable column name (dict order == column order).
diet_columns = {
    'SEQN': 'id',
    'DBD895': 'num_meal_not_home_prepare',
    'DBD905': 'num_ready_eat_30day',
    'DBD910': 'num_frozen_meal_30day',
}

diet = read_data_list('DBQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
diet_df = (
    pd.concat([diet[0], diet[1]])
    .rename(columns=diet_columns)
    .loc[:, list(diet_columns.values())]
)

diet_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'DUQ') for year in [2011, 2013]]

# NHANES DUQ variable code -> readable column name (dict order == column order).
# NOTE(review): the missingness table further down reports ever_use_weed as
# 100% missing in the Latino subsample — check whether it is worth keeping.
drug_columns = {
    'SEQN': 'id',
    'DUQ200': 'ever_use_weed',
    'DUQ240': 'ever_use_coke_heroin_meth',
}

drug = read_data_list('DUQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
drug_df = (
    pd.concat([drug[0], drug[1]])
    .rename(columns=drug_columns)
    .loc[:, list(drug_columns.values())]
)

drug_df.head()

In [None]:
# The DUQ files were already read and reshaped into drug_df by the preceding
# cell (this cell used to be a verbatim copy of it), so only display the
# result here instead of reading the same data a second time.
drug_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'BMX') for year in [2011, 2013]]

# NHANES BMX variable code -> readable column name (dict order == column order).
bmi_waist_columns = {
    'SEQN': 'id',
    'BMXBMI': 'bmi',
    'BMXWAIST': 'waist_cir',
}

bmi_waist = read_data_list('BMX', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
bmi_waist_df = (
    pd.concat([bmi_waist[0], bmi_waist[1]])
    .rename(columns=bmi_waist_columns)
    .loc[:, list(bmi_waist_columns.values())]
)

bmi_waist_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'MCQ') for year in [2011, 2013]]

# NHANES MCQ variable code -> readable column name (dict order == column order).
med_cond_columns = {
    'SEQN': 'id',
    'MCQ080': 'dr_told_overweight',
    'MCQ084': 'diff_think_remember',
    'MCQ160B': 'told_heart_fail',
    'MCQ160C': 'told_heart_disease',
    'MCQ160D': 'told_angina',
    'MCQ160E': 'told_heart_attack',
    'MCQ160F': 'told_stroke',
    'MCQ365A': 'dr_told_lose_wt',
    'MCQ365B': 'dr_told_exercise',
    'MCQ365C': 'dr_told_reduce_salt',
    'MCQ365D': 'dr_told_reduce_fat',
    'MCQ370A': 'now_lose_wt',
    'MCQ370B': 'now_exercise',
    'MCQ370C': 'now_reduce_salt',
    'MCQ370D': 'now_reduce_fat',
}

med_cond = read_data_list('MCQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
med_cond_df = (
    pd.concat([med_cond[0], med_cond[1]])
    .rename(columns=med_cond_columns)
    .loc[:, list(med_cond_columns.values())]
)

med_cond_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'PAQ') for year in [2011, 2013]]

# NHANES PAQ variable code -> readable column name (dict order == column order).
pa_columns = {
    'SEQN': 'id',
    'PAQ605': 'vig_work_pa',
    'PAQ620': 'mod_work_pa',
    'PAQ635': 'walk_bike',
    'PAQ650': 'vig_rec_act',
    'PAQ665': 'mod_rec_act',
    'PAQ710': 'hr_watch_tv_30day',
    'PAQ715': 'hr_comp_use_30day',
}

pa = read_data_list('PAQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
pa_df = (
    pd.concat([pa[0], pa[1]])
    .rename(columns=pa_columns)
    .loc[:, list(pa_columns.values())]
)

pa_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'SLQ') for year in [2011, 2013]]

# NHANES SLQ variable code -> readable column name (dict order == column order).
sleep_columns = {
    'SEQN': 'id',
    'SLD010H': 'sleep_hours_weekday',
    'SLQ050': 'dr_told_sleep_trouble',
    'SLQ060': 'dr_told_sleep_disorder',
}

sleep = read_data_list('SLQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
sleep_df = (
    pd.concat([sleep[0], sleep[1]])
    .rename(columns=sleep_columns)
    .loc[:, list(sleep_columns.values())]
)

sleep_df.head()

In [None]:
# Codebook lookup (left disabled): [nhanes_link_doc(year, 'SMQ') for year in [2011, 2013]]

# NHANES SMQ variable code -> readable column name (dict order == column order).
smoke_columns = {
    'SEQN': 'id',
    'SMQ020': 'smoke_100cig_life',
}

smoke = read_data_list('SMQ', [2011, 2013])

# Stack the first two frames, rename, and keep only the mapped columns.
smoke_df = (
    pd.concat([smoke[0], smoke[1]])
    .rename(columns=smoke_columns)
    .loc[:, list(smoke_columns.values())]
)

smoke_df.head()

In [None]:
# Build the analysis frame: start from demographics and outer-join each
# component table on 'id', so a row missing from any single table is kept
# (its columns from that table are NaN). The merge order below fixes the
# column order of df. Note that depress_df is not part of this merge.
df = demo_df
for _component_df in (
    alq_df,
    bio_df,
    bmi_waist_df,
    bp_df,
    cog_df,
    dia_df,
    diet_df,
    drug_df,
    hdl_df,
    ldl_df,
    med_cond_df,
    pa_df,
    sleep_df,
    smoke_df,
):
    df = df.merge(_component_df, how='outer', on='id')

In [None]:
# Percentage of missing values per column of the merged frame, rendered as a
# great_tables table of the ten most-incomplete columns.
# `gt` is imported with the other dependencies in the first cell, keeping all
# imports in one place for Restart & Run All.
pct_miss = df.isna().mean() * 100
gt(pct_miss.sort_values(ascending=False).head(10).reset_index()).show()

In [51]:
# Keep only rows that have a recorded value in all three cognitive-test
# status columns.
required_status_columns = [
    'cerad_complete_status',
    'animal_fluency_sample_test',
    'digit_symbol_sample_test',
]
df = df.dropna(subset=required_status_columns)

# Score columns that are currently NOT required to be non-missing:
# 'digit_symbol_score',
# 'animal_fluency_score',
# 'cerad_intrusion_wordcount_recall',
# 'cerad_score_delay_recall',

In [57]:
# Subsample whose race_ethnic code is 1 or 2, with every numeric value rounded
# to two decimals.
# NOTE(review): presumably these are the Mexican American / Other Hispanic
# categories of RIDRETH3 — confirm against the DEMO codebook.
latino_codes = [1, 2]
is_latino = df['race_ethnic'].isin(latino_codes)
latino = df.loc[is_latino].round(2)

In [58]:
# Percentage of missing values per column within the Latino subsample,
# rendered as a table of the ten most-incomplete columns.
n_latino_rows = len(latino)
pct_miss_lat = latino.isna().sum() / n_latino_rows * 100

top_missing_lat = pct_miss_lat.sort_values(ascending=False).head(10).reset_index()
gt(top_missing_lat).show()


  * Position 1: 0


index,0
ever_use_weed,100.0
ldl_mg_dl,53.1502423263328
trigly_mg_dl,52.34248788368336
told_high_bp_2plus,43.45718901453958
ever_use_coke_heroin_meth,33.60258481421648
told_prediabetes,32.794830371567045
length_us,30.533117932148627
told_risk_diabetes,27.78675282714055
feel_risk_diabetes,27.78675282714055
ever_45_drink_everyday,23.747980613893372


In [59]:
# Mean, minimum and maximum of each described column, one row per column.
# NOTE(review): the rendered maxima include values such as 99, 77 and 9, which
# look like questionnaire sentinel codes rather than real measurements —
# confirm they are recoded before these columns are modelled.
latino.describe().loc[['mean', 'min', 'max']].T

Unnamed: 0,mean,min,max
id,73291.581583,62209.0,83723.0
sex,1.515347,1.0,2.0
age,67.187399,60.0,80.0
race_ethnic,1.528271,1.0,2.0
birth_country,1.945073,1.0,77.0
...,...,...,...
hr_comp_use_30day,5.688207,0.0,8.0
sleep_hours_weekday,7.165049,3.0,99.0
dr_told_sleep_trouble,1.718901,1.0,2.0
dr_told_sleep_disorder,1.935380,1.0,9.0


In [60]:
# Pairwise correlations between the columns of the Latino subsample, rounded
# to two decimals for display.
corr_matrix = latino.corr().round(2)

# Keep only the strict upper triangle (k=1 skips the diagonal) so that each
# pair of columns appears exactly once; everything else becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)

# Flag strongly related pairs in EITHER direction: comparing the raw value
# against the threshold would silently miss strong negative correlations.
threshold = 0.7
high_corr_pairs = (
    upper_tri[upper_tri.abs() > threshold]
    .stack()
    .dropna()  # explicit, so masked-out cells never leak through stack()
    .sort_values(key=lambda s: s.abs(), ascending=False)  # strongest first, sign kept
)
high_corr_pairs

birth_country              marital                      0.95
total_num_house            total_num_fam                0.95
tri_mg_dl                  trigly_mg_dl                 0.95
cholesterol_mg_dl          ldl_mg_dl                    0.92
bmi                        waist_cir                    0.87
annual_house_income        fam_income_pov_ratio         0.86
cerad_score_trial2_recall  cerad_score_trial3_recall    0.71
now_reduce_salt            now_reduce_fat               0.71
dtype: float64

In [None]:
# latino.to_csv(here('latino_vascular_dementia_indicators.csv'))

In [None]:
# Stricter alternative (not applied): keep only rows where all three status columns equal 1.
# df = df.loc[(df['cerad_complete_status'] == 1) &
#        (df['animal_fluency_sample_test'] == 1) &
#        (df['digit_symbol_sample_test'] == 1)]

# Indicators Used

* Albumin
* Diabetes Diagnosis
* TODO: list the remaining indicators