In [5]:
import os
import sys

import numpy as np
import pandas as pd

import sklearn

In [6]:
demographic = pd.read_csv('raw_data/demographic.csv', encoding='ISO-8859-1')
diet = pd.read_csv('raw_data/diet.csv', encoding='ISO-8859-1')
examination = pd.read_csv('raw_data/examination.csv', encoding='ISO-8859-1')
labs = pd.read_csv('raw_data/labs.csv', encoding='ISO-8859-1')
medication = pd.read_csv('raw_data/medications.csv', encoding='ISO-8859-1')
questionnaire = pd.read_csv('raw_data/questionnaire.csv', encoding='ISO-8859-1')

# sex, race, age(age group)
# health outcome (examination, labs)
# lifestyle (diet, questionnaire)
variables = pd.read_csv('variables.csv')
variables_lst = variables['Variable'].tolist()

In [7]:
# print all variables in varaibles_lst that are in none of the dataframes
for var in variables_lst:
    if var not in demographic.columns and var not in diet.columns and var not in examination.columns and var not in labs.columns and var not in medication.columns and var not in questionnaire.columns:
        print(var)

In [8]:
# select columns in each dataset that are in the variables list (may not in it), join then by SEQN in each dataset

demographic_subset = demographic[["SEQN"]+list(set(variables_lst).intersection(demographic.columns))]
diet_subset = diet[["SEQN"]+list(set(variables_lst).intersection(diet.columns))]
examination_subset = examination[["SEQN"]+list(set(variables_lst).intersection(examination.columns))]
labs_subset = labs[["SEQN"]+list(set(variables_lst).intersection(labs.columns))]
medication_subset = medication[["SEQN"]+list(set(variables_lst).intersection(medication.columns))]
questionnaire_subset = questionnaire[["SEQN"]+list(set(variables_lst).intersection(questionnaire.columns))]
# merge all datasets
merged_df = demographic_subset.merge(diet_subset, on='SEQN').merge(examination_subset, on='SEQN').merge(labs_subset, on='SEQN').merge(medication_subset, on='SEQN').merge(questionnaire_subset, on='SEQN').drop_duplicates()

# replace all categorical variables with the categroies value in the Notes column in the variables.csv
# for example: DMDEDUC2,Education Level (Adults 20+),Demographics,Categorical,"Less than 9th grade, 9–11th grade, High school/GED, Some college/AA degree, College graduate or above"
# then 1 is "Less than 9th grade", 2 is "9–11th grade" and so on
# categorical variables
categorical_variables = variables[variables['Data Type'] == 'Categorical']
for i in range(1, len(merged_df.columns)):
    if merged_df.columns[i] in categorical_variables['Variable'].tolist():
        categories = categorical_variables[categorical_variables['Variable'] == merged_df.columns[i]]['Notes'].values[0].split(', ')
        # convert any value out of range to be NaN
        merged_df[merged_df.columns[i]] = merged_df[merged_df.columns[i]].apply(lambda x: categories[int(x)-1] if not pd.isnull(x) and int(x) <= len(categories) else np.nan)

# replace column names each column names' corresponding Description
merged_df.columns = ["Respondent"] + variables.set_index('Variable').loc[merged_df.columns[1:]]['Description'].tolist()  
merged_df

Unnamed: 0,Respondent,Country of Birth,Marital Status,Full Sample 2-Year MEC Exam Weight,Family Poverty Income Ratio,Education Level (Adults 20+),Pregnancy Status at Examination,Age of the Participant (years),Race/Ethnicity,Gender of the Participant,...,Frequency of Muscle-Strengthening Activities per Week,Time Spent Watching TV or Videos (minutes/day),Now Taking Hypertension Medication,Engaged in Vigorous Activity in Past 30 Days,Doctor Told You Have Diabetes,Ever Told Had Chlamydia,Ever Had at Least 12 Alcoholic Drinks in One Year,Smoked at Least 100 Cigarettes in Life,Sleep Duration (hours),Frequency of Vigorous Physical Activity per Week
0,73557,Born in 50 U.S. states or D.C.,Separated,13481.042095,0.84,High school/GED,,69,Non-Hispanic Black,Male,...,8.0,2.0,Yes,No,Yes,,,Yes,7.0,2.0
2,73558,Born in 50 U.S. states or D.C.,Married,24471.769625,1.78,High school/GED,,54,Non-Hispanic White,Male,...,8.0,2.0,Yes,No,Yes,,,Yes,9.0,4.0
6,73559,Born in 50 U.S. states or D.C.,Married,57193.285376,4.51,Some college/AA degree,,72,Non-Hispanic White,Male,...,0.0,2.0,Yes,No,Yes,,,Yes,8.0,4.0
11,73560,Born in 50 U.S. states or D.C.,,55766.512438,2.52,,,9,Non-Hispanic White,Male,...,2.0,,,,No,,,,,3.0
12,73561,Born in 50 U.S. states or D.C.,Married,65541.871229,5.00,College graduate or above,,73,Non-Hispanic White,Female,...,1.0,2.0,No,No,No,,,No,9.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19575,83727,Born in 50 U.S. states or D.C.,Never married,28115.568221,,College graduate or above,,26,Other Hispanic,Male,...,1.0,2.0,No,No,No,,,No,7.0,2.0
19576,83728,Born in 50 U.S. states or D.C.,,16512.138781,1.95,,,2,Mexican American,Female,...,0.0,,,,No,,,,,3.0
19577,83729,Born in Mexico,Divorced,26902.344381,3.66,College graduate or above,Not pregnant,42,Non-Hispanic Black,Female,...,1.0,2.0,No,No,No,,,No,7.0,5.0
19578,83730,Born in 50 U.S. states or D.C.,,26686.025573,1.05,,,7,Other Hispanic,Male,...,2.0,,,,No,,,,,1.0


In [9]:
# check any col have nan > 50%
nan_count = merged_df.isna().sum()
nan_count = nan_count[nan_count > 0]
nan_count = nan_count/merged_df.shape[0]
nan_count = nan_count[nan_count > 0.5]
nan_count

Pregnancy Status at Examination                      0.876185
Triglycerides (mg/dL)                                0.679405
Calculated LDL Cholesterol (mg/dL)                   0.683583
Urinary Albumin (mg/L)                               0.729542
Urine Creatinine (mg/dL)                             0.725874
Number of Prescription Medications Taken             0.595435
Ever Told Had Genital Herpes                         0.824009
Current Smoking Frequency                            0.746255
Ever Told Had Chlamydia                              0.823703
Ever Had at Least 12 Alcoholic Drinks in One Year    0.834200
dtype: float64

In [10]:
# save processed data
merged_df.to_csv('processed_data.csv', index=False)

In [11]:
# get list of lifestyle variables health outcomes, demographics 
lifestyle_factors = list(set(merged_df.columns).intersection(variables[variables['Category'] == 'Lifestyle Factors']['Description'].tolist()))
print(lifestyle_factors)
health_outcomes = list(set(merged_df.columns).intersection(variables[variables['Category'] == 'Health Outcomes']['Description'].tolist()))
print(health_outcomes)
demographics = list(set(merged_df.columns).intersection(variables[variables['Category'] == 'Demographics']['Description'].tolist()))
print(demographics)

['Calcium Intake (mg)', 'Smoked at Least 100 Cigarettes in Life', 'Ever Had at Least 12 Alcoholic Drinks in One Year', 'Frequency of Muscle-Strengthening Activities per Week', 'Sugar Intake (grams)', 'Time Spent Watching TV or Videos (minutes/day)', 'Alcohol Intake (grams)', 'Current Smoking Frequency', 'Frequency of Vigorous Physical Activity per Week', 'Sleep Duration (hours)', 'Total Fat Intake (grams)', 'Engaged in Vigorous Activity in Past 30 Days']
['Mean Corpuscular Volume (fL)', 'Blood Selenium (µg/L)', 'Blood Mercury (µg/L)', 'Albumin-Creatinine Ratio (mg/g)', 'Ever Told Had Chlamydia', 'Red Blood Cell Count (million cells/uL)', 'Serum Creatinine (mg/dL)', 'Number of Prescription Medications Taken', 'Calculated LDL Cholesterol (mg/dL)', 'Triglycerides (mg/dL)', 'Now Taking Hypertension Medication', 'Blood Lead (µg/dL)', 'Ever Told Had Genital Herpes', 'Diastolic Blood Pressure (mmHg)', 'Hematocrit (%)', 'Doctor Told You Have Diabetes', 'Total Cholesterol (mg/dL)', 'Red Cell Di

In [12]:
# print the top N variables in each lifestyle_factors, health_outcomes, demographics with least missing values
N = 5

percent_missing_lifestyle_factors = {}
for var in lifestyle_factors:
    percent_missing_lifestyle_factors[var] = merged_df[var].isna().sum()/merged_df.shape[0]
percent_missing_lifestyle_factors = dict(sorted(percent_missing_lifestyle_factors.items(), key=lambda item: item[1]))
print(list(percent_missing_lifestyle_factors.keys())[:N])

percent_missing_health_outcomes = {}
for var in health_outcomes:
    percent_missing_health_outcomes[var] = merged_df[var].isna().sum()/merged_df.shape[0]
percent_missing_health_outcomes = dict(sorted(percent_missing_health_outcomes.items(), key=lambda item: item[1]))
print(list(percent_missing_health_outcomes.keys())[:N])

percent_missing_demographics = {}
for var in demographics:
    percent_missing_demographics[var] = merged_df[var].isna().sum()/merged_df.shape[0]
percent_missing_demographics = dict(sorted(percent_missing_demographics.items(), key=lambda item: item[1]))
print(list(percent_missing_demographics.keys())[:N])

['Frequency of Muscle-Strengthening Activities per Week', 'Frequency of Vigorous Physical Activity per Week', 'Calcium Intake (mg)', 'Sugar Intake (grams)', 'Alcohol Intake (grams)']
['Use Any Prescription Medications', 'Weight (kg)', 'Dentition Examination Status', 'Doctor Told You Have Diabetes', 'Body Mass Index (BMI)']
['Full Sample 2-Year Interview Weight', 'Household Size', 'Age of the Participant (years)', 'Examination Month Period', 'Full Sample 2-Year MEC Exam Weight']
