In [9]:
import os
import sys

import numpy as np
import pandas as pd

import sklearn

In [10]:
demographic = pd.read_csv('raw_data/demographic.csv', encoding='ISO-8859-1')
diet = pd.read_csv('raw_data/diet.csv', encoding='ISO-8859-1')
examination = pd.read_csv('raw_data/examination.csv', encoding='ISO-8859-1')
labs = pd.read_csv('raw_data/labs.csv', encoding='ISO-8859-1')
medication = pd.read_csv('raw_data/medications.csv', encoding='ISO-8859-1')
questionnaire = pd.read_csv('raw_data/questionnaire.csv', encoding='ISO-8859-1')

# sex, race, age(age group)
# health outcome (examination, labs)
# lifestyle (diet, questionnaire)
variables = pd.read_csv('variables.csv')
variables_lst = variables['Variable'].tolist()

In [11]:
# print all variables in varaibles_lst that are in none of the dataframes
for var in variables_lst:
    if var not in demographic.columns and var not in diet.columns and var not in examination.columns and var not in labs.columns and var not in medication.columns and var not in questionnaire.columns:
        print(var)

In [12]:
# select columns in each dataset that are in the variables list (may not in it), join then by SEQN in each dataset

demographic_subset = demographic[["SEQN"]+list(set(variables_lst).intersection(demographic.columns))]
diet_subset = diet[["SEQN"]+list(set(variables_lst).intersection(diet.columns))]
examination_subset = examination[["SEQN"]+list(set(variables_lst).intersection(examination.columns))]
labs_subset = labs[["SEQN"]+list(set(variables_lst).intersection(labs.columns))]
medication_subset = medication[["SEQN"]+list(set(variables_lst).intersection(medication.columns))]
questionnaire_subset = questionnaire[["SEQN"]+list(set(variables_lst).intersection(questionnaire.columns))]
# merge all datasets
merged_df = demographic_subset.merge(diet_subset, on='SEQN').merge(examination_subset, on='SEQN').merge(labs_subset, on='SEQN').merge(medication_subset, on='SEQN').merge(questionnaire_subset, on='SEQN').drop_duplicates()

# replace all categorical variables with the categroies value in the Notes column in the variables.csv
# for example: DMDEDUC2,Education Level (Adults 20+),Demographics,Categorical,"Less than 9th grade, 9–11th grade, High school/GED, Some college/AA degree, College graduate or above"
# then 1 is "Less than 9th grade", 2 is "9–11th grade" and so on
# categorical variables
categorical_variables = variables[variables['Data Type'] == 'Categorical']
for i in range(1, len(merged_df.columns)):
    if merged_df.columns[i] in categorical_variables['Variable'].tolist():
        categories = categorical_variables[categorical_variables['Variable'] == merged_df.columns[i]]['Notes'].values[0].split(', ')
        # convert any value out of range to be NaN
        merged_df[merged_df.columns[i]] = merged_df[merged_df.columns[i]].apply(lambda x: categories[int(x)-1] if not pd.isnull(x) and int(x) <= len(categories) else np.nan)

# replace column names each column names' corresponding Description
merged_df.columns = ["Respondent"] + variables.set_index('Variable').loc[merged_df.columns[1:]]['Description'].tolist()  
merged_df

Unnamed: 0,Respondent,Marital Status,Household Size,Age of the Participant (years),Full Sample 2-Year MEC Exam Weight,Full Sample 2-Year Interview Weight,Gender of the Participant,Examination Month Period,Pregnancy Status at Examination,Education Level (Adults 20+),...,Ever Had at Least 12 Alcoholic Drinks in One Year,Ever Told Had Genital Herpes,Time Since Last Dental Visit,Time Spent Watching TV or Videos (minutes/day),Frequency of Vigorous Physical Activity per Week,Current Smoking Frequency,Smoked at Least 100 Cigarettes in Life,Frequency of Muscle-Strengthening Activities per Week,Doctor Told You Have Diabetes,Engaged in Vigorous Activity in Past 30 Days
0,73557,Separated,3,69,13481.042095,13281.237386,Male,November 1–April 30,,High school/GED,...,,,<6 months,2.0,2.0,Not at all,Yes,8.0,Yes,No
2,73558,Married,4,54,24471.769625,23682.057386,Male,November 1–April 30,,High school/GED,...,,Yes,Never,2.0,4.0,Some days,Yes,8.0,Yes,No
6,73559,Married,2,72,57193.285376,57214.803319,Male,May 1–October 31,,Some college/AA degree,...,,,>1 year,2.0,4.0,Not at all,Yes,0.0,Yes,No
11,73560,,4,9,55766.512438,55201.178592,Male,November 1–April 30,,,...,,,6–12 months,,3.0,,,2.0,No,
12,73561,Married,2,73,65541.871229,63709.667069,Female,November 1–April 30,,College graduate or above,...,,,6–12 months,2.0,1.0,,No,1.0,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19575,83727,Never married,5,26,28115.568221,27141.631824,Male,May 1–October 31,,College graduate or above,...,,Yes,Never,2.0,2.0,,No,1.0,No,No
19576,83728,,4,2,16512.138781,15274.475360,Female,May 1–October 31,,,...,,,>1 year,,3.0,,,0.0,No,
19577,83729,Divorced,1,42,26902.344381,24122.248900,Female,May 1–October 31,Not pregnant,College graduate or above,...,,,6–12 months,2.0,5.0,,No,1.0,No,No
19578,83730,,4,7,26686.025573,25521.878546,Male,November 1–April 30,,,...,,,>1 year,,1.0,,,2.0,No,


In [13]:
# check any col have nan > 50%
nan_count = merged_df.isna().sum()
nan_count = nan_count[nan_count > 0]
nan_count = nan_count/merged_df.shape[0]
nan_count = nan_count[nan_count > 0.5]
nan_count

Pregnancy Status at Examination                      0.876185
Urinary Albumin (mg/L)                               0.729542
Urine Creatinine (mg/dL)                             0.725874
Calculated LDL Cholesterol (mg/dL)                   0.683583
Triglycerides (mg/dL)                                0.679405
Number of Prescription Medications Taken             0.595435
Ever Told Had Chlamydia                              0.823703
Ever Had at Least 12 Alcoholic Drinks in One Year    0.834200
Ever Told Had Genital Herpes                         0.824009
Current Smoking Frequency                            0.746255
dtype: float64

In [14]:
# save processed data
merged_df.to_csv('processed_data.csv', index=False)

In [15]:
# get list of lifestyle variables health outcomes, demographics 
lifestyle_factors = list(set(merged_df.columns).intersection(variables[variables['Category'] == 'Lifestyle Factors']['Description'].tolist()))
print(lifestyle_factors)
health_outcomes = list(set(merged_df.columns).intersection(variables[variables['Category'] == 'Health Outcomes']['Description'].tolist()))
print(health_outcomes)
demographics = list(set(merged_df.columns).intersection(variables[variables['Category'] == 'Demographics']['Description'].tolist()))
print(demographics)

['Frequency of Vigorous Physical Activity per Week', 'Current Smoking Frequency', 'Ever Had at Least 12 Alcoholic Drinks in One Year', 'Frequency of Muscle-Strengthening Activities per Week', 'Calcium Intake (mg)', 'Time Spent Watching TV or Videos (minutes/day)', 'Sleep Duration (hours)', 'Alcohol Intake (grams)', 'Sugar Intake (grams)', 'Total Fat Intake (grams)', 'Smoked at Least 100 Cigarettes in Life', 'Engaged in Vigorous Activity in Past 30 Days']
['Total Cholesterol (mg/dL)', 'Hematocrit (%)', 'Calculated LDL Cholesterol (mg/dL)', 'Time Since Last Dental Visit', 'Urinary Albumin (mg/L)', 'Blood Lead (µg/dL)', 'Use Any Prescription Medications', 'Red Cell Distribution Width (%)', 'Urine Creatinine (mg/dL)', 'Serum Creatinine (mg/dL)', 'Ever Told Had Chlamydia', 'Blood Mercury (µg/L)', 'Body Mass Index (BMI)', 'White Blood Cell Count (10^3 cells/uL)', 'Lymphocyte Percentage (%)', 'Doctor Told You Have Diabetes', 'Hemoglobin (g/dL)', 'Total Energy Intake (kcal)', 'Diastolic Blood 

In [16]:
# print the top N variables in each lifestyle_factors, health_outcomes, demographics with least missing values
N = 5

percent_missing_lifestyle_factors = {}
for var in lifestyle_factors:
    percent_missing_lifestyle_factors[var] = merged_df[var].isna().sum()/merged_df.shape[0]
percent_missing_lifestyle_factors = dict(sorted(percent_missing_lifestyle_factors.items(), key=lambda item: item[1]))
print(list(percent_missing_lifestyle_factors.keys())[:N])

percent_missing_health_outcomes = {}
for var in health_outcomes:
    percent_missing_health_outcomes[var] = merged_df[var].isna().sum()/merged_df.shape[0]
percent_missing_health_outcomes = dict(sorted(percent_missing_health_outcomes.items(), key=lambda item: item[1]))
print(list(percent_missing_health_outcomes.keys())[:N])

percent_missing_demographics = {}
for var in demographics:
    percent_missing_demographics[var] = merged_df[var].isna().sum()/merged_df.shape[0]
percent_missing_demographics = dict(sorted(percent_missing_demographics.items(), key=lambda item: item[1]))
print(list(percent_missing_demographics.keys())[:N])

['Frequency of Vigorous Physical Activity per Week', 'Frequency of Muscle-Strengthening Activities per Week', 'Calcium Intake (mg)', 'Alcohol Intake (grams)', 'Sugar Intake (grams)']
['Use Any Prescription Medications', 'Weight (kg)', 'Dentition Examination Status', 'Doctor Told You Have Diabetes', 'Body Mass Index (BMI)']
['Household Size', 'Full Sample 2-Year Interview Weight', 'Examination Month Period', 'Full Sample 2-Year MEC Exam Weight', 'Gender of the Participant']
