## The goal of this analysis will be to determine the causal relationship between diet and some key health indicators
#### The target audience of this notebook is other data scientists. A separate summary notebook is written for the general public.

In [14]:
import pandas as pd
import numpy as np
from functools import reduce
import pdb
import xport
import glob
import os

In [2]:
# Display the installed pandas version.
pd.__version__

'0.23.4'

In [3]:
# Let pandas display up to 200 rows before truncating printed output.
pd.set_option('display.max_rows', 200)

### All data was gathered from this CDC page: https://wwwn.cdc.gov/nchs/nhanes/search/nnyfs12.aspx

#### Convert all the data from XPT format into dataframes

In [4]:
# Lookup table: XPT file name -> readable dataset name. The readable names are
# the keys used for every dataset dictionary in this notebook.
file_map = {
    "DSBI.XPT": "supplement_blend",
    "DSPI.XPT": "supplement_product",
    "Y_CEX.XPT": "cardiorespiratory_endurance",
    "Y_DR1IFF.XPT": "dietary_interview_individual",
    "Y_DS1IDS.XPT": "dietary_24_hour_individual",
    "Y_DSQIDS.XPT": "dietary_30_days_individual",
    "Y_GMX.XPT": "gross_motor_development",
    "Y_MGX.XPT": "grip_strength",
    "Y_PLX.XPT": "plank",
    "DSII.XPT": "supplement_ingredient",
    "Y_BMX.XPT": "body_measures",
    "Y_CVX.XPT": "cardiovascular_fitness",
    "Y_DR1TOT.XPT": "dietary_interview_total",
    "Y_DS1TOT.XPT": "dietary_24_hours_total",
    "Y_DSQTOT.XPT": "dietary_30_days_total",
    "Y_LMX.XPT": "lower_body_muscle_strength",
    "Y_MPX.XPT": "modified_pull_up",
    "Y_ACQ.XPT": "acculturation",
    "Y_DIQ.XPT": "diabetes",
    "Y_DBQ.XPT": "diet_behavior_nutrition",
    "Y_ECQ.XPT": "early_childhood",
    "Y_HIQ.XPT": "health_insurance",
    "Y_HUQ.XPT": "health_utilization",
    "Y_MCQ.XPT": "medical_conditions",
    "Y_PAQ.XPT": "physical_activity",
    "Y_PFQ.XPT": "physical_functioning",
    "Y_RXQ_RX.XPT": "prescription_medications",
    "Y_RDQ.XPT": "respriratory_health",
    "Y_SMQ.XPT": "smoking",
}

In [5]:
# Load every XPT file found in DATA_DIR into a DataFrame, keyed by the
# readable dataset name from file_map.
#
# The files are globbed inside DATA_DIR instead of calling os.chdir(), so the
# kernel's working directory is left untouched and this cell can be re-run
# without failing on a now-missing relative 'data_files' directory.
DATA_DIR = 'data_files'

dfs = {}
# sorted() makes the load order (and therefore the dict order) deterministic;
# glob itself returns files in arbitrary order.
for file_path in sorted(glob.glob(os.path.join(DATA_DIR, '*.XPT'))):
    # file_map is keyed by bare file name, not by path.
    file_name = os.path.basename(file_path)
    with open(file_path, 'rb') as in_file:
        dfs[file_map[file_name]] = xport.to_dataframe(in_file)


### Separate the data into the independent and dependent variables

#### unused_health_files are exactly that - unused data. The purpose of putting them into a list is for readability.
- supplement_blend: Information about the supplements, not about the participants.
- supplement_product: Information about the supplements, not about the participants.
- supplement_ingredient: Information about the supplements, not about the participants.

In [6]:
# Datasets that are listed in file_map but placed in neither the independent
# nor the dependent group below.
unused_health_files = [
    "supplement_blend",
    "supplement_product",
    "supplement_ingredient",
]

#### Variables from these datasets are thought to be causal variables
- dietary_30_days_individual: Dietary information could have causal links to health problems.
- dietary_interview_individual: Dietary information could have causal links to health problems.
- dietary_24_hour_individual: Dietary information could have causal links to health problems.
- dietary_interview_total: Dietary information could have causal links to health problems.
- dietary_24_hours_total: Dietary information could have causal links to health problems.
- dietary_30_days_total: Dietary information could have causal links to health problems.
- diet_behavior_nutrition: Dietary information could have causal links to health problems.
- acculturation: This data is about languages spoken. Although languages spoken should not have an effect, these variables could be correlated to things like food eaten, cultural values, and ethnicity.
- early_childhood: This has data like mother smoking during pregnancy, birthweight, was child ever overweight, etc. Some of these variables could have long term health impacts.
- health_insurance: Having health insurance should lead to higher utilization and therefore better health, although we will wait to see the results of the analysis.
- health_utilization: Using healthcare should lead to better overall health, although we will wait to see the results of the analysis.
- physical_activity: Exercise could have causal effects on overall health.
- prescription_medications: Medications could have an effect on overall health.
- smoking: Smoking has been known to be bad for health, although we will wait to see the results of the analysis.

In [7]:
# Dataset names whose variables are treated as candidate causal
# (independent) variables.
independent_health_files = [
    "dietary_30_days_individual",
    "dietary_interview_individual",
    "dietary_24_hour_individual",
    "dietary_interview_total",
    "dietary_24_hours_total",
    "dietary_30_days_total",
    "acculturation",
    "diet_behavior_nutrition",
    "early_childhood",
    "health_insurance",
    "health_utilization",
    "physical_activity",
    "prescription_medications",
    "smoking",
]

# Pick the matching frames out of dfs, keeping the dataset name as the key.
independent_dfs = {name: dfs[name] for name in independent_health_files}

#### Variables from these datasets will be used to create overall health metrics
- cardiorespiratory_endurance: Strong cardio is a sign of good health.
- gross_motor_development: Motor development is a sign of good health.
- grip_strength: Grip strength is a sign of good health.
- plank: Plank length is a sign of good health.
- body_measures: Body measures can be indicative of good or bad health.
- cardiovascular_fitness: Strong cardio is a sign of good health.
- lower_body_muscle_strength: Muscle strength is a sign of good health.
- modified_pull_up: Muscle strength is a sign of good health.
- diabetes: Diabetes is a sign of bad health.
- medical_conditions: Medical conditions can be a sign of bad health.
- physical_functioning: Impaired physical functioning can be a sign of bad health.
- respriratory_health: Respiratory data can be a sign of good or bad health.

In [8]:
# Dataset names whose variables will be combined into overall health
# (dependent) metrics.
dependent_health_files = [
    "cardiorespiratory_endurance",
    "gross_motor_development",
    "grip_strength",
    "plank",
    "body_measures",
    "cardiovascular_fitness",
    "lower_body_muscle_strength",
    "modified_pull_up",
    "diabetes",
    "medical_conditions",
    "physical_functioning",
    "respriratory_health",
]

# Pick the matching frames out of dfs, keeping the dataset name as the key.
dependent_dfs = {name: dfs[name] for name in dependent_health_files}

### Examine the dependent variables

#### The goal will be to reduce the dependent health files into a single variable or a small number of variables so that regression and machine learning models can be used to determine the effect of the independent variables on the health of the child.

In [10]:
# Report how many columns each dependent dataset has.
for file_name, df in dependent_dfs.items():
    print(file_name, df.shape[1])

cardiorespiratory_endurance 67
gross_motor_development 57
grip_strength 21
plank 4
body_measures 22
cardiovascular_fitness 33
lower_body_muscle_strength 121
modified_pull_up 4
diabetes 6
medical_conditions 7
physical_functioning 5
respriratory_health 3


In [11]:
# List the column names of each dependent dataset, one dataset per paragraph.
for file_name, df in dependent_dfs.items():
    print('{} \n {} \n'.format(file_name, df.columns))

cardiorespiratory_endurance 
 Index(['SEQN', 'CEDEXSTS', 'CEDEXCMT', 'CEDEXLEN', 'CEDTOTEX', 'CEDMAXHR',
       'CEDENDHR', 'CEXPROT', 'CEXTEM1', 'CEXHUM1', 'CEXTEM2', 'CEXHUM2',
       'CEXWS', 'CEXWG', 'CEXWTIM', 'CEXWHR', 'CEXS1S', 'CEXS1G', 'CEXS1TIM',
       'CEXS1HR', 'CEXS2S', 'CEXS2G', 'CEXS2TIM', 'CEXS2HR', 'CEXS3S',
       'CEXS3G', 'CEXS3TIM', 'CEXS3HR', 'CEXS4S', 'CEXS4G', 'CEXS4TIM',
       'CEXS4HR', 'CEXS5S', 'CEXS5G', 'CEXS5TIM', 'CEXS5HR', 'CEXS6S',
       'CEXS6G', 'CEXS6TIM', 'CEXS6HR', 'CEXS7S', 'CEXS7G', 'CEXS7TIM',
       'CEXS7HR', 'CEXS8S', 'CEXS8G', 'CEXS8TIM', 'CEXS8HR', 'CEXS9S',
       'CEXS9G', 'CEXS9TIM', 'CEXS9HR', 'CEXS10S', 'CEXS10G', 'CEXS10TI',
       'CEXS10HR', 'CEXRS', 'CEXRG', 'CEXRTIM', 'CEXR1HR', 'CEXR2HR',
       'CEX220S1', 'CEX220S2', 'CEX220S3', 'CEX220S4', 'CEX220S5', 'CEX220J'],
      dtype='object') 

gross_motor_development 
 Index(['SEQN', 'TGMDSTAT', 'TGQPRHA', 'TGQPRFO', 'TGQR1S', 'TGQR2S', 'TGQR3S',
       'TGQR4S', 'TGQG1S', 'TGQG2S

In [13]:
# Report how many columns each independent dataset has.
for file_name, df in independent_dfs.items():
    print(file_name, df.shape[1])

dietary_30_days_individual 85
dietary_interview_individual 82
dietary_24_hour_individual 46
dietary_interview_total 99
dietary_24_hours_total 46
dietary_30_days_total 39
acculturation 5
diet_behavior_nutrition 12
early_childhood 7
health_insurance 13
health_utilization 4
physical_activity 96
prescription_medications 7
smoking 4


In [12]:
# List the column names of each independent dataset, one dataset per paragraph.
for file_name, df in independent_dfs.items():
    print('{} \n {} \n'.format(file_name, df.columns))

dietary_30_days_individual 
 Index(['SEQN', 'DSDSUPID', 'DSDSUPP', 'DSDANTA', 'DSD070', 'DSDMTCH', 'DSD090',
       'DSD103', 'DSD122Q', 'DSD122U', 'DSDACTSS', 'DSDDAY1', 'DSQ124',
       'DSQ128A', 'DSQ128B', 'DSQ128C', 'DSQ128D', 'DSQ128E', 'DSQ128F',
       'DSQ128G', 'DSQ128H', 'DSQ128I', 'DSQ128J', 'DSQ128K', 'DSQ128L',
       'DSQ128M', 'DSQ128N', 'DSQ128O', 'DSQ128P', 'DSQ128Q', 'DSQ128R',
       'DSD128T', 'DSD128U', 'DSD128V', 'DSD128W', 'DSD128X', 'DSD128Z',
       'DSD128AA', 'DSD128BB', 'DSD128DD', 'DSD128EE', 'DSD128FF', 'DSD128GG',
       'DSD128HH', 'DSD128II', 'DSD128JJ', 'DSD128KK', 'DSD128LL', 'DSD128MM',
       'DSQ128S', 'RXQ215A', 'DSQIKCAL', 'DSQIPROT', 'DSQICARB', 'DSQISUGR',
       'DSQIFIBE', 'DSQITFAT', 'DSQISFAT', 'DSQIMFAT', 'DSQIPFAT', 'DSQICHOL',
       'DSQILYCO', 'DSQILZ', 'DSQIVB1', 'DSQIVB2', 'DSQINIAC', 'DSQIVB6',
       'DSQIFA', 'DSQIFDFE', 'DSQICHL', 'DSQIVB12', 'DSQIVC', 'DSQIVK',
       'DSQIVD', 'DSQICALC', 'DSQIPHOS', 'DSQIMAGN', 'DSQIIRON', 'D

#### There are too many variables at the current moment to make any useful conclusions. The next step will be going through all the dependent variables and determining which ones are worth keeping and which ones can be ignored. As these are the dependent variables we are looking at and we are trying to come up with a "health score", things like regression and PCA are not of much use. 

### Cardiorespiratory Endurance

### nnyfs_diabetes.csv
#### The purpose of these dependent variables is to get a simplified look at the child's health through a single variable or a small set of variables. Therefore only the high-level condition variables were kept.
- 'SEQ': sequence ID - kept
- 'IQ01': Doctor said you have diabetes - kept
- 'ID04': Age first told you had diabetes - dropped
- 'IQ16': Doctor said you have prediabetes - kept
- 'IQ05': Take insulin now - dropped
- 'IQ070': Take diabetic pills - dropped

In [None]:
dependent_dfs['nnyfs_diabetes.csv'] = dependent_dfs['nnyfs_diabetes.csv'][['SEQ', 'IQ01', 'IQ16']]
dependent_dfs['nnyfs_diabetes.csv'].columns = ['ID', 'diabetic', 'pre-diabetic']

In [None]:
np.unique(dependent_dfs['nnyfs_diabetes.csv']['pre-diabetic'])

### nnyfs_medical_conditions.csv
#### The goal will be to create a feature for 'has condition' and another feature for 'severity of condition'. All variables kept will be used in one of those features.
- 'SEQ': Sequence ID - kept
- 'CQ01': Told you have asthma - kept
- 'CQ03': Still have asthma - kept
- 'CQ04': Had asthma attack in past year - kept
- 'CQ05': Medication for asthma - dropped
- 'CQ14': Trouble seeing even with glasses - kept
- 'CQ149': Periods started - dropped

In [None]:
dependent_dfs['nnyfs_medical_conditions.csv'] = dependent_dfs['nnyfs_medical_conditions.csv'][
    ['SEQ', 'CQ01', 'CQ03', 'CQ04', 'CQ14']
]
dependent_dfs['nnyfs_medical_conditions.csv'].columns = ['ID', 'had_has_asthma', 'has_asthma', 'recent_asthma_attack', 'vision_problems']

In [None]:
df = dependent_dfs['nnyfs_medical_conditions.csv']

In [None]:
np.unique(df.had_has_asthma)

In [None]:
df = dependent_dfs['nnyfs_medical_conditions.csv']
df['had_asthma'] = df[(df.had_has_asthma == '1') & (df.has_asthma == '2')]

In [None]:
df['had_asthma'].value_counts()

In [None]:
# Print each frame's shape next to its file name.
# NOTE(review): `dataframes` is not defined in any cell shown above - confirm
# where it is created (the frames loaded from XPT live in `dfs`).
for file_name, frame in dataframes.items():
    print(file_name + ":", frame.shape)

In [None]:
# Print, frame by frame, whether a 'SEQ' column is present.
# NOTE(review): `dataframes` is not defined in any cell shown above, and the
# XPT column lists printed earlier show the participant id as 'SEQN', not
# 'SEQ' - confirm which name these frames actually use.
for frame in dataframes.values():
    print('SEQ' in frame.columns)

In [None]:
# Outer-join every frame in `dataframes` on the 'SEQ' column, folding them
# left to right into one wide frame.
# NOTE(review): `dataframes` is not defined in any cell shown above, and the
# XPT column lists printed earlier show the participant id as 'SEQN', not
# 'SEQ' - confirm the join key before running.
df = reduce(lambda left, right: pd.merge(left, right, on='SEQ', how='outer'), dataframes.values())

In [None]:
# (rows, columns) of df.
df.shape

In [None]:
# Show every column name of df as a plain Python list.
df.columns.tolist()

In [None]:
# Give the diet-behavior columns readable names.
#
# Columns are matched by survey code rather than by position: assigning a
# positional list to df.columns fails when df has any other number of columns
# and silently mislabels data if the column order ever differs.
# Note that df is rebound to a renamed copy; the original frame object is not
# modified in place.
COLUMN_LABELS = {
    'SEQN': 'ID',
    'DBQ360': 'In Grade School',
    'DBQ370': 'School Lunch Offered',
    'DBD381': 'Times per Week Eat School Lunch',
    'DBQ390': 'School Lunch Price',
    'DBQ400': 'School Breakfast Offered',
    'DBD411': 'Times per Week Eat School Breakfast',
    'DBQ421': 'School Breakfast Price',
    'DBD895': 'Meals per Week Not Home Prepared',
    'DBD900': 'Meals per Week Fast Food',
    'DBD905': 'Ready to Eat Food Past 30 Days',
    'DBD910': 'Frozen Meals Past 30 Days',
}

# Fail loudly if an expected code is absent instead of renaming only part of
# the frame.
missing_codes = sorted(code for code in COLUMN_LABELS if code not in df.columns)
if missing_codes:
    raise KeyError('df is missing expected columns: {}'.format(missing_codes))

df = df.rename(columns=COLUMN_LABELS)