In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [2]:
#Root folder of the downloaded NHANES CSV files; the glob patterns below are built from it.
#Use the './' prefix when running from run_notebooks.py
#Use the '../' prefix when running directly from Jupyter Notebooks:
#file_beg = '../NHANES-Downloader/data/csv_data/'

file_beg = './NHANES-Downloader/data/csv_data/'

In [3]:
#Collect the CSV paths of every survey cycle (one list of paths per cycle)
cycle_patterns = [
    '1999-2000/*/*.csv',
    '2001-2002/*/*.csv',
    '2003-2004/*/*.csv',
    '2005-2006/*/*.csv',
    '2007-2008/*/*.csv',
    '2009-2010/*/*.csv',
    '2011-2012/*/*.csv',
    '2013-2014/*/*.csv',
    '2015-2016/*/*.csv',
]
(files1, files2, files3, files4, files5,
 files6, files7, files8, files9) = [glob.glob(file_beg + pattern) for pattern in cycle_patterns]

In [3]:
#Add files into a list
#One entry per survey cycle, ordered from 1999-2000 (files1) to 2015-2016 (files9)
file_list = [files1, files2, files3, files4, files5, 
          files6, files7, files8, files9]

In [4]:
#Sort each cycle's paths in place, then key every path by its position in the sorted order
for cycle_files in file_list:
    cycle_files.sort()
file_list_dict = [dict(enumerate(cycle_files)) for cycle_files in file_list]

In [5]:
#Position, within each cycle's sorted file list, of the file to load (one entry per cycle).
#NOTE(review): presumably position 0 is the Demographics file in every cycle; this depends
#on how the downloaded paths sort - confirm before adding or renaming data folders.
demo_indx = [0, 0, 0, 0, 0, 0, 0, 0, 0]

In [6]:
#Load one CSV per survey cycle, keyed by the cycle's position in file_list (0 = first cycle).
#A plain dict is used on purpose: asking for a cycle that was never loaded raises KeyError
#instead of silently handing back the integer 0, as defaultdict(int) would.
dfs = {
    i: pd.read_csv(file_list_dict[i][demo_indx[i]])
    for i in range(len(demo_indx))
}

## Important Variables

In [7]:
#Columns kept from each cycle's file. The four lists have the same length and order,
#so one shared set of column names can be assigned to all of them by position later on.
#EDIT: Removed INDHHINC from each list (it sat after DMDHHSIZ and before INDFMINC)
#EDIT: Removed INDHHIN2 from each list (it sat after DMDHHSIZ and before INDFMIN2)
#EDIT: Removed DMDAGE, which sat before DMDHREDU and the weight (WT) variables
#NOTE(review): "DMDAGE" is presumably DMDHRAGE (HH reference person age) - confirm.

#1999-2002 (4-year weights: WTINT4YR / WTMEC4YR)
var_names = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN", "DMDCITZN", "DMDHHSIZ", "INDFMINC", "DMDHREDU", "WTINT4YR", "WTMEC4YR"]

#2003-2006 (2-year weights: WTINT2YR / WTMEC2YR)
#RIDRETH1 is the only race/ethnicity variable in the 2005-2006 Demographics data file. The Demographics files that were released for NHANES 1999-2004 had a variable called 'RIDRETH2' that provided an analytic link to the NHANES III race/ethnicity variable.
var_names1 = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN", "DMDCITZN", "DMDHHSIZ", "INDFMINC", "DMDHREDU", "WTINT2YR", "WTMEC2YR"]

#2007-2010
#DMDBORN becomes DMDBORN2 and has to be recategorized
#NOTE(review): the original note named DMDHRBRN / DMDHBRN, but the column selected here is DMDBORN2 - confirm.
#INDFMINC becomes INDFMIN2 - Recategorize 14 and 15 to 11 to match 1999-2006
#INDHHINC becomes INDHHIN2 - Recategorize 14 and 15 to 11 to match 1999-2006 (no longer selected, see EDIT notes above)
var_names2 = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN2", "DMDCITZN", "DMDHHSIZ", "INDFMIN2", "DMDHREDU", "WTINT2YR", "WTMEC2YR"]

#2011-2016
#DMDBORN4 has only 2 categories, so the country-of-birth columns of all previous cycles are recategorized to match it
#NOTE(review): the original note named DMDHRBRN4, but the column selected here is DMDBORN4 - confirm.
var_names3 = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN4", "DMDCITZN", "DMDHHSIZ", "INDFMIN2", "DMDHREDU", "WTINT2YR", "WTMEC2YR"]

In [8]:
#To display all columns in Jupyter Notebooks
pd.set_option('display.max_columns', 500)

## Functions: Recategorize values, Count Values, Drop Rows

In [9]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Recode the values of one column of ``df``, modifying ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the recoded column is written back onto this object.
    name : str
        Column whose values are recoded.
    replace_dict : dict
        Mapping of old value -> new value, handed to ``Series.replace``.
        Values that are not keys of the mapping are left as they are.

    Returns
    -------
    None
    """
    # Assign the recoded column back instead of calling replace(inplace=True)
    # on ``df[name]``: the in-place form operates on an intermediate Series and
    # is not guaranteed to reach ``df`` (it does not under pandas Copy-on-Write).
    df[name] = df[name].replace(to_replace=replace_dict)
    
#Count values function
def count_vals(df, name):
    """Print how many distinct SEQN ids fall under each value of column ``name``,
    followed by the number of missing (NaN) entries in that column.

    Nothing is returned; the summary goes to standard output only.
    """
    distinct_ids = df.groupby(name)['SEQN'].nunique()
    missing = df[name].isnull().sum()
    print(distinct_ids, "\n\n", "NaN: ", missing)
    
#Drop rows that include certain values
def drop_rows(df, name, val_list):
    """Remove from ``df``, in place, every row whose ``name`` value appears in ``val_list``.

    Rows are selected with ``Series.isin`` and removed by their index labels.
    Returns None.
    """
    flagged = df[name].isin(val_list)
    df.drop(index=df.index[flagged.values], inplace=True)

## Make a copy of the dataframes

In [10]:
df0 = dfs[0][var_names].copy() #1999-2000
df1 = dfs[1][var_names].copy() #2001-2002
df2 = dfs[2][var_names1].copy() #2003-2004
df3 = dfs[3][var_names1].copy() #2005-2006
df4 = dfs[4][var_names2].copy() #2007-2008
df5 = dfs[5][var_names2].copy() #2009-2010
df6 = dfs[6][var_names3].copy() #2011-2012
df7 = dfs[7][var_names3].copy() #2013-2014
df8 = dfs[8][var_names3].copy() #2015-2016

## Weight for 18-years: 1999 - 2016

In [11]:
#Create column for weight
def reweight(df, col_name, year_wt, weight):
    """Store in ``df[col_name]`` the ``year_wt`` column multiplied by ``weight``.

    ``df`` is modified in place (the column is added or overwritten); returns None.
    """
    scaled = weight * df[year_wt]
    df[col_name] = scaled

In [12]:
#df0 and df1 (1999-2000, 2001-2002): 4-year MEC weight scaled by 2/9
for cycle_df in (df0, df1):
    reweight(cycle_df, 'MEC18YR', 'WTMEC4YR', 2/9)

#df2 to df8 (2003-2004 through 2015-2016): 2-year MEC weight scaled by 1/9
for cycle_df in (df2, df3, df4, df5, df6, df7, df8):
    reweight(cycle_df, 'MEC18YR', 'WTMEC2YR', 1/9)

## Remove 2-yr and 4-yr weights

In [13]:
#Drop columns
def drop_cols(df, cols):
    """Delete the columns listed in ``cols`` from ``df`` in place; returns None."""
    df.drop(columns=cols, inplace=True)

In [14]:
#1999-2002 frames carry the 4-year weight columns
for cycle_df in (df0, df1):
    drop_cols(cycle_df, ['WTINT4YR','WTMEC4YR'])

In [15]:
#2003-2016 frames carry the 2-year weight columns
for cycle_df in (df2, df3, df4, df5, df6, df7, df8):
    drop_cols(cycle_df, ['WTINT2YR','WTMEC2YR'])

In [16]:
df0[:5]

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,MEC18YR
0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,990.268132
1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,3408.044382
2,3.0,2.0,10.0,3.0,3.0,2.0,4.0,6.0,4.0,4724.103694
3,4.0,1.0,1.0,4.0,1.0,1.0,7.0,3.0,3.0,1013.864237
4,5.0,1.0,49.0,3.0,1.0,1.0,3.0,11.0,4.0,10219.103963


## Recategorize values

In [17]:
#Recategorize DMDBORN to 2 categories (1999-2000 through 2005-2006)
for cycle_df in (df0, df1, df2, df3):
    recategorize(cycle_df, 'DMDBORN', {3:2, 7:77, 9:99})

#Recategorize DMDBORN2 to 2 categories (2007-2008 and 2009-2010)
for cycle_df in (df4, df5):
    recategorize(cycle_df, 'DMDBORN2', {4:2, 5:2, 7:77, 9:99})

#Recategorize INDFMIN2 to the INDFMINC coding (2007-2008 through 2015-2016)
#The matching INDHHIN2 -> INDHHINC recode ({14:11, 15:11}) is not applied:
#INDHHIN2 is no longer among the selected columns.
for cycle_df in (df4, df5, df6, df7, df8):
    recategorize(cycle_df, 'INDFMIN2', {14:11, 15:11})

## Rename columns 1999 - 2016

In [18]:
#New column names
col_names = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN4", "DMDCITZN", "DMDHHSIZ", "INDFMINC", "DMDHREDU", "MEC18YR"]

In [19]:
#Give every cycle the shared column names; the names are assigned by position
for cycle_df in (df0, df1, df2, df3, df4, df5, df6, df7, df8):
    cycle_df.columns = col_names

## Append years 1999 - 2016

In [20]:
years = ["1999-2000","2001-2002","2003-2004","2005-2006","2007-2008", 
        "2009-2010","2011-2012","2013-2014","2015-2016"]

In [21]:
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8]

In [22]:
#Tag every row with the survey cycle its frame belongs to
for position, cycle_df in enumerate(frames):
    cycle_df["Year"] = years[position]

In [23]:
result_1999_2016 = pd.concat(frames, keys = years)

In [24]:
result_1999_2016_cleaned = result_1999_2016.copy()

In [25]:
#Getting rows by keys
#result_1999_2016.loc[years[0]][:5]

In [26]:
len(result_1999_2016)

92062

## Count values for each column

In [27]:
len(col_names)

10

In [28]:
#RIAGENDR - Gender of the sample person
count_vals(result_1999_2016, col_names[1])

RIAGENDR
1.0    45336
2.0    46726
Name: SEQN, dtype: int64 

 NaN:  0


In [29]:
#RIDAGEYR - Age at Screening Adjudicated
#count_vals(result_1999_2016, col_names[2])

In [30]:
#RIDRETH1 - Race/Ethnicity
count_vals(result_1999_2016, col_names[3])

RIDRETH1
1.0    21082
2.0     7474
3.0    34282
4.0    21529
5.0     7695
Name: SEQN, dtype: int64 

 NaN:  0


In [31]:
#DMDBORN4 - Country of Birth
count_vals(result_1999_2016, col_names[4])

DMDBORN4
1.0     75542
2.0     16466
77.0       29
99.0        9
Name: SEQN, dtype: int64 

 NaN:  16


In [32]:
#DMDCITZN - Citizenship status
count_vals(result_1999_2016, col_names[5])

DMDCITZN
1.0    82224
2.0     9649
7.0      111
9.0       47
Name: SEQN, dtype: int64 

 NaN:  31


In [33]:
#DMDHHSIZ - Total number of people in the Household
count_vals(result_1999_2016, col_names[6])

DMDHHSIZ
1.0     7408
2.0    17207
3.0    16162
4.0    19333
5.0    14862
6.0     8027
7.0     9063
Name: SEQN, dtype: int64 

 NaN:  0


In [34]:
#INDHHINC - Annual Household Income
#count_vals(result_1999_2016, col_names[7])

In [35]:
#INDFMINC - Annual Family Income
count_vals(result_1999_2016, col_names[7])

INDFMINC
1.0      3914
2.0      5370
3.0      7891
4.0      7223
5.0      7733
6.0     10759
7.0      8165
8.0      6729
9.0      4796
10.0     3878
11.0    17639
12.0     2300
13.0     1404
77.0     1647
99.0     1302
Name: SEQN, dtype: int64 

 NaN:  1312


In [36]:
#DMDHRAGE - HH Ref Person Age
#count_vals(result_1999_2016, col_names[9])

In [37]:
#DMDHREDU - HH reference person's education level
count_vals(result_1999_2016, col_names[8])

DMDHREDU
1.0    11069
2.0    15178
3.0    20841
4.0    23868
5.0    17656
7.0       65
9.0      372
Name: SEQN, dtype: int64 

 NaN:  3013


In [38]:
len(result_1999_2016)

92062

In [39]:
result_1999_2016_cleaned[:5]

Unnamed: 0,Unnamed: 1,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN4,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,MEC18YR,Year
1999-2000,0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,990.268132,1999-2000
1999-2000,1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,3408.044382,1999-2000
1999-2000,2,3.0,2.0,10.0,3.0,2.0,2.0,4.0,6.0,4.0,4724.103694,1999-2000
1999-2000,3,4.0,1.0,1.0,4.0,1.0,1.0,7.0,3.0,3.0,1013.864237,1999-2000
1999-2000,4,5.0,1.0,49.0,3.0,1.0,1.0,3.0,11.0,4.0,10219.103963,1999-2000


## Remove rows with missing values:

In [40]:
#Values that disqualify a row, listed per column and applied in this order
invalid_codes = [
    (col_names[1], [np.nan]),
    (col_names[2], [np.nan]),
    (col_names[3], [np.nan]),
    (col_names[4], [np.nan, 77, 99]),
    (col_names[5], [np.nan, 7, 9]),
    (col_names[6], [np.nan]),
    (col_names[7], [np.nan, 12, 13, 77, 99]),
    (col_names[8], [np.nan, 7, 9]),
]
for column, codes in invalid_codes:
    drop_rows(result_1999_2016_cleaned, column, codes)

In [41]:
len(result_1999_2016_cleaned)

81635

In [42]:
result_1999_2016_cleaned[:5]

Unnamed: 0,Unnamed: 1,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN4,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,MEC18YR,Year
1999-2000,0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,990.268132,1999-2000
1999-2000,1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,3408.044382,1999-2000
1999-2000,2,3.0,2.0,10.0,3.0,2.0,2.0,4.0,6.0,4.0,4724.103694,1999-2000
1999-2000,3,4.0,1.0,1.0,4.0,1.0,1.0,7.0,3.0,3.0,1013.864237,1999-2000
1999-2000,4,5.0,1.0,49.0,3.0,1.0,1.0,3.0,11.0,4.0,10219.103963,1999-2000


## See if missing values have been correctly removed:

In [43]:
count_vals(result_1999_2016_cleaned, col_names[1])

RIAGENDR
1.0    40388
2.0    41247
Name: SEQN, dtype: int64 

 NaN:  0


In [44]:
count_vals(result_1999_2016_cleaned, col_names[2])

RIDAGEYR
5.397605e-79    3734
1.000000e+00    2463
2.000000e+00    2503
3.000000e+00    1688
4.000000e+00    1805
5.000000e+00    1617
6.000000e+00    1695
7.000000e+00    1672
8.000000e+00    1680
9.000000e+00    1619
1.000000e+01    1603
1.100000e+01    1681
1.200000e+01    1844
1.300000e+01    1847
1.400000e+01    1898
1.500000e+01    1751
1.600000e+01    1861
1.700000e+01    1721
1.800000e+01    1650
1.900000e+01    1586
2.000000e+01     761
2.100000e+01     762
2.200000e+01     793
2.300000e+01     793
2.400000e+01     768
2.500000e+01     752
2.600000e+01     784
2.700000e+01     719
2.800000e+01     738
2.900000e+01     801
                ... 
5.600000e+01     602
5.700000e+01     547
5.800000e+01     501
5.900000e+01     524
6.000000e+01     874
6.100000e+01     801
6.200000e+01     775
6.300000e+01     726
6.400000e+01     664
6.500000e+01     675
6.600000e+01     638
6.700000e+01     574
6.800000e+01     564
6.900000e+01     523
7.000000e+01     621
7.100000e+01     549
7.20

In [45]:
count_vals(result_1999_2016_cleaned, col_names[3])

RIDRETH1
1.0    18092
2.0     6348
3.0    31387
4.0    19149
5.0     6659
Name: SEQN, dtype: int64 

 NaN:  0


In [46]:
count_vals(result_1999_2016_cleaned, col_names[4])

DMDBORN4
1.0    67867
2.0    13768
Name: SEQN, dtype: int64 

 NaN:  0


In [47]:
count_vals(result_1999_2016_cleaned, col_names[5])

DMDCITZN
1.0    73619
2.0     8016
Name: SEQN, dtype: int64 

 NaN:  0


In [48]:
count_vals(result_1999_2016_cleaned, col_names[6])

DMDHHSIZ
1.0     6804
2.0    15449
3.0    14513
4.0    17223
5.0    13214
6.0     7031
7.0     7401
Name: SEQN, dtype: int64 

 NaN:  0


In [49]:
count_vals(result_1999_2016_cleaned, col_names[7])

INDFMINC
1.0      3702
2.0      5199
3.0      7651
4.0      6992
5.0      7490
6.0     10516
7.0      7977
8.0      6521
9.0      4688
10.0     3791
11.0    17108
Name: SEQN, dtype: int64 

 NaN:  0


In [50]:
count_vals(result_1999_2016_cleaned, col_names[8])

DMDHREDU
1.0     9714
2.0    13787
3.0    19160
4.0    22351
5.0    16623
Name: SEQN, dtype: int64 

 NaN:  0


In [51]:
before = len(result_1999_2016)
before

92062

In [52]:
after = len(result_1999_2016_cleaned)
after

81635

In [53]:
#Number of removed rows divided by the number of rows that remain.
#NOTE(review): if the share of the original rows that was dropped is what is wanted,
#the denominator should be `before` - confirm the intent.
(before-after)/after

0.1277270778465119

## MongoDB Insertion

In [54]:
#Import MongoClient
from pymongo import MongoClient

In [55]:
#Create a MongoClient to run the MongoDB instance
client = MongoClient("localhost", 27017)

In [56]:
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [57]:
#Creating a database
db = client['NHANES']
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [58]:
db.list_collection_names()

['smqfam',
 'smq',
 'mcq',
 'hiq',
 'demo',
 'bpq',
 'huq',
 'paq',
 'whq',
 'alq',
 'diq']

In [59]:
#Creating a collection
demo = db.demo

In [60]:
#If the collection already exists, drop it so the insert further down starts from an empty collection.
#(The collection listing that used to follow the drop was removed: inside the `if` its
#result was never displayed, so it only cost an extra round trip to the server.)
if 'demo' in db.list_collection_names():
    demo.drop()

## Inputting into DB

In [61]:
#SEQN becomes the `_id` field of the records built from this frame
result_1999_2016_cleaned = result_1999_2016_cleaned.rename(columns={'SEQN': '_id'})

In [62]:
result_1999_2016_cleaned[:3]

Unnamed: 0,Unnamed: 1,_id,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN4,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,MEC18YR,Year
1999-2000,0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,990.268132,1999-2000
1999-2000,1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,3408.044382,1999-2000
1999-2000,2,3.0,2.0,10.0,3.0,2.0,2.0,4.0,6.0,4.0,4724.103694,1999-2000


In [63]:
demo_dict = result_1999_2016_cleaned.to_dict(orient='records')

In [64]:
demo_dict[0]

{'DMDBORN4': 1.0,
 'DMDCITZN': 1.0,
 'DMDHHSIZ': 3.0,
 'DMDHREDU': 3.0,
 'INDFMINC': 3.0,
 'MEC18YR': 990.2681319999998,
 'RIAGENDR': 2.0,
 'RIDAGEYR': 2.0,
 'RIDRETH1': 4.0,
 'Year': '1999-2000',
 '_id': 1.0}

In [65]:
#Insert collection
demo.insert_many(demo_dict)

<pymongo.results.InsertManyResult at 0x1156b4d48>

In [66]:
db.list_collection_names()

['smqfam',
 'smq',
 'mcq',
 'hiq',
 'demo',
 'bpq',
 'huq',
 'paq',
 'whq',
 'alq',
 'diq']

In [67]:
#for d in demo.find():
    #print(d)