In [67]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [68]:
#Base directory that holds the downloaded NHANES CSV files (one sub-folder per cycle)
#Use one ./ if running from run_notebooks.py
#Use two ../ if running directly from Jupyter Notebooks
#file_beg = '../NHANES-Downloader/data/csv_data/'

file_beg = './NHANES-Downloader/data/csv_data/'

In [69]:
#Collect the CSV paths of every survey cycle (one list of paths per 2-year cycle)
cycle_dirs = ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008',
              '2009-2010', '2011-2012', '2013-2014', '2015-2016']
(files1, files2, files3, files4, files5,
 files6, files7, files8, files9) = [
    glob.glob(file_beg + cycle_dir + '/*/*.csv') for cycle_dir in cycle_dirs
]

In [70]:
#Gather the per-cycle path lists in chronological order
file_list = [
    files1, files2, files3,
    files4, files5, files6,
    files7, files8, files9,
]

In [71]:
#Sort each cycle's paths in place, then store them as {position: path} dictionaries
file_list_dict = []
for cycle_files in file_list:
    cycle_files.sort()
    file_list_dict.append({pos: path for pos, path in enumerate(cycle_files)})

In [72]:
#Position, within each cycle's sorted path list, of the file to load (one entry per cycle)
#NOTE(review): 0 presumably selects the demographics file because it sorts first - confirm
demo_indx = [0, 0, 0, 0, 0, 0, 0, 0, 0]

In [73]:
#Load the selected CSV of every cycle, keyed by cycle position (0 = 1999-2000)
#A plain dict is used so that asking for a cycle that was never loaded raises
#KeyError instead of silently yielding 0, as defaultdict(int) would
dfs = {}
for i, file_idx in enumerate(demo_indx):
    dfs[i] = pd.read_csv(file_list_dict[i][file_idx])

## Important Variables

In [74]:
#Columns kept from each cycle's file. The lists differ only where a variable
#changes name between cycles (country of birth, family income, weights).

#EDIT: Removed INDHHINC from each after DMDHHSIZ and before INDFMINC
#EDIT: Removed INDHHIN2 from each after DMDHHSIZ and before INDFMIN2
#EDIT: Removed DMDAGE before DMDREDU before WT variable

#1999-2002 (4-year weights WTINT4YR / WTMEC4YR)
var_names = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN", "DMDCITZN", "DMDHHSIZ", "INDFMINC", "DMDHREDU", "WTINT4YR", "WTMEC4YR", "RIDSTATR", "SDMVPSU", "SDMVSTRA"]

#2003-2006 (2-year weights WTINT2YR / WTMEC2YR)
#RIDRETH1 is the only race/ethnicity variable in the 2005-2006 Demographics data file. The Demographics files that were released for NHANES 1999-2004 had a variable called ‘RIDRETH2’ that provided an analytic link to the NHANES III race/ethnicity variable.
var_names1 = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN", "DMDCITZN", "DMDHHSIZ", "INDFMINC", "DMDHREDU", "WTINT2YR", "WTMEC2YR", "RIDSTATR", "SDMVPSU", "SDMVSTRA"]

#2007-2010
#DMDBORN becomes DMDBORN2
#Recategorize DMDBORN2
#INDFMINC becomes INDFMIN2 - Recategorize 14 and 15 to 11 to match 1999-2006
#INDHHINC becomes INDHHIN2 - Recategorize 14 and 15 to 11 to match 1999-2006
var_names2 = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN2", "DMDCITZN", "DMDHHSIZ", "INDFMIN2", "DMDHREDU", "WTINT2YR", "WTMEC2YR", "RIDSTATR", "SDMVPSU", "SDMVSTRA"]

#2011-2016
#DMDBORN2 becomes DMDBORN4
#NOTE(review): DMDBORN4 presumably has only 2 substantive categories, which is
#why DMDBORN / DMDBORN2 from earlier cycles are recategorized to match - confirm
var_names3 = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN4", "DMDCITZN", "DMDHHSIZ", "INDFMIN2", "DMDHREDU", "WTINT2YR", "WTMEC2YR", "RIDSTATR", "SDMVPSU", "SDMVSTRA"]

In [75]:
#Raise the column display limit so wide frames are shown in full in Jupyter
pd.options.display.max_columns = 500

## Functions: Recategorize values, Count Values, Drop Rows

In [76]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Recode the values of one column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the recoded column is written back onto this object.
    name : str
        Label of the column whose values are recoded.
    replace_dict : dict
        Mapping of old value -> new value. Values that are not keys of the
        mapping are left unchanged.

    Returns
    -------
    None
    """
    # Assign the recoded column back rather than calling
    # ``df[name].replace(..., inplace=True)``: that form mutates a temporary
    # Series, which does not update ``df`` when pandas Copy-on-Write is active.
    df[name] = df[name].replace(to_replace=replace_dict)
    
#Count values function
def count_vals(df, name):
    """Print how many distinct SEQN ids fall under each value of a column.

    The per-value counts come from grouping ``df`` by ``name`` (rows whose
    ``name`` is NaN are not part of any group); the number of NaN entries in
    ``name`` is printed after them. Nothing is returned.
    """
    unique_ids_per_value = df.groupby(name)['SEQN'].nunique()
    n_missing = df[name].isnull().sum()
    print(unique_ids_per_value, "\n\n", "NaN: ", n_missing)
    
#Drop rows that include certain values
def drop_rows(df, name, val_list):
    """Remove, in place, every row of ``df`` whose ``name`` value is in ``val_list``.

    Rows are removed by index label, so ``df`` keeps the labels of the
    surviving rows. Nothing is returned.
    """
    unwanted = df[name].isin(val_list)
    labels_to_drop = df.index[unwanted.values]
    df.drop(labels_to_drop, inplace=True)

## Make a copy of the dataframes

In [77]:
#Keep only the analysis columns of each cycle; .copy() detaches the result from dfs
df0, df1 = (dfs[k][var_names].copy() for k in (0, 1))          #1999-2000, 2001-2002
df2, df3 = (dfs[k][var_names1].copy() for k in (2, 3))         #2003-2004, 2005-2006
df4, df5 = (dfs[k][var_names2].copy() for k in (4, 5))         #2007-2008, 2009-2010
df6, df7, df8 = (dfs[k][var_names3].copy() for k in (6, 7, 8)) #2011-2012, 2013-2014, 2015-2016

In [78]:
df0.shape

(9965, 14)

In [79]:
df8.shape

(9971, 14)

# Keep only rows with RIDSTATR = 2 (individual was both interviewed and MEC examined)

In [80]:
#Keep only the rows with RIDSTATR == 2.
#.copy() makes each filtered frame an independent object: later cells add
#columns to these frames and modify them in place, which on a boolean-indexed
#slice triggers pandas' SettingWithCopy behaviour.
df0 = df0[df0['RIDSTATR']==2].copy() #1999-2000
df1 = df1[df1['RIDSTATR']==2].copy() #2001-2002
df2 = df2[df2['RIDSTATR']==2].copy() #2003-2004
df3 = df3[df3['RIDSTATR']==2].copy() #2005-2006
df4 = df4[df4['RIDSTATR']==2].copy() #2007-2008
df5 = df5[df5['RIDSTATR']==2].copy() #2009-2010
df6 = df6[df6['RIDSTATR']==2].copy() #2011-2012
df7 = df7[df7['RIDSTATR']==2].copy() #2013-2014
df8 = df8[df8['RIDSTATR']==2].copy() #2015-2016

## Weight for 18-years: 1999 - 2016

In [81]:
#Create column for weight
def reweight(df, col_name, year_wt, weight):
    """Add (or overwrite) column ``col_name`` on ``df`` as ``df[year_wt]`` times ``weight``.

    ``df`` is modified in place; nothing is returned.
    """
    scaled = weight * df[year_wt]
    df[col_name] = scaled

In [82]:
#Build the combined MEC weight: the 4-year weight is scaled by 2/9 and every
#2-year weight by 1/9
#NOTE(review): presumably 2/9 because WTMEC4YR spans two of the nine 2-year cycles - confirm
for frame in (df0, df1):
    reweight(frame, 'MEC18YR', 'WTMEC4YR', 2/9)
for frame in (df2, df3, df4, df5, df6, df7, df8):
    reweight(frame, 'MEC18YR', 'WTMEC2YR', 1/9)

## Remove 2-yr and 4-yr weights

In [83]:
#Drop columns
def drop_cols(df, cols):
    """Remove the columns listed in ``cols`` from ``df`` in place; returns nothing."""
    df.drop(columns=cols, inplace=True)

In [84]:
#The 4-year weight columns are no longer needed once MEC18YR exists
for frame in (df0, df1):
    drop_cols(frame, ['WTINT4YR', 'WTMEC4YR'])

In [85]:
#The 2-year weight columns are no longer needed once MEC18YR exists
for frame in (df2, df3, df4, df5, df6, df7, df8):
    drop_cols(frame, ['WTINT2YR', 'WTMEC2YR'])

In [86]:
df0[:5]

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,RIDSTATR,SDMVPSU,SDMVSTRA,MEC18YR
0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,2.0,1.0,5.0,990.268132
1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,2.0,3.0,1.0,3408.044382
2,3.0,2.0,10.0,3.0,3.0,2.0,4.0,6.0,4.0,2.0,2.0,7.0,4724.103694
3,4.0,1.0,1.0,4.0,1.0,1.0,7.0,3.0,3.0,2.0,1.0,2.0,1013.864237
4,5.0,1.0,49.0,3.0,1.0,1.0,3.0,11.0,4.0,2.0,2.0,8.0,10219.103963


## Recategorize values

In [87]:
#Recategorize DMDBORN (1999-2006) to 2 categories; codes 7 and 9 become 77 and 99
for frame in (df0, df1, df2, df3):
    recategorize(frame, 'DMDBORN', {3:2, 7:77, 9:99})

#Recategorize DMDBORN2 (2007-2010) to 2 categories; codes 7 and 9 become 77 and 99
for frame in (df4, df5):
    recategorize(frame, 'DMDBORN2', {4:2, 5:2, 7:77, 9:99})

#Recategorize INDFMIN2 (2007-2016) to INDFMINC: codes 14 and 15 become 11
#The matching INDHHIN2 -> INDHHINC recode is not applied because INDHHIN2 was
#removed from the selected columns
for frame in (df4, df5, df6, df7, df8):
    recategorize(frame, 'INDFMIN2', {14:11, 15:11})

## Rename columns 1999 - 2016

In [88]:
df0

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,RIDSTATR,SDMVPSU,SDMVSTRA,MEC18YR
0,1.0,2.0,2.000000e+00,4.0,1.0,1.0,3.0,3.0,3.0,2.0,1.0,5.0,990.268132
1,2.0,1.0,7.700000e+01,3.0,1.0,1.0,1.0,8.0,5.0,2.0,3.0,1.0,3408.044382
2,3.0,2.0,1.000000e+01,3.0,2.0,2.0,4.0,6.0,4.0,2.0,2.0,7.0,4724.103694
3,4.0,1.0,1.000000e+00,4.0,1.0,1.0,7.0,3.0,3.0,2.0,1.0,2.0,1013.864237
4,5.0,1.0,4.900000e+01,3.0,1.0,1.0,3.0,11.0,4.0,2.0,2.0,8.0,10219.103963
5,6.0,2.0,1.900000e+01,5.0,1.0,1.0,2.0,3.0,4.0,2.0,2.0,2.0,4074.958535
6,7.0,2.0,5.900000e+01,4.0,1.0,1.0,1.0,,2.0,2.0,2.0,4.0,2976.960080
7,8.0,1.0,1.300000e+01,3.0,1.0,1.0,7.0,3.0,1.0,2.0,1.0,6.0,3274.333422
8,9.0,2.0,1.100000e+01,4.0,1.0,1.0,4.0,99.0,2.0,2.0,2.0,9.0,781.858676
9,10.0,1.0,4.300000e+01,4.0,1.0,1.0,1.0,99.0,3.0,2.0,1.0,7.0,2494.744543


In [89]:
#New column names
col_names = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDBORN4", "DMDCITZN", "DMDHHSIZ", "INDFMINC", "DMDHREDU", "RIDSTATR", "SDMVPSU", "SDMVSTRA", "MEC18YR"]

In [90]:
#Give every cycle the same harmonised column labels (assigned by position,
#so each frame's column order must line up with col_names)
for frame in (df0, df1, df2, df3, df4, df5, df6, df7, df8):
    frame.columns = col_names

## Append years 1999 - 2016

In [91]:
years = ["1999-2000","2001-2002","2003-2004","2005-2006","2007-2008", 
        "2009-2010","2011-2012","2013-2014","2015-2016"]

In [92]:
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8]

In [93]:
#Tag every row with the label of the cycle it came from
for cycle_pos, frame in enumerate(frames):
    frame["Year"] = years[cycle_pos]

In [94]:
#Stack the nine cycles; keys=years adds an outer index level holding the cycle label
result_1999_2016 = pd.concat(frames, keys = years)

In [95]:
result_1999_2016_cleaned = result_1999_2016.copy()

In [96]:
#Getting rows by keys
#result_1999_2016.loc[years[0]][:5]

In [97]:
len(result_1999_2016)

88062

## Count values for each column

In [98]:
len(col_names)

13

In [99]:
#RIAGENDR - Gender of the sample person
count_vals(result_1999_2016, col_names[1])

RIAGENDR
1.0    43439
2.0    44623
Name: SEQN, dtype: int64 

 NaN:  0


In [100]:
#RIDAGEYR - Age at Screening Adjudicated
#count_vals(result_1999_2016, col_names[2])

In [101]:
#RIDRETH1 - Race/Ethnicity
count_vals(result_1999_2016, col_names[3])

RIDRETH1
1.0    20270
2.0     7107
3.0    32641
4.0    20753
5.0     7291
Name: SEQN, dtype: int64 

 NaN:  0


In [102]:
#DMDBORN4 - Country of Birth
count_vals(result_1999_2016, col_names[4])

DMDBORN4
1.0     72305
2.0     15722
77.0       21
99.0        7
Name: SEQN, dtype: int64 

 NaN:  7


In [103]:
#DMDCITZN - Citizenship status
count_vals(result_1999_2016, col_names[5])

DMDCITZN
1.0    78640
2.0     9261
7.0       93
9.0       44
Name: SEQN, dtype: int64 

 NaN:  24


In [104]:
#DMDHHSIZ - Total number of people in the Household
count_vals(result_1999_2016, col_names[6])

DMDHHSIZ
1.0     6797
2.0    16354
3.0    15437
4.0    18532
5.0    14356
6.0     7780
7.0     8806
Name: SEQN, dtype: int64 

 NaN:  0


In [105]:
#INDHHINC - Annual Household Income
#count_vals(result_1999_2016, col_names[7])

In [106]:
#INDFMINC - Annual Family Income
count_vals(result_1999_2016, col_names[7])

INDFMINC
1.0      3754
2.0      5132
3.0      7592
4.0      6926
5.0      7495
6.0     10386
7.0      7865
8.0      6466
9.0      4627
10.0     3731
11.0    16890
12.0     2151
13.0     1304
77.0     1503
99.0     1214
Name: SEQN, dtype: int64 

 NaN:  1026


In [107]:
#DMDHRAGE - HH Ref Person Age
#count_vals(result_1999_2016, col_names[9])

In [108]:
#DMDHREDU - HH reference person's education level
count_vals(result_1999_2016, col_names[8])

DMDHREDU
1.0    10592
2.0    14579
3.0    19970
4.0    23006
5.0    16883
7.0       47
9.0      350
Name: SEQN, dtype: int64 

 NaN:  2635


In [109]:
len(result_1999_2016)

88062

In [110]:
result_1999_2016_cleaned[:5]

Unnamed: 0,Unnamed: 1,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN4,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,RIDSTATR,SDMVPSU,SDMVSTRA,MEC18YR,Year
1999-2000,0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,2.0,1.0,5.0,990.268132,1999-2000
1999-2000,1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,2.0,3.0,1.0,3408.044382,1999-2000
1999-2000,2,3.0,2.0,10.0,3.0,2.0,2.0,4.0,6.0,4.0,2.0,2.0,7.0,4724.103694,1999-2000
1999-2000,3,4.0,1.0,1.0,4.0,1.0,1.0,7.0,3.0,3.0,2.0,1.0,2.0,1013.864237,1999-2000
1999-2000,4,5.0,1.0,49.0,3.0,1.0,1.0,3.0,11.0,4.0,2.0,2.0,8.0,10219.103963,1999-2000


## Remove rows with missing values:

In [111]:
#Values treated as missing for each column (NaN plus the listed codes),
#applied column by column in this order
#NOTE(review): the RIDAGEYR value counts show a 5.397605e-79 entry, presumably a
#float artifact standing for age 0 - confirm whether it should be normalised
invalid_values = [
    (col_names[1], [np.nan]),
    (col_names[2], [np.nan]),
    (col_names[3], [np.nan]),
    (col_names[4], [np.nan, 77, 99]),
    (col_names[5], [np.nan, 7, 9]),
    (col_names[6], [np.nan]),
    (col_names[7], [np.nan, 12, 13, 77, 99]),
    (col_names[8], [np.nan, 7, 9]),
]
for column, bad_values in invalid_values:
    drop_rows(result_1999_2016_cleaned, column, bad_values)

In [112]:
len(result_1999_2016_cleaned)

78588

In [113]:
result_1999_2016_cleaned[:5]

Unnamed: 0,Unnamed: 1,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN4,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,RIDSTATR,SDMVPSU,SDMVSTRA,MEC18YR,Year
1999-2000,0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,2.0,1.0,5.0,990.268132,1999-2000
1999-2000,1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,2.0,3.0,1.0,3408.044382,1999-2000
1999-2000,2,3.0,2.0,10.0,3.0,2.0,2.0,4.0,6.0,4.0,2.0,2.0,7.0,4724.103694,1999-2000
1999-2000,3,4.0,1.0,1.0,4.0,1.0,1.0,7.0,3.0,3.0,2.0,1.0,2.0,1013.864237,1999-2000
1999-2000,4,5.0,1.0,49.0,3.0,1.0,1.0,3.0,11.0,4.0,2.0,2.0,8.0,10219.103963,1999-2000


## See if missing values have been correctly removed:

In [114]:
count_vals(result_1999_2016_cleaned, col_names[1])

RIAGENDR
1.0    38922
2.0    39666
Name: SEQN, dtype: int64 

 NaN:  0


In [115]:
count_vals(result_1999_2016_cleaned, col_names[2])

RIDAGEYR
5.397605e-79    3623
1.000000e+00    2364
2.000000e+00    2397
3.000000e+00    1616
4.000000e+00    1719
5.000000e+00    1533
6.000000e+00    1634
7.000000e+00    1623
8.000000e+00    1624
9.000000e+00    1567
1.000000e+01    1556
1.100000e+01    1631
1.200000e+01    1805
1.300000e+01    1801
1.400000e+01    1844
1.500000e+01    1702
1.600000e+01    1820
1.700000e+01    1683
1.800000e+01    1600
1.900000e+01    1535
2.000000e+01     742
2.100000e+01     739
2.200000e+01     759
2.300000e+01     767
2.400000e+01     746
2.500000e+01     719
2.600000e+01     746
2.700000e+01     687
2.800000e+01     706
2.900000e+01     771
                ... 
5.600000e+01     579
5.700000e+01     536
5.800000e+01     481
5.900000e+01     509
6.000000e+01     847
6.100000e+01     766
6.200000e+01     747
6.300000e+01     704
6.400000e+01     643
6.500000e+01     650
6.600000e+01     618
6.700000e+01     552
6.800000e+01     541
6.900000e+01     502
7.000000e+01     599
7.100000e+01     528
7.20

In [116]:
count_vals(result_1999_2016_cleaned, col_names[3])

RIDRETH1
1.0    17513
2.0     6069
3.0    30089
4.0    18523
5.0     6394
Name: SEQN, dtype: int64 

 NaN:  0


In [117]:
count_vals(result_1999_2016_cleaned, col_names[4])

DMDBORN4
1.0    65345
2.0    13243
Name: SEQN, dtype: int64 

 NaN:  0


In [118]:
count_vals(result_1999_2016_cleaned, col_names[5])

DMDCITZN
1.0    70855
2.0     7733
Name: SEQN, dtype: int64 

 NaN:  0


In [119]:
count_vals(result_1999_2016_cleaned, col_names[6])

DMDHHSIZ
1.0     6314
2.0    14795
3.0    13944
4.0    16631
5.0    12845
6.0     6849
7.0     7210
Name: SEQN, dtype: int64 

 NaN:  0


In [120]:
count_vals(result_1999_2016_cleaned, col_names[7])

INDFMINC
1.0      3558
2.0      4971
3.0      7364
4.0      6712
5.0      7271
6.0     10157
7.0      7694
8.0      6274
9.0      4535
10.0     3655
11.0    16397
Name: SEQN, dtype: int64 

 NaN:  0


In [121]:
count_vals(result_1999_2016_cleaned, col_names[8])

DMDHREDU
1.0     9336
2.0    13294
3.0    18407
4.0    21593
5.0    15958
Name: SEQN, dtype: int64 

 NaN:  0


In [122]:
before = len(result_1999_2016)
before

88062

In [123]:
after = len(result_1999_2016_cleaned)
after

78588

In [124]:
#Dropped rows relative to the rows that remain (the denominator is `after`, not `before`)
(before-after)/after

0.12055275614597649

In [125]:
result_1999_2016_cleaned.isnull().any()

SEQN        False
RIAGENDR    False
RIDAGEYR    False
RIDRETH1    False
DMDBORN4    False
DMDCITZN    False
DMDHHSIZ    False
INDFMINC    False
DMDHREDU    False
RIDSTATR    False
SDMVPSU     False
SDMVSTRA    False
MEC18YR     False
Year        False
dtype: bool

## MongoDB Insertion

In [126]:
#Import MongoClient
from pymongo import MongoClient

In [127]:
#Create a MongoClient to run the MongoDB instance
client = MongoClient("localhost", 27017)

In [128]:
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [129]:
#Creating a database
db = client['NHANES']
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [130]:
db.list_collection_names()

['mcq_h',
 'smq',
 'smqfam',
 'bpx',
 'descr',
 'mcq_b',
 'hiq',
 'demo',
 'bpq',
 'drxtot',
 'huq',
 'tchol',
 'mcq_a',
 'mcq_c',
 'paq',
 'whq',
 'alq',
 'rdq',
 'diq',
 'bmx']

In [131]:
#Creating a collection
demo_p = db.demo_p

In [132]:
#If the collection already exists, drop it so the insert further down starts from an empty collection
#(the former trailing db.list_collection_names() call inside the if was removed:
#its result was discarded, so it only cost an extra round-trip to the server)
if 'demo_p' in db.list_collection_names():
    demo_p.drop()

## Inputting into DB

In [133]:
result_1999_2016_cleaned.rename(columns= {'SEQN':'_id'}, inplace=True)

In [134]:
result_1999_2016_cleaned[:3]

Unnamed: 0,Unnamed: 1,_id,RIAGENDR,RIDAGEYR,RIDRETH1,DMDBORN4,DMDCITZN,DMDHHSIZ,INDFMINC,DMDHREDU,RIDSTATR,SDMVPSU,SDMVSTRA,MEC18YR,Year
1999-2000,0,1.0,2.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,2.0,1.0,5.0,990.268132,1999-2000
1999-2000,1,2.0,1.0,77.0,3.0,1.0,1.0,1.0,8.0,5.0,2.0,3.0,1.0,3408.044382,1999-2000
1999-2000,2,3.0,2.0,10.0,3.0,2.0,2.0,4.0,6.0,4.0,2.0,2.0,7.0,4724.103694,1999-2000


In [135]:
demo_p_dict = result_1999_2016_cleaned.to_dict(orient='records')

In [136]:
demo_p_dict[0]

{'DMDBORN4': 1.0,
 'DMDCITZN': 1.0,
 'DMDHHSIZ': 3.0,
 'DMDHREDU': 3.0,
 'INDFMINC': 3.0,
 'MEC18YR': 990.2681319999998,
 'RIAGENDR': 2.0,
 'RIDAGEYR': 2.0,
 'RIDRETH1': 4.0,
 'RIDSTATR': 2.0,
 'SDMVPSU': 1.0,
 'SDMVSTRA': 5.0,
 'Year': '1999-2000',
 '_id': 1.0}

In [137]:
#Insert collection
demo_p.insert_many(demo_p_dict)

<pymongo.results.InsertManyResult at 0x11c278e48>

In [138]:
db.list_collection_names()

['mcq_h',
 'smq',
 'smqfam',
 'bpx',
 'descr',
 'mcq_b',
 'hiq',
 'demo',
 'bpq',
 'drxtot',
 'huq',
 'tchol',
 'mcq_a',
 'mcq_c',
 'demo_p',
 'paq',
 'whq',
 'alq',
 'rdq',
 'diq',
 'bmx']

## Dataframe to CSV

In [66]:
#result_1999_2016_cleaned.to_csv('DEMO.csv')