In [52]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [53]:
#To display all columns in Jupyter Notebooks
pd.set_option('display.max_columns', 500)

In [54]:
#Import MongoClient
from pymongo import MongoClient

#Create a MongoClient to run the MongoDB instance
client = MongoClient('localhost', 27017)

In [55]:
#Connect to existing database
db = client.NHANES

In [56]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [57]:
#Alphabetically ordered names of the collections in the database
col = sorted(db.list_collection_names())
col

['alq',
 'bmx',
 'bpq',
 'bpx',
 'demo',
 'descr',
 'diq',
 'drxtot',
 'hiq',
 'huq',
 'mcq_a',
 'mcq_b',
 'mcq_c',
 'mcq_h',
 'paq',
 'rdq',
 'smq',
 'smqfam',
 'tchol',
 'whq']

In [58]:
#Collections
#One handle per collection of the NHANES database. The descriptions added
#below are the ones shown in the `descr` join output further down; handles
#without a description there are left unannotated.
demo = db.demo #Demographics
alq = db.alq
diq = db.diq #Diabetes
drxtot = db.drxtot #Dietary
bpq = db.bpq
bpx = db.bpx
tchol = db.tchol
bmx = db.bmx #Body Measures
paq = db.paq #Physical Activity
smq = db.smq
smqfam = db.smqfam #Household Smoking

mcq_a = db.mcq_a #Asthma
mcq_h = db.mcq_h #Heart Disease
mcq_c = db.mcq_c #Cancer
mcq_b = db.mcq_b #Bronchitis (Chronic Lung)

hiq = db.hiq #Health Insurance
huq = db.huq #Hospital Utilization
whq = db.whq
#NOTE: the handle is called `rd` although the collection is `rdq` (Cough)
rd = db.rdq

#Collection that is loaded into df_descr and joined on '_id' for descriptions
descr = db.descr

In [59]:
#Create dataframes from database
def _as_frame(collection):
    """Read every document returned by ``collection.find()`` into a DataFrame."""
    documents = list(collection.find())
    return pd.DataFrame(documents)

df_demo = _as_frame(demo)
df_alq = _as_frame(alq)
df_diq = _as_frame(diq)
df_drxtot = _as_frame(drxtot)
df_bpq = _as_frame(bpq)
df_bpx = _as_frame(bpx)
df_tchol = _as_frame(tchol)
df_bmx = _as_frame(bmx)
df_paq = _as_frame(paq)
df_smq = _as_frame(smq)
df_smqfam = _as_frame(smqfam)

df_mcq_a = _as_frame(mcq_a)
df_mcq_h = _as_frame(mcq_h)
df_mcq_c = _as_frame(mcq_c)
df_mcq_b = _as_frame(mcq_b)

df_hiq = _as_frame(hiq)
df_huq = _as_frame(huq)
df_whq = _as_frame(whq)
#The rdq collection handle is named `rd`
df_rdq = _as_frame(rd)

df_descr = _as_frame(descr)

In [60]:
#All records
dfs = [df_demo, df_alq, df_diq, df_drxtot, df_bpq, df_bpx, df_tchol, df_bmx, df_paq, 
       df_smq, df_smqfam, df_mcq_a, df_mcq_b, df_mcq_c, df_mcq_h, df_hiq, df_huq, df_whq, df_rdq]

In [61]:
names = ['demo', 'alq', 'diq', 'drxtot', 'bpq', 'bpx', 'tchol', 'bmx', 'paq',
        'smq', 'smqfam', 'mcq_a', 'mcq_b', 'mcq_c', 'mcq_h', 'hiq', 'huq', 'whq', 'rdq']

In [62]:
data_dict = dict(zip(names,dfs))

## Functions:

In [63]:
#Declare label globally
label = 'MCQ160K'

In [64]:
#Function for inner join
def innerjoin_df(dfs_list, join_on):
    """Inner-join a list of dataframes, left to right.

    Parameters
    ----------
    dfs_list : list of pandas.DataFrame
        Frames to combine; must contain at least one element
        (an empty list raises IndexError).
    join_on : str or list of str
        Column name(s) passed as ``on=`` to every merge.

    Returns
    -------
    pandas.DataFrame
        The first frame merged in turn with each following frame. With a
        single-element list the first frame itself is returned.
    """
    first, rest = dfs_list[0], dfs_list[1:]
    joined = first
    for right in rest:
        joined = joined.merge(right, on=join_on, how='inner')
    return joined

In [65]:
#Function for getting info from list of collections
#Look at records and features for each
def get_info(dfs, names):
    """Summarise how many records and features each dataframe has.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to describe.
    names : sequence of str
        ``names[i]`` labels ``dfs[i]``. Surplus names are ignored. When a
        name is repeated, the shape of the last frame carrying it is kept.

    Returns
    -------
    pandas.DataFrame
        One row per name with columns ``_id`` (the name), ``Records``
        (row count) and ``Features`` (column count). Empty input yields an
        empty frame that still has these three columns.

    Raises
    ------
    IndexError
        If there are fewer names than dataframes.
    """
    frames = list(dfs)
    if len(names) < len(frames):
        raise IndexError('get_info: fewer names than dataframes')
    # Going through a dict keeps one row per distinct name (last frame wins).
    shapes = {name: frame.shape for name, frame in zip(names, frames)}
    rows = [(name, n_rows, n_cols) for name, (n_rows, n_cols) in shapes.items()]
    # Passing the columns explicitly keeps the result well-formed for zero rows.
    return pd.DataFrame(rows, columns=['_id', 'Records', 'Features'])

In [66]:
info = get_info(dfs, names)
info_join = innerjoin_df([info, df_descr], ['_id'])
info_join = info_join.sort_values(by='Records', ascending=False)
info_join

Unnamed: 0,_id,Records,Features,Description
15,hiq,91399,3,Health Insurance
10,smqfam,90825,3,Household Smoking
2,diq,87852,3,Diabetes
11,mcq_a,87808,3,Asthma
16,huq,87707,7,Hospital Utilization
0,demo,81635,11,Demographics
3,drxtot,80241,26,Dietary
7,bmx,76742,6,Body Measures
18,rdq,68481,4,Cough
8,paq,64823,5,Physical Activity


### Select data to use

In [67]:
#Get relevant data
def get_reldata(df, lookup=None):
    """Return the dataframes registered under the given names.

    Parameters
    ----------
    df : iterable of str
        Keys to look up (despite the parameter name, this is a collection of
        names, not a dataframe).
    lookup : mapping, optional
        Mapping from name to dataframe. Defaults to the notebook-level
        ``data_dict`` so existing one-argument calls keep working.

    Returns
    -------
    list
        The looked-up values, in the same order as the keys.

    Raises
    ------
    KeyError
        If a key is not present in the mapping.
    """
    if lookup is None:
        # Resolved at call time so the notebook-level registry is used by default.
        lookup = data_dict
    return [lookup[key] for key in df]

In [70]:
#Selected risk factors for disease
names = ['demo', 'alq', 'drxtot', 'bpq', 'bpx', 'bmx', 'paq',
        'smq', 'smqfam', 'mcq_a', 'mcq_b', 'hiq', 'huq']

In [71]:
#Look up the dataframes for the selected risk factors
dfs = get_reldata(names)

## Join dataframes

In [72]:
df_j = innerjoin_df(dfs, ['_id','Year'])
df_j.shape

(33009, 58)

In [73]:
df_j.head()

Unnamed: 0,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,Year,_id,ALQ101,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BPQ020,BPXDI1,BPXPULS,BPXSY1,BMXBMI,BMXHT,BMXWAIST,BMXWT,PAQ635,PAQ650,PAQ665,SMAQUEX,SMQ680,SMD410,MCQ010,MCQ160K,HID010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070
0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,1999-2000,2.0,1.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,2.0,58.0,1.0,106.0,24.9,174.0,98.0,75.4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,3.0,1.0
1,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1999-2000,5.0,1.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,1.0,82.0,1.0,122.0,29.1,178.3,99.9,92.5,2.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,3.0,1.0,3.0,2.0
2,1.0,1.0,4.0,2.0,11.0,10149.365568,1.0,37.0,3.0,1999-2000,12.0,1.0,3304.0,7511.18,13280.15039,5.397605e-79,309.02,1292.81,309.57,1242.06,1.7,18.68,23.31,3348.31,435.55,2364.22,4607.26,130.02,178.85,923.8,2.43,8.36,3.31,2.4,70.96,17.47,1.0,108.0,1.0,182.0,30.62,180.0,112.8,99.2,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,3.0,1.0,2.0,2.0
3,1.0,1.0,2.0,5.0,8.0,11437.714415,2.0,38.0,3.0,1999-2000,15.0,1.0,2478.0,3832.49,18684.987158,13.15,432.85,814.59,264.69,596.3,1.4,16.15,16.14,2573.14,266.8,1321.3,3171.15,82.66,127.03,720.5,1.38,4.64,1.79,1.72,36.11,11.77,2.0,68.0,1.0,106.0,26.68,174.9,86.7,81.6,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,3.0,3.0,1.0,3.0,2.0
4,1.0,1.0,1.0,1.0,1.0,1957.4984,2.0,85.0,4.0,1999-2000,16.0,2.0,236.0,2129.94,1304.207395,5.397605e-79,5.397605e-79,509.35,112.24,124.95,0.66,16.36,7.54,898.38,183.6,689.38,1458.25,39.62,33.87,1844.08,0.88,1.19,0.89,0.77,69.19,6.47,2.0,62.0,1.0,164.0,19.96,144.2,74.4,41.5,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,3.0,1.0,1.0,2.0


## Reorder columns

In [74]:
#Get a list of columns
cols = list(df_j)

In [75]:
#Move '_id' column to head of list
cols.remove('_id')
cols.insert(0, '_id')

#Move 'Year' column to back of list.
#Appending keeps this independent of df_j: the position no longer has to be
#derived from len(df_j.columns).
cols.remove('Year')
cols.append('Year')

In [76]:
#Reorder dataframe
df_j = df_j.loc[:, cols]
df_j.head()

Unnamed: 0,_id,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,ALQ101,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BPQ020,BPXDI1,BPXPULS,BPXSY1,BMXBMI,BMXHT,BMXWAIST,BMXWT,PAQ635,PAQ650,PAQ665,SMAQUEX,SMQ680,SMD410,MCQ010,MCQ160K,HID010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070,Year
0,2.0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,1.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,2.0,58.0,1.0,106.0,24.9,174.0,98.0,75.4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,3.0,1.0,1999-2000
1,5.0,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,1.0,82.0,1.0,122.0,29.1,178.3,99.9,92.5,2.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,3.0,1.0,3.0,2.0,1999-2000
2,12.0,1.0,1.0,4.0,2.0,11.0,10149.365568,1.0,37.0,3.0,1.0,3304.0,7511.18,13280.15039,5.397605e-79,309.02,1292.81,309.57,1242.06,1.7,18.68,23.31,3348.31,435.55,2364.22,4607.26,130.02,178.85,923.8,2.43,8.36,3.31,2.4,70.96,17.47,1.0,108.0,1.0,182.0,30.62,180.0,112.8,99.2,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,3.0,1.0,2.0,2.0,1999-2000
3,15.0,1.0,1.0,2.0,5.0,8.0,11437.714415,2.0,38.0,3.0,1.0,2478.0,3832.49,18684.987158,13.15,432.85,814.59,264.69,596.3,1.4,16.15,16.14,2573.14,266.8,1321.3,3171.15,82.66,127.03,720.5,1.38,4.64,1.79,1.72,36.11,11.77,2.0,68.0,1.0,106.0,26.68,174.9,86.7,81.6,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,3.0,3.0,1.0,3.0,2.0,1999-2000
4,16.0,1.0,1.0,1.0,1.0,1.0,1957.4984,2.0,85.0,4.0,2.0,236.0,2129.94,1304.207395,5.397605e-79,5.397605e-79,509.35,112.24,124.95,0.66,16.36,7.54,898.38,183.6,689.38,1458.25,39.62,33.87,1844.08,0.88,1.19,0.89,0.77,69.19,6.47,2.0,62.0,1.0,164.0,19.96,144.2,74.4,41.5,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,3.0,1.0,1.0,2.0,1999-2000


## Remap years to number categories

In [77]:
di = {"1999-2000": 0, "2001-2002": 1, "2003-2004": 2, "2005-2006": 3, "2007-2008": 4, 
      "2009-2010": 5, "2011-2012": 6, "2013-2014": 7, "2015-2016": 8}

In [78]:
#Map categorical years to numerical
#Values that are not keys of `di` (including the integer codes left by an
#earlier run of this cell) are passed through unchanged, so re-running the
#cell is a no-op instead of turning the whole column into NaN.
df_j['Year'] = df_j['Year'].map(lambda cycle: di.get(cycle, cycle))

In [79]:
df_j.head()

Unnamed: 0,_id,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,ALQ101,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BPQ020,BPXDI1,BPXPULS,BPXSY1,BMXBMI,BMXHT,BMXWAIST,BMXWT,PAQ635,PAQ650,PAQ665,SMAQUEX,SMQ680,SMD410,MCQ010,MCQ160K,HID010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070,Year
0,2.0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,1.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,2.0,58.0,1.0,106.0,24.9,174.0,98.0,75.4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,3.0,1.0,0
1,5.0,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,1.0,82.0,1.0,122.0,29.1,178.3,99.9,92.5,2.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,3.0,1.0,3.0,2.0,0
2,12.0,1.0,1.0,4.0,2.0,11.0,10149.365568,1.0,37.0,3.0,1.0,3304.0,7511.18,13280.15039,5.397605e-79,309.02,1292.81,309.57,1242.06,1.7,18.68,23.31,3348.31,435.55,2364.22,4607.26,130.02,178.85,923.8,2.43,8.36,3.31,2.4,70.96,17.47,1.0,108.0,1.0,182.0,30.62,180.0,112.8,99.2,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,3.0,1.0,2.0,2.0,0
3,15.0,1.0,1.0,2.0,5.0,8.0,11437.714415,2.0,38.0,3.0,1.0,2478.0,3832.49,18684.987158,13.15,432.85,814.59,264.69,596.3,1.4,16.15,16.14,2573.14,266.8,1321.3,3171.15,82.66,127.03,720.5,1.38,4.64,1.79,1.72,36.11,11.77,2.0,68.0,1.0,106.0,26.68,174.9,86.7,81.6,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,3.0,3.0,1.0,3.0,2.0,0
4,16.0,1.0,1.0,1.0,1.0,1.0,1957.4984,2.0,85.0,4.0,2.0,236.0,2129.94,1304.207395,5.397605e-79,5.397605e-79,509.35,112.24,124.95,0.66,16.36,7.54,898.38,183.6,689.38,1458.25,39.62,33.87,1844.08,0.88,1.19,0.89,0.77,69.19,6.47,2.0,62.0,1.0,164.0,19.96,144.2,74.4,41.5,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,3.0,1.0,1.0,2.0,0


In [63]:
#Check if any NaN
df_j.isnull().values.any()

False

## Categorize features that need to be One Hot Encoded

In [80]:
df_j.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33009 entries, 0 to 33008
Data columns (total 58 columns):
_id         33009 non-null float64
DMDBORN4    33009 non-null float64
DMDCITZN    33009 non-null float64
DMDHHSIZ    33009 non-null float64
DMDHREDU    33009 non-null float64
INDFMINC    33009 non-null float64
MEC18YR     33009 non-null float64
RIAGENDR    33009 non-null float64
RIDAGEYR    33009 non-null float64
RIDRETH1    33009 non-null float64
ALQ101      33009 non-null float64
DRD320GW    33009 non-null float64
DRDTSODI    33009 non-null float64
DRX18YR     33009 non-null float64
DRXTALCO    33009 non-null float64
DRXTCAFF    33009 non-null float64
DRXTCALC    33009 non-null float64
DRXTCARB    33009 non-null float64
DRXTCHOL    33009 non-null float64
DRXTCOPP    33009 non-null float64
DRXTFIBE    33009 non-null float64
DRXTIRON    33009 non-null float64
DRXTKCAL    33009 non-null float64
DRXTMAGN    33009 non-null float64
DRXTPHOS    33009 non-null float64
DRXTPOTA    3300

In [81]:
#Change columns to category
#Columns to remove: 
#DRX18YR - 18 Year weight
#MEC18YR - 18 year Weight
#Year
#_id

cat_cols = ['DMDBORN4',
            'DMDCITZN',
            'RIAGENDR',
            'RIDRETH1',
            'ALQ101',
            'DIQ010',
            'BPQ020',
            'BPXPULS',
            'PAQ635',
            'PAQ650',
            'PAQ665',
            'SMAQUEX',
            'SMQ680',
            'SMD410',
            'MCQ010',
            'MCQ160C',
            'MCQ160K',
            'MCQ220',
            'HID010',
            'HUQ020',
            'HUQ030',
            'HUQ070',
            'WHQ030',
            'WHQ040']

def recat_cols(df, col_names, categorical=None):
    """Convert the listed categorical columns of ``df`` to ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified IN PLACE and also returned, so the
        returned object and the argument are the same frame.
    col_names : iterable of str
        Candidate column names (typically ``df.columns``).
    categorical : collection of str, optional
        Names that should become categorical. Defaults to the notebook-level
        ``cat_cols`` list so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with every candidate that appears in ``categorical``
        cast to ``category``.
    """
    if categorical is None:
        categorical = cat_cols
    wanted = set(categorical)
    # Snapshot the names first so the frame is not assigned to while its
    # column index is being iterated.
    selected = [name for name in col_names if name in wanted]
    for name in selected:
        df[name] = df[name].astype('category')
    return df

col_names = df_j.columns
df_ohe = recat_cols(df_j, col_names)

In [82]:
df_ohe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33009 entries, 0 to 33008
Data columns (total 58 columns):
_id         33009 non-null float64
DMDBORN4    33009 non-null category
DMDCITZN    33009 non-null category
DMDHHSIZ    33009 non-null float64
DMDHREDU    33009 non-null float64
INDFMINC    33009 non-null float64
MEC18YR     33009 non-null float64
RIAGENDR    33009 non-null category
RIDAGEYR    33009 non-null float64
RIDRETH1    33009 non-null category
ALQ101      33009 non-null category
DRD320GW    33009 non-null float64
DRDTSODI    33009 non-null float64
DRX18YR     33009 non-null float64
DRXTALCO    33009 non-null float64
DRXTCAFF    33009 non-null float64
DRXTCALC    33009 non-null float64
DRXTCARB    33009 non-null float64
DRXTCHOL    33009 non-null float64
DRXTCOPP    33009 non-null float64
DRXTFIBE    33009 non-null float64
DRXTIRON    33009 non-null float64
DRXTKCAL    33009 non-null float64
DRXTMAGN    33009 non-null float64
DRXTPHOS    33009 non-null float64
DRXTPOTA   

## One-Hot Encoding Categories

In [69]:
#DRX18YR - 18 Year weight
#MEC18YR - 18 year Weight
#Year
#_id

In [83]:
#Function to One Hot Encode Categories
def ohe(df_j, label=None):
    """One-hot encode the categorical feature columns of a dataframe.

    Every ``category``-dtype column except ``label`` is replaced by dummy
    columns named ``<column>_<level>``; the first level of each column is
    dropped. The dummy columns are appended after the remaining columns.

    Parameters
    ----------
    df_j : pandas.DataFrame
        Input frame; it is not modified. Must contain a ``'Year'`` column.
    label : str, optional
        Target column. It is never one-hot encoded and is cast to ``uint8``
        in the result.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded features, ``label`` (if given) and
        ``'Year'`` stored as ``uint8``.

    Raises
    ------
    KeyError
        If ``'Year'`` (or a given ``label``) is not a column of ``df_j``.
    """
    # Categorical features to encode; the label stays a single column.
    cat_features = [
        col for col in df_j.select_dtypes(include=['category']).columns
        if label is None or col != label
    ]

    # Drop original non-OHE columns; drop() returns a new frame, so the
    # caller's dataframe is left untouched.
    df = df_j.drop(cat_features, axis=1)

    # Skip encoding when there is nothing to encode.
    if cat_features:
        # dtype is pinned because pandas >= 2.0 returns bool dummies by
        # default, whereas this pipeline stores them as 0/1 integers.
        dummies = pd.get_dummies(df_j[cat_features], drop_first=True, dtype=np.uint8)
        df = pd.concat([df, dummies], axis=1)

    if label is not None:
        df[label] = df[label].astype(np.uint8)
    df['Year'] = df['Year'].astype(np.uint8)
    return df

In [84]:
#Encode from df_j rather than from df_ohe itself: recat_cols returned the very
#frame it was given, so df_j is the categorised, not-yet-encoded data. Reading
#from it keeps this cell re-runnable (df_ohe is no longer its own input).
df_ohe = ohe(df_j, label)
df_no_ohe = df_j.copy()

In [85]:
df_ohe[:1].shape

(1, 62)

In [86]:
df_ohe.shape

(33009, 62)

### Recategorize label MCQ160K to binary: 0 - No Bronchitis; 1 - Bronchitis

In [87]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Replace values of one column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the change is visible to the caller.
    name : str
        Column whose values are replaced.
    replace_dict : dict
        Mapping ``{old_value: new_value}`` passed to ``Series.replace``.

    Returns
    -------
    None
    """
    # Assign the result back instead of calling replace(inplace=True) on
    # df[name]: the chained in-place form acts on a temporary Series and does
    # not update df when pandas Copy-on-Write is active.
    df[name] = df[name].replace(to_replace=replace_dict)

In [88]:
#Recategorize to: 0 - No Bronchitis; 1 - Bronchitis
recategorize(df_ohe, label, {2:0})
recategorize(df_no_ohe, label, {2:0})

In [89]:
df_ohe.head()

Unnamed: 0,_id,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIDAGEYR,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BPXDI1,BPXSY1,BMXBMI,BMXHT,BMXWAIST,BMXWT,MCQ160K,HUQ010,HUQ050,Year,DMDBORN4_2.0,DMDCITZN_2.0,RIAGENDR_2.0,RIDRETH1_2.0,RIDRETH1_3.0,RIDRETH1_4.0,RIDRETH1_5.0,ALQ101_2.0,BPQ020_2.0,BPXPULS_2.0,PAQ635_2.0,PAQ650_2.0,PAQ665_2.0,SMQ680_2.0,SMD410_2.0,MCQ010_2.0,HID010_2.0,HUQ020_2.0,HUQ020_3.0,HUQ030_2.0,HUQ030_3.0,HUQ070_2.0
0,2.0,1.0,5.0,8.0,3408.044382,77.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,58.0,106.0,24.9,174.0,98.0,75.4,0,2.0,3.0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,0
1,5.0,3.0,4.0,11.0,10219.103963,49.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,82.0,122.0,29.1,178.3,99.9,92.5,0,2.0,3.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1
2,12.0,4.0,2.0,11.0,10149.365568,37.0,3304.0,7511.18,13280.15039,5.397605e-79,309.02,1292.81,309.57,1242.06,1.7,18.68,23.31,3348.31,435.55,2364.22,4607.26,130.02,178.85,923.8,2.43,8.36,3.31,2.4,70.96,17.47,108.0,182.0,30.62,180.0,112.8,99.2,1,4.0,2.0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,0,0,1
3,15.0,2.0,5.0,8.0,11437.714415,38.0,2478.0,3832.49,18684.987158,13.15,432.85,814.59,264.69,596.3,1.4,16.15,16.14,2573.14,266.8,1321.3,3171.15,82.66,127.03,720.5,1.38,4.64,1.79,1.72,36.11,11.77,68.0,106.0,26.68,174.9,86.7,81.6,0,3.0,3.0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1
4,16.0,1.0,1.0,1.0,1957.4984,85.0,236.0,2129.94,1304.207395,5.397605e-79,5.397605e-79,509.35,112.24,124.95,0.66,16.36,7.54,898.38,183.6,689.38,1458.25,39.62,33.87,1844.08,0.88,1.19,0.89,0.77,69.19,6.47,62.0,164.0,19.96,144.2,74.4,41.5,0,3.0,1.0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,0,0,1


In [90]:
df_no_ohe.head()

Unnamed: 0,_id,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,ALQ101,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BPQ020,BPXDI1,BPXPULS,BPXSY1,BMXBMI,BMXHT,BMXWAIST,BMXWT,PAQ635,PAQ650,PAQ665,SMAQUEX,SMQ680,SMD410,MCQ010,MCQ160K,HID010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070,Year
0,2.0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,1.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,2.0,58.0,1.0,106.0,24.9,174.0,98.0,75.4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0,1.0,2.0,2.0,1.0,3.0,1.0,0
1,5.0,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,1.0,82.0,1.0,122.0,29.1,178.3,99.9,92.5,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0,1.0,2.0,3.0,1.0,3.0,2.0,0
2,12.0,1.0,1.0,4.0,2.0,11.0,10149.365568,1.0,37.0,3.0,1.0,3304.0,7511.18,13280.15039,5.397605e-79,309.02,1292.81,309.57,1242.06,1.7,18.68,23.31,3348.31,435.55,2364.22,4607.26,130.02,178.85,923.8,2.43,8.36,3.31,2.4,70.96,17.47,1.0,108.0,1.0,182.0,30.62,180.0,112.8,99.2,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1,1.0,4.0,3.0,1.0,2.0,2.0,0
3,15.0,1.0,1.0,2.0,5.0,8.0,11437.714415,2.0,38.0,3.0,1.0,2478.0,3832.49,18684.987158,13.15,432.85,814.59,264.69,596.3,1.4,16.15,16.14,2573.14,266.8,1321.3,3171.15,82.66,127.03,720.5,1.38,4.64,1.79,1.72,36.11,11.77,2.0,68.0,1.0,106.0,26.68,174.9,86.7,81.6,1.0,1.0,1.0,2.0,1.0,1.0,2.0,0,1.0,3.0,3.0,1.0,3.0,2.0,0
4,16.0,1.0,1.0,1.0,1.0,1.0,1957.4984,2.0,85.0,4.0,2.0,236.0,2129.94,1304.207395,5.397605e-79,5.397605e-79,509.35,112.24,124.95,0.66,16.36,7.54,898.38,183.6,689.38,1458.25,39.62,33.87,1844.08,0.88,1.19,0.89,0.77,69.19,6.47,2.0,62.0,1.0,164.0,19.96,144.2,74.4,41.5,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0,1.0,3.0,3.0,1.0,1.0,2.0,0


## MongoDB Insertion

In [94]:
#Import MongoClient
from pymongo import MongoClient

#Create a MongoClient to run the MongoDB instance
client = MongoClient("localhost", 27017)

In [95]:
#Connect to existing database
db = client.NHANES_Q2
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES_Q2')

In [96]:
db.list_collection_names()

['HD', 'HD_no_ohe', 'CA_no_ohe', 'CA']

In [97]:
#Creating a collection
CL = db.CL
CL_no_ohe = db.CL_no_ohe

In [98]:
#If collections exist, then drop
#The collection names are fetched once; the former list_collection_names()
#calls inside the branches discarded their result and have been removed.
existing = db.list_collection_names()

if 'CL' in existing:
    CL.drop()

if 'CL_no_ohe' in existing:
    CL_no_ohe.drop()

In [99]:
#MongoDB keys can't contain '.'
#regex=False makes "." a literal dot; interpreted as a regular expression it
#would match every character of the column name.
df_ohe.columns = df_ohe.columns.str.replace(".", "_", regex=False)

In [100]:
df_ohe.head()

Unnamed: 0,_id,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIDAGEYR,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BPXDI1,BPXSY1,BMXBMI,BMXHT,BMXWAIST,BMXWT,MCQ160K,HUQ010,HUQ050,Year,DMDBORN4_2_0,DMDCITZN_2_0,RIAGENDR_2_0,RIDRETH1_2_0,RIDRETH1_3_0,RIDRETH1_4_0,RIDRETH1_5_0,ALQ101_2_0,BPQ020_2_0,BPXPULS_2_0,PAQ635_2_0,PAQ650_2_0,PAQ665_2_0,SMQ680_2_0,SMD410_2_0,MCQ010_2_0,HID010_2_0,HUQ020_2_0,HUQ020_3_0,HUQ030_2_0,HUQ030_3_0,HUQ070_2_0
0,2.0,1.0,5.0,8.0,3408.044382,77.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,58.0,106.0,24.9,174.0,98.0,75.4,0,2.0,3.0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,0
1,5.0,3.0,4.0,11.0,10219.103963,49.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,82.0,122.0,29.1,178.3,99.9,92.5,0,2.0,3.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1
2,12.0,4.0,2.0,11.0,10149.365568,37.0,3304.0,7511.18,13280.15039,5.397605e-79,309.02,1292.81,309.57,1242.06,1.7,18.68,23.31,3348.31,435.55,2364.22,4607.26,130.02,178.85,923.8,2.43,8.36,3.31,2.4,70.96,17.47,108.0,182.0,30.62,180.0,112.8,99.2,1,4.0,2.0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,0,0,1
3,15.0,2.0,5.0,8.0,11437.714415,38.0,2478.0,3832.49,18684.987158,13.15,432.85,814.59,264.69,596.3,1.4,16.15,16.14,2573.14,266.8,1321.3,3171.15,82.66,127.03,720.5,1.38,4.64,1.79,1.72,36.11,11.77,68.0,106.0,26.68,174.9,86.7,81.6,0,3.0,3.0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1
4,16.0,1.0,1.0,1.0,1957.4984,85.0,236.0,2129.94,1304.207395,5.397605e-79,5.397605e-79,509.35,112.24,124.95,0.66,16.36,7.54,898.38,183.6,689.38,1458.25,39.62,33.87,1844.08,0.88,1.19,0.89,0.77,69.19,6.47,62.0,164.0,19.96,144.2,74.4,41.5,0,3.0,1.0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,0,0,1


In [102]:
#Dataframe to dictionary
CL_dict = df_ohe.to_dict(orient='records')

CL_no_ohe_dict = df_no_ohe.to_dict(orient='records')

In [103]:
CL_dict[0]

{'ALQ101_2_0': 0,
 'BMXBMI': 24.9,
 'BMXHT': 174.0,
 'BMXWAIST': 98.0,
 'BMXWT': 75.4,
 'BPQ020_2_0': 1,
 'BPXDI1': 58.0,
 'BPXPULS_2_0': 0,
 'BPXSY1': 106.0,
 'DMDBORN4_2_0': 0,
 'DMDCITZN_2_0': 0,
 'DMDHHSIZ': 1.0,
 'DMDHREDU': 5.0,
 'DRD320GW': 5.397605346934027e-79,
 'DRDTSODI': 5710.030000000005,
 'DRX18YR': 3315.985398314134,
 'DRXTALCO': 5.397605346934027e-79,
 'DRXTCAFF': 530.45,
 'DRXTCALC': 925.37,
 'DRXTCARB': 350.37000000000006,
 'DRXTCHOL': 313.95,
 'DRXTCOPP': 2.08,
 'DRXTFIBE': 36.99,
 'DRXTIRON': 37.29,
 'DRXTKCAL': 2463.0,
 'DRXTMAGN': 502.25,
 'DRXTPHOS': 1974.57,
 'DRXTPOTA': 4672.480000000001,
 'DRXTPROT': 123.16,
 'DRXTTFAT': 71.95,
 'DRXTVARE': 923.91,
 'DRXTVB1': 2.11,
 'DRXTVB12': 8.68,
 'DRXTVB2': 3.25,
 'DRXTVB6': 2.9,
 'DRXTVC': 119.12,
 'DRXTZINC': 41.61,
 'HID010_2_0': 0,
 'HUQ010': 2.0,
 'HUQ020_2_0': 1,
 'HUQ020_3_0': 0,
 'HUQ030_2_0': 0,
 'HUQ030_3_0': 0,
 'HUQ050': 3.0,
 'HUQ070_2_0': 0,
 'INDFMINC': 8.0,
 'MCQ010_2_0': 1,
 'MCQ160K': 0,
 'MEC18YR': 340

In [104]:
#Insert collection
CL.insert_many(CL_dict)

<pymongo.results.InsertManyResult at 0x1277002c8>

In [105]:
CL_no_ohe_dict[0]

{'ALQ101': 1.0,
 'BMXBMI': 24.9,
 'BMXHT': 174.0,
 'BMXWAIST': 98.0,
 'BMXWT': 75.4,
 'BPQ020': 2.0,
 'BPXDI1': 58.0,
 'BPXPULS': 1.0,
 'BPXSY1': 106.0,
 'DMDBORN4': 1.0,
 'DMDCITZN': 1.0,
 'DMDHHSIZ': 1.0,
 'DMDHREDU': 5.0,
 'DRD320GW': 5.397605346934027e-79,
 'DRDTSODI': 5710.030000000005,
 'DRX18YR': 3315.985398314134,
 'DRXTALCO': 5.397605346934027e-79,
 'DRXTCAFF': 530.45,
 'DRXTCALC': 925.37,
 'DRXTCARB': 350.37000000000006,
 'DRXTCHOL': 313.95,
 'DRXTCOPP': 2.08,
 'DRXTFIBE': 36.99,
 'DRXTIRON': 37.29,
 'DRXTKCAL': 2463.0,
 'DRXTMAGN': 502.25,
 'DRXTPHOS': 1974.57,
 'DRXTPOTA': 4672.480000000001,
 'DRXTPROT': 123.16,
 'DRXTTFAT': 71.95,
 'DRXTVARE': 923.91,
 'DRXTVB1': 2.11,
 'DRXTVB12': 8.68,
 'DRXTVB2': 3.25,
 'DRXTVB6': 2.9,
 'DRXTVC': 119.12,
 'DRXTZINC': 41.61,
 'HID010': 1.0,
 'HUQ010': 2.0,
 'HUQ020': 2.0,
 'HUQ030': 1.0,
 'HUQ050': 3.0,
 'HUQ070': 1.0,
 'INDFMINC': 8.0,
 'MCQ010': 2.0,
 'MCQ160K': 0,
 'MEC18YR': 3408.0443815555554,
 'PAQ635': 2.0,
 'PAQ650': 2.0,
 'PAQ66

In [106]:
CL_no_ohe.insert_many(CL_no_ohe_dict)

<pymongo.results.InsertManyResult at 0x12791f5c8>

In [107]:
db.list_collection_names()

['CL', 'CL_no_ohe', 'HD', 'HD_no_ohe', 'CA_no_ohe', 'CA']