In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [2]:
#To display all columns in Jupyter Notebooks
pd.set_option('display.max_columns', 500)

In [3]:
#Import MongoClient
from pymongo import MongoClient

#Create a MongoClient to run the MongoDB instance
client = MongoClient('localhost', 27017)

In [4]:
#Connect to existing database
db = client.NHANES

In [5]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [6]:
col = db.list_collection_names()
col.sort()
col

['alq',
 'bmx',
 'bpq',
 'bpx',
 'demo',
 'demo_p',
 'descr',
 'diq',
 'drxtot',
 'hiq',
 'huq',
 'mcq_a',
 'mcq_b',
 'mcq_c',
 'mcq_h',
 'paq',
 'rdq',
 'smq',
 'smqfam',
 'tchol',
 'whq']

In [7]:
#Collections
demo = db.demo
alq = db.alq
diq = db.diq
drxtot = db.drxtot
bpq = db.bpq
bpx = db.bpx
tchol = db.tchol
bmx = db.bmx
paq = db.paq
smq = db.smq
smqfam = db.smqfam

mcq_a = db.mcq_a #Asthma
mcq_h = db.mcq_h #Heart Disease

hiq = db.hiq
huq = db.huq
whq = db.whq

descr = db.descr

In [8]:
#Create dataframes from database
#Each find() is called without a filter, so every document of the collection
#is materialised into a list and then into a DataFrame held in memory.
df_demo = pd.DataFrame(list(demo.find()))
df_alq = pd.DataFrame(list(alq.find()))
df_diq = pd.DataFrame(list(diq.find()))
df_drxtot = pd.DataFrame(list(drxtot.find()))
df_bpq = pd.DataFrame(list(bpq.find()))
df_bpx = pd.DataFrame(list(bpx.find()))
df_tchol = pd.DataFrame(list(tchol.find()))
df_bmx = pd.DataFrame(list(bmx.find()))
df_paq = pd.DataFrame(list(paq.find()))
df_smq = pd.DataFrame(list(smq.find()))
df_smqfam = pd.DataFrame(list(smqfam.find()))

df_mcq_a = pd.DataFrame(list(mcq_a.find()))
df_mcq_h = pd.DataFrame(list(mcq_h.find()))
df_hiq = pd.DataFrame(list(hiq.find()))
df_huq = pd.DataFrame(list(huq.find()))
df_whq = pd.DataFrame(list(whq.find()))

#Lookup table of collection descriptions (joined onto the summary table later)
df_descr = pd.DataFrame(list(descr.find()))

In [9]:
#All records
dfs = [df_demo, df_alq, df_diq, df_drxtot, df_bpq, df_bpx, df_tchol, df_bmx, df_paq, 
       df_smq, df_smqfam, df_mcq_a, df_mcq_h, df_hiq, df_huq, df_whq]

In [10]:
names = ['demo', 'alq', 'diq', 'drxtot', 'bpq', 'bpx', 'tchol', 'bmx', 'paq',
        'smq', 'smqfam', 'mcq_a', 'mcq_h', 'hiq', 'huq', 'whq']

In [11]:
data_dict = dict(zip(names,dfs))

## Functions:

In [12]:
#Function for inner join
def innerjoin_df(dfs_list, join_on):
    """Inner-join a sequence of DataFrames on shared key column(s).

    Parameters
    ----------
    dfs_list : list of pandas.DataFrame
        Frames to combine. The first frame seeds the result and the
        remaining ones are merged into it from left to right.
    join_on : str or list of str
        Column name(s) forwarded as ``on=`` to every merge.

    Returns
    -------
    pandas.DataFrame
        Rows whose key(s) appear in every frame. When ``dfs_list`` holds a
        single frame, that same object is returned unchanged.
    """
    merged = dfs_list[0]
    remaining = dfs_list[1:]
    for frame in remaining:
        merged = pd.merge(merged, frame, how='inner', on=join_on)
    return merged

In [13]:
#Function for getting info from list of collections
#Look at records and features for each
def get_info(dfs, names):
    """Summarise the size of each DataFrame in a small table.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames to describe.
    names : list of str
        Label for each frame, position-aligned with ``dfs``. Extra trailing
        names are ignored; too few names raises ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        One row per frame with columns ``_id`` (the label), ``Records``
        (row count) and ``Features`` (column count). An empty ``dfs`` yields
        an empty table that still carries the three columns.
    """
    # Build plain (label, n_rows, n_cols) records and let the constructor name
    # the columns: this also works for empty input and keeps one row per
    # frame even if a label is repeated.
    rows = [(names[i],) + tuple(frame.shape) for i, frame in enumerate(dfs)]
    return pd.DataFrame(rows, columns=['_id', 'Records', 'Features'])

In [14]:
info = get_info(dfs, names)
info_join = innerjoin_df([info, df_descr], ['_id'])
info_join = info_join.sort_values(by='Records', ascending=False)
info_join[:10]

Unnamed: 0,_id,Records,Features,Description
13,hiq,91399,3,Health Insurance
10,smqfam,90825,3,Household Smoking
2,diq,87852,3,Diabetes
11,mcq_a,87808,3,Asthma
14,huq,87707,7,Hospital Utilization
0,demo,81635,11,Demographics
3,drxtot,80241,26,Dietary
7,bmx,76742,6,Body Measures
8,paq,64823,5,Physical Activity
5,bpx,62273,5,Blood Pressure - Measures


### Select data to use

In [15]:
#Get relevant data
def get_reldata(df):
    """Return the DataFrames registered in ``data_dict`` for the given names.

    Parameters
    ----------
    df : iterable of str
        Collection names (despite the parameter name, this is iterated as a
        sequence of keys, e.g. a Series of ``_id`` values).

    Returns
    -------
    list of pandas.DataFrame
        ``data_dict[name]`` for every name, in iteration order. A name that
        is not a key of ``data_dict`` raises ``KeyError``.
    """
    return [data_dict[key] for key in df]

In [16]:
#Selected risk factors for hospital utilization
#Takes the DataFrames for the first 8 collection names in info_join
#(info_join is sorted by record count, descending).
#NOTE: this rebinds `dfs`, which previously held all 16 DataFrames aligned with
#`names`; re-running get_info(dfs, names) after this cell would pair names with
#the wrong frames.
dfs = get_reldata(info_join['_id'][:8])

In [17]:
df_j = innerjoin_df(dfs, ['_id', 'Year'])
df_j.shape

(64583, 48)

In [18]:
df_j.head()

Unnamed: 0,HID010,Year,_id,SMD410,DIQ010,MCQ010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BMXBMI,BMXHT,BMXWAIST,BMXWT
0,1.0,1999-2000,1.0,2.0,2.0,2.0,1.0,3.0,1.0,3.0,1.0,1.0,1.0,3.0,3.0,3.0,990.268132,2.0,2.0,4.0,243.38,1621.35,1348.028592,5.397605e-79,5.397605e-79,541.3,250.36,46.55,0.41,7.41,12.18,1358.88,126.8,503.75,1387.7,31.96,27.24,604.33,1.5,2.42,1.55,1.43,220.64,4.69,14.9,91.6,45.7,12.5
1,1.0,1999-2000,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,24.9,174.0,98.0,75.4
2,1.0,1999-2000,3.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,4.0,4.0,6.0,4724.103694,2.0,10.0,3.0,531.0,1676.51,3525.877253,5.397605e-79,35.4,415.27,233.63,86.22,0.85,11.16,9.78,1517.69,172.58,674.82,1487.16,40.19,49.94,885.72,1.21,1.59,1.43,1.37,57.95,5.65,17.63,136.6,64.7,32.9
3,1.0,1999-2000,5.0,2.0,2.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,29.1,178.3,99.9,92.5
4,1.0,1999-2000,6.0,2.0,2.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,1.0,2.0,4.0,3.0,4074.958535,2.0,19.0,5.0,1711.0,949.52,8884.256881,5.397605e-79,267.23,382.23,202.11,38.45,0.81,6.24,10.59,1113.66,136.78,571.68,1684.5,24.63,25.15,389.86,0.72,1.67,0.9,0.97,37.31,6.83,22.56,162.0,81.6,59.2


## Reorder columns

In [19]:
#Get a list of columns
cols = list(df_j)

In [20]:
#Move '_id' column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('_id')))

#Move 'Year' column to back of list using index, pop and insert
#(after the pop, cols is one element shorter than df_j.columns, so
#len(df_j.columns)-1 is the end of the list and the insert acts as an append)
cols.insert(len(df_j.columns)-1, cols.pop(cols.index('Year')))

In [21]:
#Reorder dataframe
df_j = df_j.loc[:, cols]
df_j.head()

Unnamed: 0,_id,HID010,SMD410,DIQ010,MCQ010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BMXBMI,BMXHT,BMXWAIST,BMXWT,Year
0,1.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,3.0,1.0,1.0,1.0,3.0,3.0,3.0,990.268132,2.0,2.0,4.0,243.38,1621.35,1348.028592,5.397605e-79,5.397605e-79,541.3,250.36,46.55,0.41,7.41,12.18,1358.88,126.8,503.75,1387.7,31.96,27.24,604.33,1.5,2.42,1.55,1.43,220.64,4.69,14.9,91.6,45.7,12.5,1999-2000
1,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,24.9,174.0,98.0,75.4,1999-2000
2,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,4.0,4.0,6.0,4724.103694,2.0,10.0,3.0,531.0,1676.51,3525.877253,5.397605e-79,35.4,415.27,233.63,86.22,0.85,11.16,9.78,1517.69,172.58,674.82,1487.16,40.19,49.94,885.72,1.21,1.59,1.43,1.37,57.95,5.65,17.63,136.6,64.7,32.9,1999-2000
3,5.0,1.0,2.0,2.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,29.1,178.3,99.9,92.5,1999-2000
4,6.0,1.0,2.0,2.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,1.0,2.0,4.0,3.0,4074.958535,2.0,19.0,5.0,1711.0,949.52,8884.256881,5.397605e-79,267.23,382.23,202.11,38.45,0.81,6.24,10.59,1113.66,136.78,571.68,1684.5,24.63,25.15,389.86,0.72,1.67,0.9,0.97,37.31,6.83,22.56,162.0,81.6,59.2,1999-2000


## Remap years to number categories

In [22]:
di = {"1999-2000": 0, "2001-2002": 1, "2003-2004": 2, "2005-2006": 3, "2007-2008": 4, 
      "2009-2010": 5, "2011-2012": 6, "2013-2014": 7, "2015-2016": 8}

In [23]:
#Map categorical years to numerical
#Series.map yields NaN for any label that is not a key of `di`; the isnull
#check in a following cell would reveal such rows.
#Not idempotent: re-running this cell maps the already-numeric codes to NaN.
df_j['Year'] = df_j['Year'].map(di)

In [24]:
df_j.head()

Unnamed: 0,_id,HID010,SMD410,DIQ010,MCQ010,HUQ010,HUQ020,HUQ030,HUQ050,HUQ070,DMDBORN4,DMDCITZN,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIAGENDR,RIDAGEYR,RIDRETH1,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BMXBMI,BMXHT,BMXWAIST,BMXWT,Year
0,1.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,3.0,1.0,1.0,1.0,3.0,3.0,3.0,990.268132,2.0,2.0,4.0,243.38,1621.35,1348.028592,5.397605e-79,5.397605e-79,541.3,250.36,46.55,0.41,7.41,12.18,1358.88,126.8,503.75,1387.7,31.96,27.24,604.33,1.5,2.42,1.55,1.43,220.64,4.69,14.9,91.6,45.7,12.5,0
1,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,5.0,8.0,3408.044382,1.0,77.0,3.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,24.9,174.0,98.0,75.4,0
2,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,4.0,4.0,6.0,4724.103694,2.0,10.0,3.0,531.0,1676.51,3525.877253,5.397605e-79,35.4,415.27,233.63,86.22,0.85,11.16,9.78,1517.69,172.58,674.82,1487.16,40.19,49.94,885.72,1.21,1.59,1.43,1.37,57.95,5.65,17.63,136.6,64.7,32.9,0
3,5.0,1.0,2.0,2.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,1.0,3.0,4.0,11.0,10219.103963,1.0,49.0,3.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,29.1,178.3,99.9,92.5,0
4,6.0,1.0,2.0,2.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,1.0,2.0,4.0,3.0,4074.958535,2.0,19.0,5.0,1711.0,949.52,8884.256881,5.397605e-79,267.23,382.23,202.11,38.45,0.81,6.24,10.59,1113.66,136.78,571.68,1684.5,24.63,25.15,389.86,0.72,1.67,0.9,0.97,37.31,6.83,22.56,162.0,81.6,59.2,0


In [25]:
#Check if any NaN
df_j.isnull().values.any()

False

## Recategorize HUQ050 - Hospital Utilization to Binary

Hospital utilization is too difficult to predict with 5 categories.  
The number of times healthcare was received over the past year is remapped to a binary label:
1. 0 - One or more
2. 1 - None

In [26]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Recode the values of one column of ``df``, modifying ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; column ``name`` is overwritten.
    name : str
        Column whose values are recoded.
    replace_dict : dict
        Mapping ``old value -> new value``. All pairs are applied in a single
        pass, so a swap such as ``{0: 1, 1: 0}`` works as expected. Values
        that are not keys of the mapping are left untouched.

    Returns
    -------
    None
    """
    # Assign the result back instead of calling replace(inplace=True) on
    # df[name]: the chained in-place form operates on a temporary Series and
    # leaves df unchanged when pandas Copy-on-Write is active.
    df[name] = df[name].replace(to_replace=replace_dict)

In [27]:
recategorize(df_j, 'HUQ050', {2:1, 3:1, 4:1, 5:1})

#### Switch the values 0 and 1: 0 = one or more visits, 1 = none

In [28]:
recategorize(df_j, 'HUQ050', {0:1, 1:0})

## Categorize features that need to be One Hot Encoded

In [29]:
df_j.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64583 entries, 0 to 64582
Data columns (total 48 columns):
_id         64583 non-null float64
HID010      64583 non-null float64
SMD410      64583 non-null float64
DIQ010      64583 non-null float64
MCQ010      64583 non-null float64
HUQ010      64583 non-null float64
HUQ020      64583 non-null float64
HUQ030      64583 non-null float64
HUQ050      64583 non-null float64
HUQ070      64583 non-null float64
DMDBORN4    64583 non-null float64
DMDCITZN    64583 non-null float64
DMDHHSIZ    64583 non-null float64
DMDHREDU    64583 non-null float64
INDFMINC    64583 non-null float64
MEC18YR     64583 non-null float64
RIAGENDR    64583 non-null float64
RIDAGEYR    64583 non-null float64
RIDRETH1    64583 non-null float64
DRD320GW    64583 non-null float64
DRDTSODI    64583 non-null float64
DRX18YR     64583 non-null float64
DRXTALCO    64583 non-null float64
DRXTCAFF    64583 non-null float64
DRXTCALC    64583 non-null float64
DRXTCARB    6458

In [30]:
#Change columns to category
#Columns to remove: 
#DRX18YR - 18 Year weight
#MEC18YR - 18 year Weight
#Year
#_id

#Names of columns to be treated as categorical.
#The list is a superset of what df_j holds (e.g. ALQ101, BPQ020, PAQ635 and
#WHQ030 are not among its columns); recat_cols only converts the names that
#are actually present in the column list it is given.
cat_cols = ['DMDBORN4',
            'DMDCITZN',
            'RIAGENDR',
            'RIDRETH1',
            'ALQ101',
            'DIQ010',
            'BPQ020',
            'BPXPULS',
            'PAQ635',
            'PAQ650',
            'PAQ665',
            'SMAQUEX',
            'SMQ680',
            'SMD410',
            'MCQ010',
            'MCQ160C',
            'HID010',
            'HUQ020',
            'HUQ030',
            'HUQ070',
            'WHQ030',
            'WHQ040']

def recat_cols(df, col_names):
    """Cast the listed columns that appear in ``cat_cols`` to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    col_names : iterable of str
        Candidate column names; only those also found in the notebook-level
        ``cat_cols`` list are converted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).
    """
    selected = [column for column in col_names if column in cat_cols]
    for column in selected:
        df[column] = df[column].astype('category')
    return df

col_names = df_j.columns
df_ohe = recat_cols(df_j, col_names)

In [31]:
df_j.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64583 entries, 0 to 64582
Data columns (total 48 columns):
_id         64583 non-null float64
HID010      64583 non-null category
SMD410      64583 non-null category
DIQ010      64583 non-null category
MCQ010      64583 non-null category
HUQ010      64583 non-null float64
HUQ020      64583 non-null category
HUQ030      64583 non-null category
HUQ050      64583 non-null float64
HUQ070      64583 non-null category
DMDBORN4    64583 non-null category
DMDCITZN    64583 non-null category
DMDHHSIZ    64583 non-null float64
DMDHREDU    64583 non-null float64
INDFMINC    64583 non-null float64
MEC18YR     64583 non-null float64
RIAGENDR    64583 non-null category
RIDAGEYR    64583 non-null float64
RIDRETH1    64583 non-null category
DRD320GW    64583 non-null float64
DRDTSODI    64583 non-null float64
DRX18YR     64583 non-null float64
DRXTALCO    64583 non-null float64
DRXTCAFF    64583 non-null float64
DRXTCALC    64583 non-null float64
DRXTC

In [33]:
df_ohe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64583 entries, 0 to 64582
Data columns (total 48 columns):
_id         64583 non-null float64
HID010      64583 non-null category
SMD410      64583 non-null category
DIQ010      64583 non-null category
MCQ010      64583 non-null category
HUQ010      64583 non-null float64
HUQ020      64583 non-null category
HUQ030      64583 non-null category
HUQ050      64583 non-null float64
HUQ070      64583 non-null category
DMDBORN4    64583 non-null category
DMDCITZN    64583 non-null category
DMDHHSIZ    64583 non-null float64
DMDHREDU    64583 non-null float64
INDFMINC    64583 non-null float64
MEC18YR     64583 non-null float64
RIAGENDR    64583 non-null category
RIDAGEYR    64583 non-null float64
RIDRETH1    64583 non-null category
DRD320GW    64583 non-null float64
DRDTSODI    64583 non-null float64
DRX18YR     64583 non-null float64
DRXTALCO    64583 non-null float64
DRXTCAFF    64583 non-null float64
DRXTCALC    64583 non-null float64
DRXTC

## One Hot Encoding Categories

In [34]:
#Function to One Hot Encode Categories
def ohe(df_j, label=None):
    """One-hot encode every ``category`` column of ``df_j``.

    Parameters
    ----------
    df_j : pandas.DataFrame
        Input frame. It is not modified.
    label : str, optional
        Name of the target column. It is never expanded into dummies, even
        if it is categorical, and it is cast to ``uint8`` in the result.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns of ``df_j`` followed by the dummy columns.
        The first level of each encoded variable is dropped
        (``drop_first=True``). Dummies are ``uint8``; a ``Year`` column, when
        present, is cast to ``uint8`` as well.
    """
    cat_frame = df_j.select_dtypes(include=['category'])
    if label is not None:
        # The label may not be categorical at all (then it is simply not part
        # of cat_frame), so a missing column must not raise here.
        cat_frame = cat_frame.drop(columns=[label], errors='ignore')
    encoded_cols = list(cat_frame.columns)

    # drop() returns a new frame, so the caller's data stays untouched.
    df = df_j.drop(columns=encoded_cols)
    if encoded_cols:
        # Pin the dtype: the default dummy dtype differs between pandas
        # versions (uint8 vs. bool), and 0/1 integers are what gets stored.
        dummies = pd.get_dummies(cat_frame, drop_first=True, dtype=np.uint8)
        df = pd.concat([df, dummies], axis=1)

    if label is not None:
        df[label] = df[label].astype(np.uint8)
    if 'Year' in df.columns:
        df['Year'] = df['Year'].astype(np.uint8)
    return df

In [35]:
df_ohe = ohe(df_ohe)
df_no_ohe = df_j.copy()

In [36]:
df_ohe[:1].shape

(1, 54)

In [37]:
df_ohe.shape

(64583, 54)

In [38]:
df_ohe.head()

Unnamed: 0,_id,HUQ010,HUQ050,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIDAGEYR,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BMXBMI,BMXHT,BMXWAIST,BMXWT,Year,HID010_2.0,SMD410_2.0,DIQ010_2.0,DIQ010_3.0,MCQ010_2.0,HUQ020_2.0,HUQ020_3.0,HUQ030_2.0,HUQ030_3.0,HUQ070_2.0,DMDBORN4_2.0,DMDCITZN_2.0,RIAGENDR_2.0,RIDRETH1_2.0,RIDRETH1_3.0,RIDRETH1_4.0,RIDRETH1_5.0
0,1.0,1.0,0.0,3.0,3.0,3.0,990.268132,2.0,243.38,1621.35,1348.028592,5.397605e-79,5.397605e-79,541.3,250.36,46.55,0.41,7.41,12.18,1358.88,126.8,503.75,1387.7,31.96,27.24,604.33,1.5,2.42,1.55,1.43,220.64,4.69,14.9,91.6,45.7,12.5,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0
1,2.0,2.0,0.0,1.0,5.0,8.0,3408.044382,77.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,24.9,174.0,98.0,75.4,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0
2,3.0,1.0,0.0,4.0,4.0,6.0,4724.103694,10.0,531.0,1676.51,3525.877253,5.397605e-79,35.4,415.27,233.63,86.22,0.85,11.16,9.78,1517.69,172.58,674.82,1487.16,40.19,49.94,885.72,1.21,1.59,1.43,1.37,57.95,5.65,17.63,136.6,64.7,32.9,0,0,0,1,0,1,0,0,0,0,1,1,1,1,0,1,0,0
3,5.0,2.0,0.0,3.0,4.0,11.0,10219.103963,49.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,29.1,178.3,99.9,92.5,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0
4,6.0,2.0,0.0,2.0,4.0,3.0,4074.958535,19.0,1711.0,949.52,8884.256881,5.397605e-79,267.23,382.23,202.11,38.45,0.81,6.24,10.59,1113.66,136.78,571.68,1684.5,24.63,25.15,389.86,0.72,1.67,0.9,0.97,37.31,6.83,22.56,162.0,81.6,59.2,0,0,1,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1


## MongoDB Insertion

In [39]:
#Import MongoClient
#(MongoClient was already imported and a client created in an earlier cell;
#this cell repeats both before the output database is selected)
from pymongo import MongoClient

#Create a MongoClient connected to the MongoDB instance on localhost:27017
client = MongoClient("localhost", 27017)

In [40]:
#Connect to existing database
db = client.NHANES_Q
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES_Q')

In [41]:
db.list_collection_names()

['HU', 'HU_no_ohe']

In [42]:
#Creating a collection
HU = db.HU
HU_no_ohe = db.HU_no_ohe

In [43]:
#If collections exist, then drop them so the inserts below start from empty
#collections (re-running the notebook would otherwise hit duplicate _id values)
existing_collections = db.list_collection_names()
for coll_name, coll in (('HU', HU), ('HU_no_ohe', HU_no_ohe)):
    if coll_name in existing_collections:
        coll.drop()

In [44]:
#MongoDB keys can't contain '.'
#regex=False makes "." a literal dot regardless of the pandas version's default
#for `regex` (as a regular expression "." would match every character)
df_ohe.columns = df_ohe.columns.str.replace(".", "_", regex=False)

In [45]:
df_ohe.head()

Unnamed: 0,_id,HUQ010,HUQ050,DMDHHSIZ,DMDHREDU,INDFMINC,MEC18YR,RIDAGEYR,DRD320GW,DRDTSODI,DRX18YR,DRXTALCO,DRXTCAFF,DRXTCALC,DRXTCARB,DRXTCHOL,DRXTCOPP,DRXTFIBE,DRXTIRON,DRXTKCAL,DRXTMAGN,DRXTPHOS,DRXTPOTA,DRXTPROT,DRXTTFAT,DRXTVARE,DRXTVB1,DRXTVB12,DRXTVB2,DRXTVB6,DRXTVC,DRXTZINC,BMXBMI,BMXHT,BMXWAIST,BMXWT,Year,HID010_2_0,SMD410_2_0,DIQ010_2_0,DIQ010_3_0,MCQ010_2_0,HUQ020_2_0,HUQ020_3_0,HUQ030_2_0,HUQ030_3_0,HUQ070_2_0,DMDBORN4_2_0,DMDCITZN_2_0,RIAGENDR_2_0,RIDRETH1_2_0,RIDRETH1_3_0,RIDRETH1_4_0,RIDRETH1_5_0
0,1.0,1.0,0.0,3.0,3.0,3.0,990.268132,2.0,243.38,1621.35,1348.028592,5.397605e-79,5.397605e-79,541.3,250.36,46.55,0.41,7.41,12.18,1358.88,126.8,503.75,1387.7,31.96,27.24,604.33,1.5,2.42,1.55,1.43,220.64,4.69,14.9,91.6,45.7,12.5,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0
1,2.0,2.0,0.0,1.0,5.0,8.0,3408.044382,77.0,5.397605e-79,5710.03,3315.985398,5.397605e-79,530.45,925.37,350.37,313.95,2.08,36.99,37.29,2463.0,502.25,1974.57,4672.48,123.16,71.95,923.91,2.11,8.68,3.25,2.9,119.12,41.61,24.9,174.0,98.0,75.4,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0
2,3.0,1.0,0.0,4.0,4.0,6.0,4724.103694,10.0,531.0,1676.51,3525.877253,5.397605e-79,35.4,415.27,233.63,86.22,0.85,11.16,9.78,1517.69,172.58,674.82,1487.16,40.19,49.94,885.72,1.21,1.59,1.43,1.37,57.95,5.65,17.63,136.6,64.7,32.9,0,0,0,1,0,1,0,0,0,0,1,1,1,1,0,1,0,0
3,5.0,2.0,0.0,3.0,4.0,11.0,10219.103963,49.0,1298.0,3756.36,13105.246918,34.56,5.397605e-79,1626.38,253.98,180.57,1.95,17.28,27.22,2658.14,367.72,1811.55,3743.15,97.13,114.52,1298.44,2.62,6.66,3.05,2.92,112.19,10.17,29.1,178.3,99.9,92.5,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0
4,6.0,2.0,0.0,2.0,4.0,3.0,4074.958535,19.0,1711.0,949.52,8884.256881,5.397605e-79,267.23,382.23,202.11,38.45,0.81,6.24,10.59,1113.66,136.78,571.68,1684.5,24.63,25.15,389.86,0.72,1.67,0.9,0.97,37.31,6.83,22.56,162.0,81.6,59.2,0,0,1,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1


In [46]:
#Dataframe to dictionary
HU_dict = df_ohe.to_dict(orient='records')

HU_no_ohe_dict = df_no_ohe.to_dict(orient='records')

In [47]:
HU_dict[0]

{'BMXBMI': 14.9,
 'BMXHT': 91.6,
 'BMXWAIST': 45.7,
 'BMXWT': 12.5,
 'DIQ010_2_0': 1,
 'DIQ010_3_0': 0,
 'DMDBORN4_2_0': 0,
 'DMDCITZN_2_0': 0,
 'DMDHHSIZ': 3.0,
 'DMDHREDU': 3.0,
 'DRD320GW': 243.38,
 'DRDTSODI': 1621.3500000000006,
 'DRX18YR': 1348.0285918620227,
 'DRXTALCO': 5.397605346934027e-79,
 'DRXTCAFF': 5.397605346934027e-79,
 'DRXTCALC': 541.3,
 'DRXTCARB': 250.36,
 'DRXTCHOL': 46.55,
 'DRXTCOPP': 0.41,
 'DRXTFIBE': 7.41,
 'DRXTIRON': 12.18,
 'DRXTKCAL': 1358.88,
 'DRXTMAGN': 126.8,
 'DRXTPHOS': 503.75,
 'DRXTPOTA': 1387.7,
 'DRXTPROT': 31.96,
 'DRXTTFAT': 27.24,
 'DRXTVARE': 604.33,
 'DRXTVB1': 1.5,
 'DRXTVB12': 2.42,
 'DRXTVB2': 1.55,
 'DRXTVB6': 1.4300000000000013,
 'DRXTVC': 220.64,
 'DRXTZINC': 4.69,
 'HID010_2_0': 0,
 'HUQ010': 1.0,
 'HUQ020_2_0': 0,
 'HUQ020_3_0': 1,
 'HUQ030_2_0': 0,
 'HUQ030_3_0': 0,
 'HUQ050': 0.0,
 'HUQ070_2_0': 0,
 'INDFMINC': 3.0,
 'MCQ010_2_0': 1,
 'MEC18YR': 990.2681319999998,
 'RIAGENDR_2_0': 1,
 'RIDAGEYR': 2.0,
 'RIDRETH1_2_0': 0,
 'RIDRETH

In [48]:
#Insert collection
HU.insert_many(HU_dict)

<pymongo.results.InsertManyResult at 0x1250e9ec8>

In [49]:
HU_no_ohe_dict[0]

{'BMXBMI': 14.9,
 'BMXHT': 91.6,
 'BMXWAIST': 45.7,
 'BMXWT': 12.5,
 'DIQ010': 2.0,
 'DMDBORN4': 1.0,
 'DMDCITZN': 1.0,
 'DMDHHSIZ': 3.0,
 'DMDHREDU': 3.0,
 'DRD320GW': 243.38,
 'DRDTSODI': 1621.3500000000006,
 'DRX18YR': 1348.0285918620227,
 'DRXTALCO': 5.397605346934027e-79,
 'DRXTCAFF': 5.397605346934027e-79,
 'DRXTCALC': 541.3,
 'DRXTCARB': 250.36,
 'DRXTCHOL': 46.55,
 'DRXTCOPP': 0.41,
 'DRXTFIBE': 7.41,
 'DRXTIRON': 12.18,
 'DRXTKCAL': 1358.88,
 'DRXTMAGN': 126.8,
 'DRXTPHOS': 503.75,
 'DRXTPOTA': 1387.7,
 'DRXTPROT': 31.96,
 'DRXTTFAT': 27.24,
 'DRXTVARE': 604.33,
 'DRXTVB1': 1.5,
 'DRXTVB12': 2.42,
 'DRXTVB2': 1.55,
 'DRXTVB6': 1.4300000000000013,
 'DRXTVC': 220.64,
 'DRXTZINC': 4.69,
 'HID010': 1.0,
 'HUQ010': 1.0,
 'HUQ020': 3.0,
 'HUQ030': 1.0,
 'HUQ050': 0.0,
 'HUQ070': 1.0,
 'INDFMINC': 3.0,
 'MCQ010': 2.0,
 'MEC18YR': 990.2681319999998,
 'RIAGENDR': 2.0,
 'RIDAGEYR': 2.0,
 'RIDRETH1': 4.0,
 'SMD410': 2.0,
 'Year': 0,
 '_id': 1.0}

In [50]:
HU_no_ohe.insert_many(HU_no_ohe_dict)

<pymongo.results.InsertManyResult at 0x1250eb848>

In [51]:
db.list_collection_names()

['HU', 'HU_no_ohe']