In [20]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [21]:
# Root directory of the NHANES CSV exports; the glob patterns are appended to it.
# Use the './' prefix when running via run_notebooks.py.
# Use the '../' prefix when running this notebook directly from Jupyter:
#file_beg = '../NHANES-Downloader/data/csv_data/'

file_beg = './NHANES-Downloader/data/csv_data/'

In [22]:
# Collect the CSV paths of every survey cycle: one list of paths per two-year cycle.
cycle_patterns = (
    '1999-2000/*/*.csv',
    '2001-2002/*/*.csv',
    '2003-2004/*/*.csv',
    '2005-2006/*/*.csv',
    '2007-2008/*/*.csv',
    '2009-2010/*/*.csv',
    '2011-2012/*/*.csv',
    '2013-2014/*/*.csv',
    '2015-2016/*/*.csv',
)
(files1, files2, files3, files4, files5,
 files6, files7, files8, files9) = (
    glob.glob(file_beg + pattern) for pattern in cycle_patterns
)

In [23]:
# Per-cycle path lists, ordered chronologically (1999-2000 first, 2015-2016 last).
file_list = [
    files1,
    files2,
    files3,
    files4,
    files5,
    files6,
    files7,
    files8,
    files9,
]

In [24]:
# Sort each cycle's paths in place, then map position -> path so that a file
# can be looked up by its index within the sorted list.
for cycle_files in file_list:
    cycle_files.sort()
file_list_dict = [dict(enumerate(cycle_files)) for cycle_files in file_list]

In [25]:
# Position of the file to load within each cycle's sorted file list, one entry
# per cycle in the same order as file_list.
# NOTE(review): presumably these point at the blood pressure questionnaire (BPQ)
# CSVs; the positions shift whenever a file is added to or removed from a cycle
# directory, so verify them against the actual directory listing.
bpq_indx = [76, 89, 103, 95, 92, 99, 90, 110, 64]

In [26]:
# Load one CSV per survey cycle, keyed by the cycle's position in file_list.
# A plain dict is used instead of defaultdict(int): looking up a cycle that was
# never loaded now raises KeyError rather than silently returning 0.
dfs = {
    i: pd.read_csv(file_list_dict[i][file_pos])
    for i, file_pos in enumerate(bpq_indx)
}

## Important Variables

In [27]:
# Columns kept from every cycle (1999-2016): SEQN and BPQ020.
var_names = ["SEQN", "BPQ020"]

In [28]:
#To display all columns in Jupyter Notebooks
#pd.set_option('display.max_columns', 500)

## Functions: Recategorize values, Count Values, Drop Rows

In [29]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Recode the values of one column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    name : str
        Column whose values are recoded.
    replace_dict : dict
        Mapping of old value -> new value, forwarded to ``Series.replace``.

    Returns
    -------
    None
        ``df`` is modified; nothing is returned.
    """
    # Assign the recoded column back instead of calling
    # ``df[name].replace(..., inplace=True)``: that form mutates an intermediate
    # Series and leaves ``df`` untouched when pandas Copy-on-Write is active.
    df[name] = df[name].replace(to_replace=replace_dict)
    
#Count values function
def count_vals(df, name):
    """Print how many distinct SEQN ids fall under each value of ``name``.

    The per-value counts are followed by the number of NaN entries in the
    ``name`` column. Nothing is returned.
    """
    unique_ids_per_value = df.groupby(name)['SEQN'].nunique()
    missing_total = df[name].isnull().sum()
    print(unique_ids_per_value, "\n\n", "NaN: ", missing_total)
    
#Drop rows that include certain values
def drop_rows(df, name, val_list):
    """Remove, in place, every row of ``df`` whose ``name`` value is in ``val_list``.

    Rows are selected with ``isin`` and dropped by their index labels.
    Nothing is returned.
    """
    unwanted = df[name].isin(val_list)
    labels_to_drop = df.loc[unwanted].index
    df.drop(index=labels_to_drop, inplace=True)

## Make a copy of the dataframes

In [30]:
# Working copies for every cycle (1999-2016), restricted to the columns in var_names.
# df0 = 1999-2000, df1 = 2001-2002, df2 = 2003-2004, df3 = 2005-2006, df4 = 2007-2008,
# df5 = 2009-2010, df6 = 2011-2012, df7 = 2013-2014, df8 = 2015-2016
(df0, df1, df2, df3, df4, df5, df6, df7, df8) = (
    dfs[cycle_pos][var_names].copy() for cycle_pos in range(9)
)

## Recategorize values

In [31]:
#No recategorizing needed

## Rename columns 1999 - 2004

In [32]:
#No renaming needed

## Append years 1999 - 2016

In [33]:
# Survey cycle labels, in chronological order.
years = [
    "1999-2000",
    "2001-2002",
    "2003-2004",
    "2005-2006",
    "2007-2008",
    "2009-2010",
    "2011-2012",
    "2013-2014",
    "2015-2016",
]

In [34]:
# Per-cycle frames in chronological order; paired by position with `years`.
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8]

In [35]:
# Tag every row of each cycle's frame with that cycle's label.
for cycle_pos, cycle_frame in enumerate(frames):
    cycle_frame["Year"] = years[cycle_pos]

In [36]:
# Stack all cycles into one frame; `keys` adds an outer index level holding the
# cycle label, above each frame's original row index.
result_1999_2016 = pd.concat(frames, keys = years)

In [37]:
# Independent copy for cleaning, so result_1999_2016 keeps every row for the
# before/after comparison.
result_1999_2016_cleaned = result_1999_2016.copy()

In [38]:
# Preview the combined, uncleaned frame
result_1999_2016

Unnamed: 0,Unnamed: 1,SEQN,BPQ020,Year
1999-2000,0,2.0,2.0,1999-2000
1999-2000,1,5.0,1.0,1999-2000
1999-2000,2,6.0,2.0,1999-2000
1999-2000,3,7.0,1.0,1999-2000
1999-2000,4,10.0,2.0,1999-2000
1999-2000,5,12.0,1.0,1999-2000
1999-2000,6,13.0,1.0,1999-2000
1999-2000,7,14.0,2.0,1999-2000
1999-2000,8,15.0,2.0,1999-2000
1999-2000,9,16.0,2.0,1999-2000


## Count values for each column

In [39]:
# Alias of var_names (same list object, not a copy): index 0 is SEQN, index 1 is BPQ020
col_names = var_names

In [40]:
# Number of columns being tracked
len(col_names)

2

In [41]:
#BPQ020 - Ever told you had high blood pressure
# Distinct SEQN count per BPQ020 value, followed by the number of NaN entries
count_vals(result_1999_2016, col_names[1])

BPQ020
1.0    17297
2.0    39771
9.0       94
Name: SEQN, dtype: int64 

 NaN:  269


In [42]:
# Total number of rows before any cleaning
len(result_1999_2016)

57431

## Remove missing values:

In [43]:
# Drop rows whose BPQ020 is NaN or coded 7 or 9.
# NOTE(review): 7 and 9 are presumably the NHANES "refused" / "don't know" codes;
# confirm against the BPQ codebook.
drop_rows(result_1999_2016_cleaned, col_names[1], [np.nan, 7, 9])

## See if missing values have been correctly removed:

In [44]:
# Same tally on the cleaned frame: the dropped codes and NaN should no longer appear
count_vals(result_1999_2016_cleaned, col_names[1])

BPQ020
1.0    17297
2.0    39771
Name: SEQN, dtype: int64 

 NaN:  0


In [45]:
# Row count of the uncleaned frame
before = len(result_1999_2016)
before

57431

In [46]:
# Row count after the missing/invalid BPQ020 rows were dropped
after = len(result_1999_2016_cleaned)
after

57068

In [47]:
# Number of dropped rows relative to the rows that remain.
# NOTE(review): the denominator is `after`; if the intended figure is the share
# of the original rows that was removed, it should be `before`. Confirm.
(before-after)/after

0.006360832690824981

## MongoDB Insertion

In [48]:
# MongoClient is the pymongo entry point for talking to a MongoDB server
from pymongo import MongoClient

# Client for the MongoDB server at localhost:27017 (the server must already be
# running; this call does not start it)
client = MongoClient('localhost', 27017)

In [49]:
# Handle to the NHANES database on that server
db = client.NHANES
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [50]:
# Handle to the 'bpq' collection; MongoDB only creates the collection once the
# first document is inserted
bpq = db.bpq

In [51]:
# If the 'bpq' collection already exists, drop it so the data is loaded into an
# empty collection. (The original also called db.list_collection_names() inside
# this branch; its result was discarded and never displayed, so it was removed.)
if 'bpq' in db.list_collection_names():
    bpq.drop()

## Prepare the records and insert them into the collection

In [52]:
# First three rows of the cleaned frame
result_1999_2016_cleaned.head(3)

Unnamed: 0,Unnamed: 1,SEQN,BPQ020,Year
1999-2000,0,2.0,2.0,1999-2000
1999-2000,1,5.0,1.0,1999-2000
1999-2000,2,6.0,2.0,1999-2000


In [53]:
# Use SEQN as the MongoDB primary key by renaming the column to _id
result_1999_2016_cleaned = result_1999_2016_cleaned.rename(columns={'SEQN': '_id'})

In [54]:
# Convert the frame to a list with one dict per row (despite the name, bpq_dict
# is a list of records)
bpq_dict = result_1999_2016_cleaned.to_dict(orient='records')

In [55]:
# Inspect the first record
bpq_dict[0]

{'BPQ020': 2.0, 'Year': '1999-2000', '_id': 2.0}

In [56]:
# Insert every record as a document of the bpq collection
bpq.insert_many(bpq_dict)

<pymongo.results.InsertManyResult at 0x121fcb848>

In [57]:
# List the collections now present in the NHANES database
db.list_collection_names()

['smqfam',
 'smq',
 'bpx',
 'mcq',
 'hiq',
 'demo',
 'bpq',
 'drxtot',
 'huq',
 'tchol',
 'paq',
 'whq',
 'alq',
 'rdq',
 'diq',
 'bmx']