In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [2]:
#Root folder of the downloaded NHANES CSV files; the globs below expect one
#sub-folder per survey cycle underneath it.
#Use './' when running from run_notebooks.py
#Use '../' when running directly from Jupyter Notebooks
#file_beg = '../NHANES-Downloader/data/csv_data/'

file_beg = './NHANES-Downloader/data/csv_data/'

In [3]:
#Collect the CSV paths of every survey cycle (one glob per cycle folder)
_cycle_dirs = ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008',
               '2009-2010', '2011-2012', '2013-2014', '2015-2016']
(files1, files2, files3, files4, files5,
 files6, files7, files8, files9) = [
    glob.glob(file_beg + cycle_dir + '/*/*.csv') for cycle_dir in _cycle_dirs
]

In [3]:
#Add files into a list
#One entry (a list of CSV paths) per survey cycle, ordered 1999-2000 through 2015-2016
file_list = [files1, files2, files3, files4, files5, 
          files6, files7, files8, files9]

In [4]:
#For every cycle: sort its paths in place, then store a {position: path} dictionary
file_list_dict = []
for cycle_files in file_list:
    cycle_files.sort()
    file_list_dict.append(
        {position: path for position, path in enumerate(cycle_files)}
    )

In [5]:
#Position, within each cycle's sorted file list, of the file to load
#(one entry per cycle, 1999-2000 first); presumably the HIQ questionnaire file.
#NOTE(review): these are hard-coded positions, so they likely go stale if the set
#of downloaded files changes - verify before re-running on a fresh download.
hiq_indx = [88, 102, 115, 105, 104, 112, 104, 125, 75]

In [6]:
#Load the selected file of every cycle into a dict keyed by cycle position (0-8).
#A plain dict is used so that asking for a cycle that was never loaded raises
#KeyError instead of silently returning 0 (the default value of defaultdict(int)).
dfs = {}
for i, file_pos in enumerate(hiq_indx):
    dfs[i] = pd.read_csv(file_list_dict[i][file_pos])

## Important Variables

In [7]:
#1999-2004
#SEQN (later used as the MongoDB _id) plus the column kept from these cycles
var_names = ["SEQN", "HID010"]

#2005-2016
#Same pair, but in these cycles the kept column is named HIQ011
var_names1 = ["SEQN", "HIQ011"]

In [8]:
#To display all columns in Jupyter Notebooks
#pd.set_option('display.max_columns', 500)

## Functions: Recategorize values, Count Values, Drop Rows

In [9]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Replace values in one column of ``df``; ``df`` itself is updated.

    df: DataFrame to modify.
    name: label of the column whose values are replaced.
    replace_dict: mapping of old value -> new value, passed to ``Series.replace``.

    Returns None.
    """
    #Assign the result back to the frame. Calling replace(inplace=True) on
    #df[name] is chained assignment: pandas warns about it, and with
    #Copy-on-Write enabled it leaves df unchanged.
    df[name] = df[name].replace(to_replace=replace_dict)
    
#Count values function
def count_vals(df, name):
    """Print, for each value of column ``name``, the number of distinct SEQN
    ids, followed by the number of rows where that column is null."""
    n_missing = df[name].isnull().sum()
    ids_per_value = df.groupby(name)['SEQN'].nunique()
    print(ids_per_value, "\n\n", "NaN: ", n_missing)
    
#Drop rows that include certain values
def drop_rows(df, name, val_list):
    """Remove from ``df``, in place, every row whose ``name`` value is in ``val_list``.

    Matching rows are removed by their index labels. Returns None.
    """
    has_unwanted_value = df[name].isin(val_list)
    labels_to_drop = df.loc[has_unwanted_value].index
    df.drop(index=labels_to_drop, inplace=True)

## Make a copy of the dataframes

In [10]:
#Keep only the needed columns of every cycle, as independent copies of the raw frames
#1999-2004: df0 = 1999-2000, df1 = 2001-2002, df2 = 2003-2004
df0, df1, df2 = [dfs[pos][var_names].copy() for pos in range(3)]
#2005-2016: df3 = 2005-2006, df4 = 2007-2008, df5 = 2009-2010,
#           df6 = 2011-2012, df7 = 2013-2014, df8 = 2015-2016
df3, df4, df5, df6, df7, df8 = [dfs[pos][var_names1].copy() for pos in range(3, 9)]

## Recategorize values

In [11]:
#Nothing requires recategorizing

## Rename columns of all cycles to the 1999 - 2004 names

In [12]:
#Common column labels given to every cycle's frame (the 1999-2004 names)
col_names = ['SEQN', 'HID010']

In [13]:
#Give all nine frames the same column labels so they can be stacked;
#for df3-df8 this relabels HIQ011 as HID010
for _frame in (df0, df1, df2, df3, df4, df5, df6, df7, df8):
    _frame.columns = col_names

## Append years 1999 - 2016

In [14]:
#Survey cycle labels, in the same order as the frames df0-df8
years = ["1999-2000","2001-2002","2003-2004","2005-2006","2007-2008", 
        "2009-2010","2011-2012","2013-2014","2015-2016"]

In [15]:
#Per-cycle frames, oldest cycle first (same order as `years`)
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8]

In [16]:
#Add years as a column
#Every frame gets the label of its own survey cycle
for i in range(len(frames)):
    df = frames[i]
    df["Year"] = years[i]

In [17]:
#Stack all cycles into one frame; keys=years adds the cycle label as an outer index level
result_1999_2016 = pd.concat(frames, keys = years)

In [18]:
#Work on a copy so the unfiltered frame stays available for the before/after comparison
result_1999_2016_cleaned = result_1999_2016.copy()

## Count values for each column

In [19]:
#Number of harmonized columns (SEQN plus the one value column to check)
len(col_names)

2

In [20]:
#HID010 - Covered by health insurance
#Distinct SEQN per code before cleaning; codes 7 and 9 and NaN are removed further down
count_vals(result_1999_2016, col_names[1])

HID010
1.0    75246
2.0    16153
7.0       37
9.0      157
Name: SEQN, dtype: int64 

 NaN:  469


In [21]:
#Total number of rows before any are removed
len(result_1999_2016)

92062

## Remove missing values:

In [22]:
#Remove, in place, rows whose HID010 is NaN, 7 or 9 (the values treated as missing here)
drop_rows(result_1999_2016_cleaned, col_names[1], [np.nan, 7, 9])

## See if missing values have been correctly removed:

In [23]:
#Only codes 1 and 2 should remain, with zero NaN
count_vals(result_1999_2016_cleaned, col_names[1])

HID010
1.0    75246
2.0    16153
Name: SEQN, dtype: int64 

 NaN:  0


In [24]:
#Row count of the unfiltered frame
before = len(result_1999_2016)
before

92062

In [25]:
#Row count after the missing-value rows were dropped
after = len(result_1999_2016_cleaned)
after

91399

In [26]:
#Dropped rows relative to the rows that were kept.
#NOTE(review): the denominator is `after`; if the share of the original rows that
#was removed is intended, divide by `before` instead - confirm.
(before-after)/after

0.007253908686090657

## MongoDB Insertion

In [27]:
#Import MongoClient
from pymongo import MongoClient

#Create a MongoClient pointing at the MongoDB instance on localhost, port 27017
client = MongoClient('localhost', 27017)

In [28]:
#Connect to the existing database named NHANES
db = client.NHANES

In [29]:
#Display the database handle to confirm host and database name
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'NHANES')

In [30]:
#Collections currently present in the database
db.list_collection_names()

['smqfam',
 'smq',
 'mcq',
 'hiq',
 'demo',
 'bpq',
 'huq',
 'paq',
 'whq',
 'alq',
 'diq']

In [31]:
#Get a handle to the 'hiq' collection of the database
#NOTE(review): this presumably does not create anything on the server by itself;
#the collection appears once documents are inserted - confirm against PyMongo docs.
hiq = db.hiq

In [32]:
#If the 'hiq' collection is already there, drop it so the insert starts from empty
if 'hiq' not in db.list_collection_names():
    print("Doesn't exist yet")
else:
    hiq.drop()
    #NOTE(review): the list returned by this call is not used
    db.list_collection_names()

## Create new collection to input into database

In [33]:
#Preview the first three cleaned rows
result_1999_2016_cleaned[:3]

Unnamed: 0,Unnamed: 1,SEQN,HID010,Year
1999-2000,0,1.0,1.0,1999-2000
1999-2000,1,2.0,1.0,1999-2000
1999-2000,2,3.0,1.0,1999-2000


In [34]:
#Set SEQN as _id (Primary Key)
#The column is renamed in place, so every exported record carries SEQN under the key '_id'
result_1999_2016_cleaned.rename(columns={'SEQN':'_id'}, inplace=True)

In [35]:
#Dataframe to a list of dictionaries, one per row (orient='records')
hiq_dict = result_1999_2016_cleaned.to_dict(orient='records')

In [36]:
#Inspect the first record that will be inserted
hiq_dict[0]

{'HID010': 1.0, 'Year': '1999-2000', '_id': 1.0}

In [37]:
#Insert all records into the 'hiq' collection in one call
hiq.insert_many(hiq_dict)

<pymongo.results.InsertManyResult at 0x11dfa1b48>

In [38]:
#View collections after the insert
db.list_collection_names()

['smqfam',
 'smq',
 'mcq',
 'hiq',
 'demo',
 'bpq',
 'huq',
 'paq',
 'whq',
 'alq',
 'diq']

In [39]:
#Number of documents now stored in the collection (expected: 91399).
#count_documents({}) replaces Collection.count(), which is deprecated
#and no longer available in PyMongo 4.
db.hiq.count_documents({})

  """Entry point for launching an IPython kernel.


91399

In [40]:
# for h in hiq.find():
#     print(h)