In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from collections import defaultdict

In [None]:
#To display all columns in Jupyter Notebooks
pd.set_option('display.max_columns', 500)

In [None]:
#Import MongoClient
from pymongo import MongoClient

#Create a MongoClient to run the MongoDB instance
client = MongoClient('localhost', 27017)

In [None]:
#Connect to existing database
db = client.NHANES

In [None]:
db

In [None]:
col = db.list_collection_names()
col.sort()
col

In [None]:
#Collection handles, looked up by name on the NHANES database
demo_p = db['demo_p']
alq = db['alq']
diq = db['diq']
drxtot = db['drxtot']
bpq = db['bpq']
bpx = db['bpx']
tchol = db['tchol']
bmx = db['bmx']
paq = db['paq']
smq = db['smq']
smqfam = db['smqfam']

mcq_a = db['mcq_a'] #Asthma
mcq_h = db['mcq_h'] #Heart Disease

hiq = db['hiq']
huq = db['huq']
whq = db['whq']

#Collection of descriptions, joined to the summary table on '_id'
descr = db['descr']

In [None]:
#Create dataframes from database
def _collection_to_df(collection):
    """Load every document returned by ``collection.find()`` into a DataFrame."""
    return pd.DataFrame(list(collection.find()))

df_demo_p = _collection_to_df(demo_p)
df_alq = _collection_to_df(alq)
df_diq = _collection_to_df(diq)
df_drxtot = _collection_to_df(drxtot)
df_bpq = _collection_to_df(bpq)
df_bpx = _collection_to_df(bpx)
df_tchol = _collection_to_df(tchol)
df_bmx = _collection_to_df(bmx)
df_paq = _collection_to_df(paq)
df_smq = _collection_to_df(smq)
df_smqfam = _collection_to_df(smqfam)

df_mcq_a = _collection_to_df(mcq_a)
df_mcq_h = _collection_to_df(mcq_h)
df_hiq = _collection_to_df(hiq)
df_huq = _collection_to_df(huq)
df_whq = _collection_to_df(whq)

df_descr = _collection_to_df(descr)

In [None]:
#All records
dfs = [df_demo_p, df_alq, df_diq, df_drxtot, df_bpq, df_bpx, df_tchol, df_bmx, df_paq, 
       df_smq, df_smqfam, df_mcq_a, df_mcq_h, df_hiq, df_huq, df_whq]

In [None]:
names = ['demo_p', 'alq', 'diq', 'drxtot', 'bpq', 'bpx', 'tchol', 'bmx', 'paq',
        'smq', 'smqfam', 'mcq_a', 'mcq_h', 'hiq', 'huq', 'whq']

In [None]:
#Map each collection name to its DataFrame. zip pairs the two lists by position,
#so `names` and `dfs` must list the collections in the same order.
data_dict = dict(zip(names,dfs))

## Functions:

In [None]:
#Function for inner join
def innerjoin_df(dfs_list, join_on):
    """Inner-join a sequence of DataFrames from left to right.

    Parameters
    ----------
    dfs_list : list of DataFrame
        Frames to join; the first one is the starting left-hand side.
    join_on : str or list of str
        Column name(s) passed as ``on`` to every merge.

    Returns
    -------
    DataFrame
        The accumulated inner join. With a single frame, that frame itself
        is returned unchanged.
    """
    joined = dfs_list[0]
    for position in range(1, len(dfs_list)):
        joined = pd.merge(joined, dfs_list[position], how='inner', on=join_on)
    return joined

In [None]:
#Function for getting info from list of collections
#Look at records and features for each
def get_info(dfs, names):
    """Summarise the size of each DataFrame in a small table.

    Parameters
    ----------
    dfs : list of DataFrame
        Frames to summarise.
    names : list of str
        Name for each frame, paired with ``dfs`` by position.

    Returns
    -------
    DataFrame
        One row per frame with columns ``'_id'`` (the name), ``'Records'``
        (row count) and ``'Features'`` (column count). Empty input gives an
        empty frame with these three columns.

    Raises
    ------
    ValueError
        If ``dfs`` and ``names`` differ in length, since positional pairing
        would otherwise attach shapes to the wrong names.
    """
    if len(dfs) != len(names):
        raise ValueError(
            'dfs and names must have the same length '
            '(got {} and {})'.format(len(dfs), len(names))
        )
    #DataFrame.shape is (rows, columns); unpack it next to the name
    rows = [(name, *frame.shape) for name, frame in zip(names, dfs)]
    return pd.DataFrame(rows, columns=['_id', 'Records', 'Features'])

In [None]:
#Size summary per collection, joined to its description and ranked by record count
info = get_info(dfs, names)
info_join = (
    innerjoin_df([info, df_descr], ['_id'])
    .sort_values(by='Records', ascending=False)
)
info_join.head(10)

### Select data to use

In [None]:
#Get relevant data
def get_reldata(df):
    """Return the DataFrames registered in ``data_dict`` for the given keys.

    ``df`` is any iterable of keys (for example a column of collection names).
    The result is a list in the same order as the keys. A key missing from
    ``data_dict`` raises ``KeyError``.
    """
    return [data_dict[key] for key in df]

In [None]:
#Selected risk factors for hospital utilization
#Uses the first 8 collection names of info_join, which is sorted by 'Records' descending.
#NOTE: this rebinds `dfs`, which previously held the DataFrames of all collections,
#so cells above that use `dfs` together with `names` must not be re-run after this one.
dfs = get_reldata(info_join['_id'][:8])

In [None]:
df_j = innerjoin_df(dfs, ['_id', 'Year'])
df_j.shape

In [None]:
df_j.head()

## Reorder columns

In [None]:
#Get a list of columns
cols = list(df_j)

In [None]:
#Move '_id' column to head of list: remove it, then re-insert at position 0
cols.remove('_id')
cols.insert(0, '_id')

#Move 'Year' column to back of list: remove it, then re-insert at the last position
cols.remove('Year')
cols.insert(len(df_j.columns) - 1, 'Year')

In [None]:
#Reorder dataframe
df_j = df_j.loc[:, cols]
df_j.head()

## Remap years to number categories

In [None]:
di = {"1999-2000": 0, "2001-2002": 1, "2003-2004": 2, "2005-2006": 3, "2007-2008": 4, 
      "2009-2010": 5, "2011-2012": 6, "2013-2014": 7, "2015-2016": 8}

In [None]:
#Map categorical years to numerical
#NOTE: Series.map yields NaN for every value that is not a key of `di`, so
#re-running this cell once 'Year' is already numeric turns the whole column into NaN.
#The NaN check in the cell below will reveal unmapped values.
df_j['Year'] = df_j['Year'].map(di)

In [None]:
df_j.head()

In [None]:
#Check if any NaN
df_j.isnull().values.any()

## Recategorize HUQ050 - Hospital Utilization to Binary

Hospital utilization is too difficult to predict with 5 categories.  
Remapped number of times received healthcare over past year:
1. 0 - One and greater
2. 1 - None

In [None]:
#Recategorize function
def recategorize(df, name, replace_dict):
    """Replace values of one column of ``df`` according to ``replace_dict``.

    Parameters
    ----------
    df : DataFrame
        Frame to modify; the column is overwritten on this object.
    name : str
        Column whose values are recoded.
    replace_dict : dict
        Mapping ``{old_value: new_value}``. Values that are not keys are
        left unchanged.

    Returns
    -------
    None
        The change is applied to ``df`` itself.
    """
    #Assign the result back instead of calling replace(inplace=True) on
    #df[name]: the chained in-place form acts on a temporary Series, so it
    #warns on recent pandas and leaves df unchanged under Copy-on-Write.
    df[name] = df[name].replace(to_replace=replace_dict)

In [None]:
recategorize(df_j, 'HUQ050', {2:1, 3:1, 4:1, 5:1})

#### Switch the values 0 and 1: 0 = one and greater, 1 = none

In [None]:
recategorize(df_j, 'HUQ050', {0:1, 1:0})

## Categorize features that need to be One Hot Encoded

In [None]:
df_j.info()

In [None]:
#Change columns to category
#Columns to remove: 
#DRX18YR - 18 Year weight
#MEC18YR - 18 year Weight
#Year
#_id

cat_cols = ['DMDBORN4',
            'DMDCITZN',
            'RIAGENDR',
            'RIDRETH1',
            'ALQ101',
            'DIQ010',
            'BPQ020',
            'BPXPULS',
            'PAQ635',
            'PAQ650',
            'PAQ665',
            'SMAQUEX',
            'SMQ680',
            'SMD410',
            'MCQ010',
            'MCQ160C',
            'HID010',
            'HUQ020',
            'HUQ030',
            'HUQ070',
            'WHQ030',
            'WHQ040']

def recat_cols(df, col_names):
    """Cast the columns listed in ``cat_cols`` to pandas ``category`` dtype.

    Only names that appear in both ``col_names`` and the module-level
    ``cat_cols`` list are converted. ``df`` is modified in place and the
    same object is returned.
    """
    for column in (c for c in col_names if c in cat_cols):
        df[column] = df[column].astype('category')
    return df

#recat_cols converts the columns on the frame it receives and returns that same
#frame, so after this call df_ohe and df_j are two names for one object
col_names = df_j.columns
df_ohe = recat_cols(df_j, col_names)

In [None]:
df_j.info()

In [None]:
df_ohe.info()

## One Hot Encoding Categories

In [None]:
#Function to One Hot Encode Categories
def ohe(df_j, label=None):
    """One-hot encode every ``category`` column of ``df_j``.

    Parameters
    ----------
    df_j : DataFrame
        Input frame; it is not modified.
    label : str, optional
        Target column. It is never one-hot encoded (even if it has category
        dtype) and is cast to ``uint8`` in the result.

    Returns
    -------
    DataFrame
        The non-categorical columns in their original order, followed by the
        dummy columns (first level of each category dropped). Dummy columns,
        ``label`` and ``'Year'`` (when present) are ``uint8``.

    Raises
    ------
    KeyError
        If ``label`` is given but is not a column of ``df_j``.
    """
    #Select datatypes that are categories
    cat_frame = df_j.select_dtypes(include=['category'])
    if label is not None:
        #Keep the label out of the encoding; errors='ignore' because the
        #label is often not a category column and so is absent from cat_frame
        cat_frame = cat_frame.drop(columns=[label], errors='ignore')
    encode_cols = list(cat_frame.columns)

    #Drop original non-OHE columns (drop returns a new frame, df_j is untouched)
    df = df_j.drop(columns=encode_cols)
    if encode_cols:
        #Pin the dummy dtype so the result does not depend on the pandas
        #version's default and matches the uint8 casts below
        dummies = pd.get_dummies(cat_frame, drop_first=True, dtype=np.uint8)
        df = pd.concat([df, dummies], axis=1)

    if label is not None:
        df[label] = df[label].astype(np.uint8)
    #'Year' is cast only when the frame actually has that column
    if 'Year' in df.columns:
        df['Year'] = df['Year'].astype(np.uint8)
    return df

In [None]:
df_ohe = ohe(df_ohe)
df_no_ohe = df_j.copy()

In [None]:
df_ohe[:1].shape

In [None]:
df_ohe.shape

In [None]:
df_ohe.head()

In [None]:
df_ohe[df_ohe['RIDAGEYR']==2]

# Bin Important Risk Factors Numeric Data into Category Bins

### Age

In [None]:
df_ohe['RIDAGEYR'].describe()

In [None]:
#Bin age into four labelled groups. pd.cut intervals are right-closed, e.g. (34, 48].
#Dedicated variable names are used so the global `names` list of collection
#names is not overwritten.
#The numeric check makes the cell safe to re-run: once the column is
#categorical, .max() no longer yields a numeric bin edge.
if pd.api.types.is_numeric_dtype(df_ohe['RIDAGEYR']):
    age_bins = [0, 34, 48, 63, df_ohe['RIDAGEYR'].max()]
    age_labels = ['20-34', '35-48', '49-63', '64-85']
    df_ohe['RIDAGEYR'] = pd.cut(df_ohe['RIDAGEYR'], age_bins, labels=age_labels)

### Height

In [None]:
df_ohe['BMXHT'].describe()

In [None]:
#Bin height into five labelled groups. pd.cut intervals are right-closed, e.g. (160, 167].
#Dedicated variable names are used so the global `names` list of collection
#names is not overwritten. The numeric check makes the cell safe to re-run.
#NOTE(review): edges are presumably centimetres; if so, 172 cm is about 5'7.75",
#not 5'9 as the third and fourth labels state - confirm the intended edge or label.
if pd.api.types.is_numeric_dtype(df_ohe['BMXHT']):
    height_bins = [0, 160, 167, 172, 190, df_ohe['BMXHT'].max()]
    height_labels = ['<5\'3', '5\'3 to 5\'5.75', '5\'5.75 to 5\'9', '5\'9 to 6\'3', '6\'3 and up']
    df_ohe['BMXHT'] = pd.cut(df_ohe['BMXHT'], height_bins, labels=height_labels)

# To CSV for R

In [None]:
#Write the prepared frame for the R analysis. to_csv does not create missing
#folders, so make sure the target directory exists first.
import os

csv_path = './Prevalence/Q1/Q1.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df_ohe.to_csv(csv_path)