In [1]:
#Dependencies

import pandas as pd
import numpy as np

In [2]:
#Loads every source csv from the resources folder into its own dataframe

demographicDf = pd.read_csv(
    "resources/demographic.csv",
)
dietDf = pd.read_csv(
    "resources/diet.csv",
)
examinationDf = pd.read_csv(
    "resources/examination.csv",
)
labsDf = pd.read_csv(
    "resources/labs.csv",
)
#The medications file is the only one read with an explicit ISO-8859-1 encoding
medicationsDf = pd.read_csv(
    "resources/medications.csv",
    encoding="ISO-8859-1",
)
questionnaireDf = pd.read_csv(
    "resources/questionnaire.csv",
)

In [3]:
#Drops conditions in medicationsDf that appear fewer than MIN_CONDITION_COUNT times.
#Missing values are counted as their own category (dropna=False), so rows with a
#null RXDRSD1 are kept as long as there are enough of them.

MIN_CONDITION_COUNT = 100

medValues = medicationsDf["RXDRSD1"].value_counts(dropna=False)
frequentConditions = medValues[medValues >= MIN_CONDITION_COUNT].index

#.copy() makes the filtered frame independent of the original, so later column
#assignments on medicationsDf do not raise SettingWithCopyWarning
medicationsDf = medicationsDf[medicationsDf["RXDRSD1"].isin(frequentConditions)].copy()

In [4]:
#Replaces null values in RXDRSD1 with a "Healthy" string.
#assign() builds a new dataframe instead of writing into medicationsDf, which may be
#a filtered slice of the original data (avoids SettingWithCopyWarning).

medicationsDf = medicationsDf.assign(RXDRSD1=medicationsDf["RXDRSD1"].fillna("Healthy"))

In [5]:
#Gives the coded columns in medicationsDf descriptive names

#NOTE(review): "Medications_Teken_Last_30_Days" looks like a typo for "Taken";
#the name is kept unchanged here because other cells may refer to it.
medicationColumnNames = {
    "RXDUSE": "Medications_Teken_Last_30_Days",
    "RXDDRUG": "Generic_Drug_Name",
    "RXDDRGID": "Generic_Drug_Code",
    "RXQSEEN": "Was_Prescription_Container_Seen_By_Interviewer",
    "RXDDAYS": "Duration_Taken_Medications",
    "RXDRSC1": "ICD_10_CM_Code_1",
    "RXDRSC2": "ICD_10_CM_Code_2",
    "RXDRSC3": "ICD_10_CM_Code_3",
    "RXDRSD1": "ICD_10_CM_Code_1_Description",
    "RXDRSD2": "ICD_10_CM_Code_2_Description",
    "RXDRSD3": "ICD_10_CM_Code_3_Description",
    "RXDCOUNT": "The_Number_Of_Prescription_Medicines_Reported",
}

medicationsDf = medicationsDf.rename(columns=medicationColumnNames)

In [6]:
#Outer-joins the lab results onto medicationsDf, matching rows on SEQN

labsBySeqn = labsDf.set_index('SEQN')
joinedDf = medicationsDf.join(labsBySeqn, on='SEQN', how='outer')

In [7]:
#Keeps a single row per SEQN value: when a SEQN repeats, only its last row survives

joinedDf = joinedDf.drop_duplicates(
    subset=['SEQN'],
    keep='last',
)

In [8]:
#Drops the rows of joinedDf that have no ICD_10_CM_Code_1_Description.
#Assigning Series.dropna() back to the column is a no-op: the assignment realigns
#on the index and the removed entries come back as NaN, so the rows themselves
#have to be dropped instead.
#NOTE(review): assumes the intent was to remove those rows - confirm.
joinedDf = joinedDf.dropna(subset=["ICD_10_CM_Code_1_Description"])

In [9]:
#Keeps the SEQN column on its own so its real values survive the 0/1 conversion
labSEQN = labsDf[["SEQN"]]

#Every lab column except SEQN, in the original column order
columnList = [name for name in labsDf.columns.values if name != 'SEQN']

#1 where a lab value is present, 0 where it is null
labPresence = labsDf[columnList].notnull().astype('int')

#Puts SEQN on the far left, followed by the 0/1 columns, aligned on the row index
newLab = pd.concat([labSEQN, labPresence], axis=1)

#Drops rows containing null values (only SEQN can still be null at this point)
newLab = newLab.dropna()

In [13]:
#Prints the value counts of every column named in columnList
for col in columnList:
    colCounts = newLab[col].value_counts()
    print(colCounts)

1    8052
0    1761
Name: URXUMA, dtype: int64
1    8052
0    1761
Name: URXUMS, dtype: int64
1    8052
0    1761
Name: URXUCR.x, dtype: int64
1    8052
0    1761
Name: URXCRS, dtype: int64
1    8052
0    1761
Name: URDACT, dtype: int64
0    6484
1    3329
Name: WTSAF2YR.x, dtype: int64
0    6668
1    3145
Name: LBXAPB, dtype: int64
0    6668
1    3145
Name: LBDAPBSI, dtype: int64
1    6553
0    3260
Name: LBXSAL, dtype: int64
1    6553
0    3260
Name: LBDSALSI, dtype: int64
1    6552
0    3261
Name: LBXSAPSI, dtype: int64
1    6551
0    3262
Name: LBXSASSI, dtype: int64
1    6551
0    3262
Name: LBXSATSI, dtype: int64
1    6553
0    3260
Name: LBXSBU, dtype: int64
1    6553
0    3260
Name: LBDSBUSI, dtype: int64
1    6553
0    3260
Name: LBXSC3SI, dtype: int64
1    6511
0    3302
Name: LBXSCA, dtype: int64
1    6511
0    3302
Name: LBDSCASI, dtype: int64
1    6551
0    3262
Name: LBXSCH, dtype: int64
1    6551
0    3262
Name: LBDSCHSI, dtype: int64
1    6542
0    3271
Name: LBXSCK, dt

Name: ORXH84, dtype: int64
1    5057
0    4756
Name: ORXHPC, dtype: int64
1    5057
0    4756
Name: ORXHPI, dtype: int64
1    5057
0    4756
Name: ORXHPV, dtype: int64
0    7945
1    1868
Name: LBDRPCR.x, dtype: int64
0    8056
1    1757
Name: LBDRHP.x, dtype: int64
0    8056
1    1757
Name: LBDRLP.x, dtype: int64
0    7945
1    1868
Name: LBDR06.x, dtype: int64
0    7945
1    1868
Name: LBDR11.x, dtype: int64
0    7945
1    1868
Name: LBDR16.x, dtype: int64
0    7945
1    1868
Name: LBDR18.x, dtype: int64
0    7945
1    1868
Name: LBDR26.x, dtype: int64
0    7945
1    1868
Name: LBDR31.x, dtype: int64
0    7945
1    1868
Name: LBDR33.x, dtype: int64
0    7945
1    1868
Name: LBDR35.x, dtype: int64
0    7945
1    1868
Name: LBDR39.x, dtype: int64
0    7945
1    1868
Name: LBDR40.x, dtype: int64
0    7945
1    1868
Name: LBDR42.x, dtype: int64
0    7945
1    1868
Name: LBDR45.x, dtype: int64
0    7945
1    1868
Name: LBDR51.x, dtype: int64
0    7945
1    1868
Name: LBDR52.x, dtype: int6

1    5215
0    4598
Name: LBDBPBSI, dtype: int64
1    5215
0    4598
Name: LBDBPBLC, dtype: int64
1    5215
0    4598
Name: LBXBCD, dtype: int64
1    5215
0    4598
Name: LBDBCDSI, dtype: int64
1    5215
0    4598
Name: LBDBCDLC, dtype: int64
1    5215
0    4598
Name: LBXTHG, dtype: int64
1    5215
0    4598
Name: LBDTHGSI, dtype: int64
1    5215
0    4598
Name: LBDTHGLC, dtype: int64
1    5215
0    4598
Name: LBXBSE, dtype: int64
1    5215
0    4598
Name: LBDBSESI, dtype: int64
1    5215
0    4598
Name: LBDBSELC, dtype: int64
1    5215
0    4598
Name: LBXBMN, dtype: int64
1    5215
0    4598
Name: LBDBMNSI, dtype: int64
1    5215
0    4598
Name: LBDBMNLC, dtype: int64
0    5756
1    4057
Name: URXUTRI, dtype: int64
0    7159
1    2654
Name: URXUAS3, dtype: int64
0    7159
1    2654
Name: URDUA3LC, dtype: int64
0    7159
1    2654
Name: URXUAS5, dtype: int64
0    7159
1    2654
Name: URDUA5LC, dtype: int64
0    7159
1    2654
Name: URXUAB, dtype: int64
0    7159
1    2654
Name: URDUABL