In [1]:
#Dependencies

import pandas as pd
import numpy as np

In [2]:
#Reading csv files
#Each source table is loaded from the relative "resources" folder into its own dataframe.
#All paths are relative, so the notebook must be run from the directory that contains "resources".

demographicDf = pd.read_csv("resources/demographic.csv")
dietDf = pd.read_csv("resources/diet.csv")
examinationDf = pd.read_csv("resources/examination.csv")
labsDf = pd.read_csv("resources/labs.csv")
#medications.csv is the only file read with an explicit encoding
#NOTE(review): presumably it contains bytes that are not valid UTF-8 - confirm
medicationsDf = pd.read_csv("resources/medications.csv", encoding="ISO-8859-1")
questionnaireDf = pd.read_csv("resources/questionnaire.csv")

In [3]:
#Drops conditions in medicationsDf that appear fewer than 100 times

#Counts how often each RXDRSD1 value occurs; dropna=False gives null its own count as well
medValues = medicationsDf["RXDRSD1"].value_counts(dropna=False)

#Keeps only the rows whose RXDRSD1 value is one of those seen at least 100 times
medicationsDf = medicationsDf[
    medicationsDf["RXDRSD1"].isin(medValues.loc[medValues.ge(100)].index)
]

In [4]:
#Replaces null values with a "Healthy" string
#medicationsDf is a boolean-filtered selection at this point, so assigning into one of its
#columns can raise pandas' SettingWithCopyWarning. assign() builds a new dataframe instead,
#keeping RXDRSD1 in its original column position, and gives the same result when re-run.

medicationsDf = medicationsDf.assign(RXDRSD1=medicationsDf["RXDRSD1"].fillna("Healthy"))

In [5]:
#Renames the columns in medicationsDf
#Maps each original column code to a descriptive name; columns not listed keep their names.
#NOTE(review): "Teken" in the first name looks like a typo for "Taken". It is kept unchanged
#because other cells may already refer to the column by this exact name.

medicationsDf = medicationsDf.rename(
    columns={
        "RXDUSE": "Medications_Teken_Last_30_Days",
        "RXDDRUG": "Generic_Drug_Name",
        "RXDDRGID": "Generic_Drug_Code",
        "RXQSEEN": "Was_Prescription_Container_Seen_By_Interviewer",
        "RXDDAYS": "Duration_Taken_Medications",
        "RXDRSC1": "ICD_10_CM_Code_1",
        "RXDRSC2": "ICD_10_CM_Code_2",
        "RXDRSC3": "ICD_10_CM_Code_3",
        "RXDRSD1": "ICD_10_CM_Code_1_Description",
        "RXDRSD2": "ICD_10_CM_Code_2_Description",
        "RXDRSD3": "ICD_10_CM_Code_3_Description",
        "RXDCOUNT": "The_Number_Of_Prescription_Medicines_Reported",
    }
)

In [6]:
#Combines medicationsDf with labsDf
#labsDf is re-indexed by SEQN so medicationsDf's SEQN column can be matched against it.
#The outer join keeps rows from both tables, including those with no match on the other side.

joinedDf = medicationsDf.join(
    other=labsDf.set_index(keys='SEQN'),
    on='SEQN',
    how='outer',
)

In [7]:
#Drops rows with duplicate SEQN values
#duplicated(keep='last') marks every row except the last one for each SEQN,
#so negating the mask keeps exactly one row (the last) per SEQN.

joinedDf = joinedDf[~joinedDf.duplicated(subset='SEQN', keep='last')]

In [8]:
#Drops the rows of joinedDf that have no condition description
#Assigning Series.dropna() back to the column changes nothing: pandas realigns the shortened
#series to the dataframe's index and fills the removed positions with NaN again.
#Removing the nulls therefore means removing the rows themselves.
joinedDf = joinedDf.dropna(subset=["ICD_10_CM_Code_1_Description"])

In [9]:
#Builds newLab: SEQN followed by one flag column per lab test,
#where 1 means the test has a value and 2 means it is null

#Isolates the SEQN column into its own dataframe
labSEQN = labsDf[["SEQN"]]

#Every column except SEQN, in their original order
columnList = [name for name in labsDf.columns if name != "SEQN"]

#Sets non-null values to 1 and null values to 0, then changes 0 values to 2 for future division
#This is done for all test columns in one pass; the values are integers, so no regex is needed
testFlags = labsDf[columnList].notnull().astype('int').replace(0, 2)

#Puts the SEQN column at the far left of the dataframe
#Both pieces come from labsDf and share its index, so the rows stay aligned
newLab = labSEQN.join(testFlags, how='outer')

#Drops rows containing null values
#The flag columns only hold 1 or 2, so only a null SEQN can cause a row to be dropped
newLab = newLab.dropna()

#Views how many people have taken or haven't taken each test
for col in columnList:
    print(newLab[col].value_counts())

In [66]:
#Isolates the three columns we will be working with from the medicationsDf
#and renames the third column to "Condition" for readability
newMed = (
    medicationsDf[["SEQN", "Generic_Drug_Name", "ICD_10_CM_Code_1_Description"]]
    .rename(columns={"ICD_10_CM_Code_1_Description": "Condition"})
)

#Replaces null prescriptions with a string value of "NONE"
newMed["Generic_Drug_Name"] = newMed["Generic_Drug_Name"].fillna("NONE")

#Corrects false data: these placeholder codes are replaced with "NONE"
#regex=True means each code is applied as a pattern, so it is replaced wherever it occurs in a value
placeholderDrugCodes = ["99999", "55555", "77777"]
for placeholderCode in placeholderDrugCodes:
    newMed["Generic_Drug_Name"] = newMed["Generic_Drug_Name"].replace(placeholderCode, "NONE", regex=True)

#Cleans up condition names (original description -> shortened name)
#Kept as a table so a new clean-up is a one-line addition instead of another copied statement
conditionRenames = {
    "Hypothyroidism, unspecified": "Hypothyroidism",
    "Major depressive disorder, single episode, unspecified": "Major depressive disorder",
    "Anxiety disorder, unspecified": "Anxiety disorder",
    "Sleep disorder, unspecified": "Sleep disorder",
    "Allergic rhinitis, unspecified": "Allergic rhinitis",
    "Dorsalgia, unspecified": "Dorsalgia",
    "Chronic obstructive pulmonary disease, unspecified": "Chronic obstructive pulmonary disease",
    "Neuralgia and neuritis, unspecified": "Neuralgia and neuritis",
    "Edema, unspecified": "Edema",
    "Heart failure, unspecified": "Heart failure",
    "Cardiac arrhythmia, unspecified": "Cardiac arrhythmia",
    "Allergy, unspecified": "Allergy",
}

#Applies the renames one at a time, in the order listed, as regular-expression replacements
for longName, shortName in conditionRenames.items():
    newMed["Condition"] = newMed["Condition"].replace(longName, shortName, regex=True)