In [1]:
#Dependencies

import pandas as pd
import numpy as np

In [2]:
#Reading csv files

# Five of the six files load with pandas' default settings, so they are read in
# one pass and unpacked in the same order as the paths listed below.
demographicDf, dietDf, examinationDf, labsDf, questionnaireDf = (
    pd.read_csv(csvPath)
    for csvPath in (
        "resources/demographic.csv",
        "resources/diet.csv",
        "resources/examination.csv",
        "resources/labs.csv",
        "resources/questionnaire.csv",
    )
)

# The medications file is the only one read with an explicit encoding.
medicationsDf = pd.read_csv("resources/medications.csv", encoding="ISO-8859-1")

In [3]:
#Keeps only the medicationsDf rows whose RXDRSD1 value occurs at least 100 times

# Minimum number of occurrences an RXDRSD1 value needs in order to be kept.
minOccurrences = 100

# dropna=False makes value_counts tally missing values as well.
medValues = medicationsDf["RXDRSD1"].value_counts(dropna=False)

isFrequent = medValues >= minOccurrences
frequentValues = medValues[isFrequent].index

keepRow = medicationsDf['RXDRSD1'].isin(frequentValues)
medicationsDf = medicationsDf[keepRow]

In [4]:
#Replaces null values in RXDRSD1 with a "Healthy" string

# DataFrame.fillna with a {column: value} mapping returns a new frame in which
# only the listed column is filled. This replaces the previous column assignment
# (medicationsDf["RXDRSD1"] = ...), which pandas reports with a
# SettingWithCopyWarning when medicationsDf is itself a filtered selection of
# another frame.
medicationsDf = medicationsDf.fillna({"RXDRSD1": "Healthy"})

In [5]:
#Renames the columns in medicationsDf

# Raw column code -> descriptive column name.
# NOTE(review): "Teken" in the first name looks like a misspelling of "Taken".
# It is kept unchanged here because altering the label would break any code
# that selects the column by its current name.
medicationColumnNames = {
    "RXDUSE": "Medications_Teken_Last_30_Days",
    "RXDDRUG": "Generic_Drug_Name",
    "RXDDRGID": "Generic_Drug_Code",
    "RXQSEEN": "Was_Prescription_Container_Seen_By_Interviewer",
    "RXDDAYS": "Duration_Taken_Medications",
    "RXDRSC1": "ICD_10_CM_Code_1",
    "RXDRSC2": "ICD_10_CM_Code_2",
    "RXDRSC3": "ICD_10_CM_Code_3",
    "RXDRSD1": "ICD_10_CM_Code_1_Description",
    "RXDRSD2": "ICD_10_CM_Code_2_Description",
    "RXDRSD3": "ICD_10_CM_Code_3_Description",
    "RXDCOUNT": "The_Number_Of_Prescription_Medicines_Reported",
}

medicationsDf = medicationsDf.rename(columns=medicationColumnNames)

In [6]:
#Outer-joins labsDf onto medicationsDf by SEQN

# join(on='SEQN') matches the caller's SEQN column against the other frame's
# index, so labsDf is re-indexed by SEQN first.
labsBySeqn = labsDf.set_index('SEQN')
joinedDf = medicationsDf.join(labsBySeqn, on='SEQN', how='outer')

In [7]:
#Keeps a single row per SEQN value: the last one in the frame's current row order

# True for every row that is followed by another row with the same SEQN.
isEarlierRepeat = joinedDf.duplicated(subset='SEQN', keep='last')
joinedDf = joinedDf[~isEarlierRepeat]

In [8]:
#Drops the rows of joinedDf that have no ICD_10_CM_Code_1_Description
#
# The previous form called Series.dropna() and assigned the result back into the
# column. Column assignment re-aligns on the row index, so the removed entries
# came straight back as NaN and no row was ever dropped. Dropping at the
# DataFrame level with `subset` actually removes those rows.
joinedDf = joinedDf.dropna(subset=["ICD_10_CM_Code_1_Description"])

In [9]:
#Builds newLab: the SEQN column followed by one 0/1 presence flag per remaining labsDf column

#Keeps the SEQN column in its own dataframe so its values are not turned into flags
labSEQN = labsDf[["SEQN"]]

#Flags every other column: 1 where a value is present, 0 where it is null
labFlags = labsDf.drop(columns=["SEQN"]).notnull().astype('int')

#Names of the flag columns (everything except SEQN)
columnList = list(labFlags.columns.values)

#Joins the flags onto SEQN by row index, which leaves SEQN as the left-most column
newLab = labSEQN.join(labFlags, how='outer')

#Drops any rows that contain null values after the join
newLab = newLab.dropna()

In [15]:
#Tallies, for each column named in columnList, how often each value occurs in newLab
#
#Each tally is explicitly renamed to its source column so that a DataFrame built
#from `counts` gets one row label per column. From pandas 2.0 on, value_counts()
#names its result "count" instead of reusing the column name, which would
#otherwise give every row the same label; on older versions the rename is a no-op.
counts = [newLab[col].value_counts().rename(col) for col in columnList]

In [22]:
#Stacks the per-column tallies into one table: one row per tally, one column per counted value
countsdf = pd.DataFrame(counts)

In [19]:
#Orders the tallies by the column labelled 1, largest first
orderedDf = countsdf.sort_values(by=1, ascending=False)

In [32]:
#Displays the first 30 rows of the sorted tallies
orderedDf.head(30)

Unnamed: 0,0,1
PHDSESN,391,9422
PHQ030,631,9182
PHQ040,631,9182
PHQ050,631,9182
PHQ060,631,9182
PHAFSTHR.x,631,9182
PHAFSTMN.x,631,9182
PHQ020,631,9182
LBXRDW,1269,8544
LBXRBCSI,1269,8544


In [47]:
#Selects SEQN plus a fixed list of lab flag columns from newLab
topLabColumns = [
    "SEQN",
    "LBXRDW", "LBXRBCSI", "LBXHGB", "LBXHCT", "LBXMCVSI", "LBXMCHSI",
    "LBXPLTSI", "LBXMPSI", "LBXWBCSI",
    "LBDBANO", "LBDEONO", "LBDMONO", "LBDLYMNO",
    "LBXBAPCT", "LBXEOPCT", "LBXNEPCT", "LBXMOPCT", "LBXLYPCT",
    "LBDNENO",
    "LBXHA", "LBXHBS",
]

#.copy() makes toplabsDf an independent frame. Without it, assigning to
#toplabsDf columns later acts on a selection of newLab, which pandas reports
#with a SettingWithCopyWarning.
toplabsDf = newLab[topLabColumns].copy()

In [48]:
#Displays the first rows of the selected lab flag columns
toplabsDf.head()

Unnamed: 0,SEQN,LBXRDW,LBXRBCSI,LBXHGB,LBXHCT,LBXMCVSI,LBXMCHSI,LBXPLTSI,LBXMPSI,LBXWBCSI,...,LBDMONO,LBDLYMNO,LBXBAPCT,LBXEOPCT,LBXNEPCT,LBXMOPCT,LBXLYPCT,LBDNENO,LBXHA,LBXHBS
0,73557,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,73558,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,73559,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,73560,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,73561,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [50]:
#Renames the lab columns in toplabsDf from their codes to descriptive labels

# Lab column code -> descriptive label (with units where the label gives them).
topLabColumnNames = {
    "LBXRDW": "Red cell distribution width (%)",
    "LBXRBCSI": "Red blood cell count (million cells/uL)",
    "LBXHGB": "Hemoglobin (g/dL)",
    "LBXHCT": "Hematocrit (%)",
    "LBXMCVSI": "Mean cell volume (fL)",
    "LBXMCHSI": "Mean cell hemoglobin (pg)",
    "LBXPLTSI": "Platelet count (1000 cells/uL)",
    "LBXMPSI": "Mean platelet volume (fL)",
    "LBXWBCSI": "White blood cell count (1000 cells/uL)",
    "LBDBANO": "Basophils number (1000 cells/uL)",
    "LBDEONO": "Eosinophils number (1000 cells/uL)",
    "LBDMONO": "Monocyte number (1000 cells/uL)",
    "LBDLYMNO": "Lymphocyte number (1000 cells/uL)",
    "LBXBAPCT": "Basophils percent (%)",
    "LBXEOPCT": "Eosinophils percent (%)",
    "LBXNEPCT": "Segmented neutrophils percent (%)",
    "LBXMOPCT": "Monocyte percent (%)",
    "LBXLYPCT": "Lymphocyte percent (%)",
    "LBDNENO": "Segmented neutrophils num (1000 cell/uL)",
    "LBXHA": "Hepatitis A antibody",
    "LBXHBS": "Hepatitis B Surface Antibody",
}

toplabsDf = toplabsDf.rename(columns=topLabColumnNames)

In [52]:
#toplabsDf

In [56]:
#Collects the toplabsDf column names, then removes SEQN from the list
columnList = toplabsDf.columns.tolist()
#pop() returns the removed label, so this cell echoes 'SEQN' as its output
columnList.pop(columnList.index('SEQN'))

'SEQN'

In [57]:
#Recodes the value 0 to 2 in every column named in columnList
#
#A {column: value} mapping tells DataFrame.replace which value to look for in
#which column, so the whole recode is a single call that returns a new frame;
#columns not named in columnList are left as they are.
#regex=True was removed: the value being replaced is the integer 0, not a text
#pattern, so this is a plain value replacement.
toplabsDf = toplabsDf.replace(to_replace={col: 0 for col in columnList}, value=2)

In [59]:
#Displays the first 20 rows of toplabsDf after the recode
toplabsDf.head(20)

Unnamed: 0,SEQN,Red cell distribution width (%),Red blood cell count (million cells/uL),Hemoglobin (g/dL),Hematocrit (%),Mean cell volume (fL),Mean cell hemoglobin (pg),Platelet count (1000 cells/uL),Mean platelet volume (fL),White blood cell count (1000 cells/uL),...,Monocyte number (1000 cells/uL),Lymphocyte number (1000 cells/uL),Basophils percent (%),Eosinophils percent (%),Segmented neutrophils percent (%),Monocyte percent (%),Lymphocyte percent (%),Segmented neutrophils num (1000 cell/uL),Hepatitis A antibody,Hepatitis B Surface Antibody
0,73557,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,73558,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,73559,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,73560,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,73561,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,73562,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
6,73563,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
7,73564,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
8,73566,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9,73567,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [1]:
#Renames the columns in demographicDf

# Column code -> descriptive label.
#
# DMDEDUC3 and DMDEDUC2 previously mapped to the exact same label, which left
# the renamed frame with two identically named columns (selecting that label
# would return both). Each now carries a suffix so the labels are unique.
# NOTE(review): the suffixes follow the age groups these two variables cover in
# the NHANES demographics codebook (children/youth 6-19 vs. adults 20+) -
# confirm against the codebook for the survey cycle in use.
demographicColumnNames = {
    "SDDSRVYR": "Data release cycle",
    "RIDSTATR": "Interview and examination status of the participant.",
    "RIAGENDR": "Gender of the participant.",
    "RIDAGEYR": "Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.",
    "RIDAGEMN": "Age in months of the participant at the time of screening. Reported for persons aged 24 months or younger at the time of exam (or screening if not examined).",
    "RIDRETH1": "Recode of reported race and Hispanic origin information",
    "RIDRETH3": "Recode of reported race and Hispanic origin information, with Non-Hispanic Asian Category",
    "RIDEXMON": "Six month time period when the examination was performed - two categories: November 1 through April 30, May 1 through October 31.",
    "RIDEXAGM": "Age in months of the participant at the time of examination. Reported for persons aged 19 years or younger at the time of examination.",
    "DMQMILIZ": "{Have you/Has SP} ever served on active duty in the U.S. Armed Forces, military Reserves, or National Guard? (Active duty does not include training for the Reserves or National Guard, but does include activation, for service in the U.S. or in a foreign country, in support of military or humanitarian operations.)",
    "DMQADFC": "Did {you/SP} ever serve in a foreign country during a time of armed conflict or on a humanitarian or peace-keeping mission? (This would include National Guard or reserve or active duty monitoring or conducting peace keeping operations in Bosnia and Kosovo, in the Sinai between Egypt and Israel, or in response to the 2004 tsunami or Haiti in 2010.)",
    "DMDBORN4": "In what country {were you/was SP} born?",
    "DMDCITZN": "{Are you/Is SP} a citizen of the United States? [Information about citizenship is being collected by the U.S. Public Health Service to perform health related research. Providing this information is voluntary and is collected under the authority of the Public Health Service Act. There will be no effect on pending immigration or citizenship petitions.]",
    "DMDYRSUS": "Length of time the participant has been in the US.",
    "DMDEDUC3": "What is the highest grade or level of school {you have/SP has} completed or the highest degree {you have/s/he has} received? (children/youth 6-19)",
    "DMDEDUC2": "What is the highest grade or level of school {you have/SP has} completed or the highest degree {you have/s/he has} received? (adults 20+)",
    "DMDMARTL": "Marital status",
    "RIDEXPRG": "Pregnancy status for females between 20 and 44 years of age at the time of MEC exam.",
    "SIALANG": "Language of the Sample Person Interview Instrument",
    "SIAPROXY": "Was a Proxy respondent used in conducting the Sample Person (SP) interview?",
    "SIAINTRP": "Was an interpreter used to conduct the Sample Person (SP) interview?",
    "FIALANG": "Language of the Family Interview Instrument",
    "FIAPROXY": "Was a Proxy respondent used in conducting the Family Interview?",
    "FIAINTRP": "Was an interpreter used to conduct the Family interview?",
    "MIALANG": "Language of the MEC CAPI Interview Instrument",
    "MIAPROXY": "Was a Proxy respondent used in conducting the MEC CAPI Interview?",
    "MIAINTRP": "Was an interpreter used to conduct the MEC CAPI interview?",
    "AIALANGA": "Language of the MEC ACASI Interview Instrument",
    "DMDHHSIZ": "Total number of people in the Household",
    "DMDFMSIZ": "Total number of people in the Family",
    "DMDHHSZA": "Number of children aged 5 years or younger in the household",
    "DMDHHSZB": "Number of children aged 6-17 years old in the household",
    "DMDHHSZE": "Number of adults aged 60 years or older in the household",
    "DMDHRGND": "HH reference person's gender",
    "DMDHRAGE": "HH reference person's age in years",
    "DMDHRBR4": "HH reference person's country of birth",
    "DMDHREDU": "HH reference person's education level",
    "DMDHRMAR": "HH reference person's marital status",
    "DMDHSEDU": "HH reference person's spouse's education level",
    "WTINT2YR": "Full sample 2 year interview weight.",
    "WTMEC2YR": "Full sample 2 year MEC exam weight.",
    "SDMVPSU": "Masked variance unit pseudo-PSU variable for variance estimation",
    "SDMVSTRA": "Masked variance unit pseudo-stratum variable for variance estimation",
    "INDHHIN2": "Total household income (reported as a range value in dollars)",
    "INDFMIN2": "Total family income (reported as a range value in dollars)",
    "INDFMPIR": "A ratio of family income to poverty guidelines.",
}

# Guards against two codes being given the same label again.
assert len(set(demographicColumnNames.values())) == len(demographicColumnNames), \
    "demographicColumnNames maps more than one column code to the same label"

demographicDf = demographicDf.rename(columns=demographicColumnNames)

NameError: name 'demographicDf' is not defined