In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# thyroid demographics with omitted columns
thyroid_demographics = pd.read_csv('data/thyroid_demographics.csv')
thyroid_demographics.tail()

Unnamed: 0,Discharge ID,Medical Record Number,Hospital Number,Admit Age In Years,Gender Title,Ethnicity Title,Race - American Indian,Race - Asian,Race - Black,Race - Other,...,Infection Flag,Medical Complication Flag,Surgical Complication Flag,Post Operative LOS,RACHS Score,Principal Dx (ICD),Principal Dx Code - Title (ICD),Risk Of Mortality Title,Total Ped Expected Mortalities,Total Days In ICU
19764,1000568000000.0,7058656.0,2029.0,0.0,Female,Hispanic or Latino,N,N,N,N,...,N,N,N,8.0,2.0,7454,7454 - Ventricular sept defect,Minor,0.0011,6.0
19765,1000568000000.0,5366128.0,2029.0,3.0,Male,Unknown,N,N,N,N,...,Y,N,N,0.0,-1.0,4644,4644 - Croup,Moderate,0.0011,1.0
19766,1000568000000.0,5099111.0,2029.0,11.0,Male,Unknown,N,N,N,N,...,Y,N,N,0.0,-1.0,29689,29689 - Bipolar disorder NEC,Moderate,0.0,0.0
19767,1000577000000.0,72439859.0,2037.0,0.0,Male,Unknown,N,N,N,N,...,N,N,N,17.0,-1.0,V3001,V3001 - Single LB-hospital by CD,Moderate,0.0396,
19768,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# read pharmacy csv file
drugs = pd.read_csv('data/thyroid_hormone_data_2004_2018.csv')

# show only the patients that received thyroid agent
thyroid_agent  = drugs.loc[drugs['Therapeutic Category Title'] == 'Thyroid agent']

In [12]:
thyroid_agent.dtypes

Discharge ID                   int64
Therapeutic Category Title    object
dtype: object

In [13]:
# drop last row, it contains NaN
# thyroid_demographics = thyroid_demographics.drop([0,19768])

In [10]:
thyroid_demographics.tail()

Unnamed: 0,Discharge ID,Medical Record Number,Hospital Number,Admit Age In Years,Gender Title,Ethnicity Title,Race - American Indian,Race - Asian,Race - Black,Race - Other,...,Infection Flag,Medical Complication Flag,Surgical Complication Flag,Post Operative LOS,RACHS Score,Principal Dx (ICD),Principal Dx Code - Title (ICD),Risk Of Mortality Title,Total Ped Expected Mortalities,Total Days In ICU
19763,1000568000000.0,5394523,2029.0,16.0,Female,Unknown,N,N,N,N,...,N,N,N,2,-1.0,42732,42732 - Atrial flutter,Minor,0.0,4.0
19764,1000568000000.0,7058656,2029.0,0.0,Female,Hispanic or Latino,N,N,N,N,...,N,N,N,8,2.0,7454,7454 - Ventricular sept defect,Minor,0.0011,6.0
19765,1000568000000.0,5366128,2029.0,3.0,Male,Unknown,N,N,N,N,...,Y,N,N,0,-1.0,4644,4644 - Croup,Moderate,0.0011,1.0
19766,1000568000000.0,5099111,2029.0,11.0,Male,Unknown,N,N,N,N,...,Y,N,N,0,-1.0,29689,29689 - Bipolar disorder NEC,Moderate,0.0,0.0
19767,1000577000000.0,72439859,2037.0,0.0,Male,Unknown,N,N,N,N,...,N,N,N,17,-1.0,V3001,V3001 - Single LB-hospital by CD,Moderate,0.0396,


In [14]:
thyroid_demographics.dtypes

Discharge ID                       float64
Medical Record Number               object
Hospital Number                    float64
Admit Age In Years                 float64
Gender Title                        object
Ethnicity Title                     object
Race - American Indian              object
Race - Asian                        object
Race - Black                        object
Race - Other                        object
Race - Pacific Islander             object
Race - Version 1                   float64
Race Title - Version 1              object
Length Of Stay                      object
Total Ped Expected LOS             float64
Discharge Mortality Flag            object
Mechanical Vent Flag                object
ECMO Flag                           object
Infection Flag                      object
Medical Complication Flag           object
Surgical Complication Flag          object
Post Operative LOS                  object
RACHS Score                        float64
Principal D

In [15]:
# convert discharge id in thyroid demographics csv from float to int
thyroid_demographics['Discharge ID'] = thyroid_demographics['Discharge ID'].astype(int)
thyroid_demographics.dtypes

Discharge ID                         int64
Medical Record Number               object
Hospital Number                    float64
Admit Age In Years                 float64
Gender Title                        object
Ethnicity Title                     object
Race - American Indian              object
Race - Asian                        object
Race - Black                        object
Race - Other                        object
Race - Pacific Islander             object
Race - Version 1                   float64
Race Title - Version 1              object
Length Of Stay                      object
Total Ped Expected LOS             float64
Discharge Mortality Flag            object
Mechanical Vent Flag                object
ECMO Flag                           object
Infection Flag                      object
Medical Complication Flag           object
Surgical Complication Flag          object
Post Operative LOS                  object
RACHS Score                        float64
Principal D

In [16]:
drugs.head()

Unnamed: 0,Discharge ID,Therapeutic Category Title
0,2605809,Aminoglycoside/penicillin
1,2605809,Analgesic and antipyretic
2,2605809,Antiarrhythmic/adrenergic agent
3,2605809,Antidiabetic agent
4,2605809,Electrolyte and replenishment agent


In [17]:
# merge thyroid demographics and thyroid hormones csv
thyroid_drugs = thyroid_demographics.merge(thyroid_agent, on='Discharge ID', how='left')

In [18]:
thyroid_drugs

Unnamed: 0,Discharge ID,Medical Record Number,Hospital Number,Admit Age In Years,Gender Title,Ethnicity Title,Race - American Indian,Race - Asian,Race - Black,Race - Other,...,Medical Complication Flag,Surgical Complication Flag,Post Operative LOS,RACHS Score,Principal Dx (ICD),Principal Dx Code - Title (ICD),Risk Of Mortality Title,Total Ped Expected Mortalities,Total Days In ICU,Therapeutic Category Title
0,2605821,72624346,2037.0,0.0,Male,Unknown,N,N,N,N,...,N,Y,65,6.0,V3001,V3001 - Single LB-hospital by CD,Extreme,0.0794,61.0,Thyroid agent
1,2606060,72763853,2037.0,1.0,Female,Unknown,N,N,N,N,...,N,N,9,-1.0,49391,49391 - Asthma w status asth,Moderate,0.0007,4.0,Thyroid agent
2,2606387,72048700,2037.0,7.0,Male,Unknown,N,N,Y,N,...,N,N,,-1.0,49391,49391 - Asthma w status asth,Moderate,0.0007,0.0,Thyroid agent
3,2606550,73189908,2037.0,0.0,Male,Unknown,N,N,N,N,...,N,N,25,-1.0,769,769 - Resp distress syndrome,Extreme,0.3899,0.0,Thyroid agent
4,2607156,72763853,2037.0,1.0,Female,Unknown,N,N,N,N,...,N,N,2,-1.0,4280,4280 - CHF NOS,Moderate,0.0099,0.0,Thyroid agent
5,2930060,1359678,1001.0,0.0,Male,Not Hispanic or Latino,N,N,N,Y,...,N,N,80,-1.0,7536,7536 - Cong urethral stenosis,Extreme,0.1897,0.0,Thyroid agent
6,2930060,1359678,1001.0,0.0,Male,Not Hispanic or Latino,N,N,N,Y,...,N,N,80,-1.0,7536,7536 - Cong urethral stenosis,Extreme,0.1897,0.0,Thyroid agent
7,2930266,1501562,1001.0,0.0,Female,Not Hispanic or Latino,N,N,N,N,...,N,N,24,4.0,7450,7450 - Common truncus,Major,0.0231,0.0,Thyroid agent
8,2930266,1501562,1001.0,0.0,Female,Not Hispanic or Latino,N,N,N,N,...,N,N,24,4.0,7450,7450 - Common truncus,Major,0.0231,0.0,Thyroid agent
9,2930266,1501562,1001.0,0.0,Female,Not Hispanic or Latino,N,N,N,N,...,N,N,24,4.0,7450,7450 - Common truncus,Major,0.0231,0.0,Thyroid agent


In [19]:
thyroid_demographics['Discharge ID'].nunique()

19767

In [20]:
thyroid_drugs['Discharge ID'].nunique()

19767

In [None]:
thyroid_original = pd.read_csv('data/thyroid.csv')
thyroid_original['Ethnicity Title']