In [215]:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
import pickle

Dataset - https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
Column key - https://www.hindawi.com/journals/bmri/2014/781670/tab1/
Diagnosis key - https://www.hindawi.com/journals/bmri/2014/781670/tab2/

In [216]:
data2 = pd.read_csv('/Users/johnnosal/Downloads/dataset_diabetes/diabetic_data.csv')

In [217]:
data2.columns

Index([u'encounter_id', u'patient_nbr', u'race', u'gender', u'age', u'weight',
       u'admission_type_id', u'discharge_disposition_id',
       u'admission_source_id', u'time_in_hospital', u'payer_code',
       u'medical_specialty', u'num_lab_procedures', u'num_procedures',
       u'num_medications', u'number_outpatient', u'number_emergency',
       u'number_inpatient', u'diag_1', u'diag_2', u'diag_3',
       u'number_diagnoses', u'max_glu_serum', u'A1Cresult', u'metformin',
       u'repaglinide', u'nateglinide', u'chlorpropamide', u'glimepiride',
       u'acetohexamide', u'glipizide', u'glyburide', u'tolbutamide',
       u'pioglitazone', u'rosiglitazone', u'acarbose', u'miglitol',
       u'troglitazone', u'tolazamide', u'examide', u'citoglipton', u'insulin',
       u'glyburide-metformin', u'glipizide-metformin',
       u'glimepiride-pioglitazone', u'metformin-rosiglitazone',
       u'metformin-pioglitazone', u'change', u'diabetesMed', u'readmitted'],
      dtype='object')

In [219]:
data2 = data2.drop('examide', axis = 1) #Remove 'examide' from working dataset - only option is NO
data2 = data2.drop('citoglipton', axis = 1) #Remove 'citoglipton' from working dataset - only option is NO

In [220]:
data2['diabetesMed'] = data2['diabetesMed'].replace('No', 0) #Turn diabetesMed column to numbers
data2['diabetesMed'] = data2['diabetesMed'].replace('Yes', 1)

In [221]:
data2['change'] = data2['change'].replace('No', 0) #Turn change column to numbers
data2['change'] = data2['change'].replace('Ch', 1)

In [222]:
data2['gender'] = data2['gender'].replace('Male', 0) #Turn gender column to numbers: Male =0, Female=1
data2['gender'] = data2['gender'].replace('Female', 1)
data2['gender'] = data2['gender'].replace('Unknown/Invalid', 2)

In [223]:
data2 = data2[data2.gender!=2] #Remove three rows where gender is NA

In [224]:
data2 = data2[data2.discharge_disposition_id!=11]
data2 = data2[data2.discharge_disposition_id!=13]
data2 = data2[data2.discharge_disposition_id!=14]
data2 = data2[data2.discharge_disposition_id!=19]
data2 = data2[data2.discharge_disposition_id!=20]
data2 = data2[data2.discharge_disposition_id!=21]#Remove rows where patient expired or was sent to hospice

In [225]:
data2 = data2.drop('acetohexamide', axis = 1) 
#Remove 'citoglipton' from working dataset - only one option is not NO
data2 = data2.drop('tolbutamide', axis = 1) 
#Remove 'citoglipton' from working dataset - only 23 options are not NO
data2 = data2.drop('troglitazone', axis = 1) 
#Remove 'citoglipton' from working dataset - only 3 options are not NO
data2 = data2.drop('glipizide-metformin', axis = 1) 
#Remove 'citoglipton' from working dataset - only 13 option are not NO
data2 = data2.drop('glimepiride-pioglitazone', axis = 1) 
#Remove 'citoglipton' from working dataset - only option is NO
data2 = data2.drop('metformin-rosiglitazone', axis = 1) 
#Remove 'citoglipton' from working dataset - only 2 options are not NO
data2 = data2.drop('metformin-pioglitazone', axis = 1) 
#Remove 'citoglipton' from working dataset - only option is NO

In [226]:
data2 = data2.drop('chlorpropamide', axis = 1) 
#Remove 'chlorpropamide' from working dataset - only 86 options are not NO
data2 = data2.drop('acarbose', axis = 1) 
#Remove 'acarbose' from working dataset - only 308 options are not NO
data2 = data2.drop('miglitol', axis = 1) 
#Remove 'miglitol' from working dataset - only 38 options are not NO
data2 = data2.drop('tolazamide', axis = 1) 
#Remove 'tolazamide' from working dataset - only 39 options are not NO

In [227]:
data2 = data2.drop('weight', axis = 1) 
#Remove 'weight' as only 3% of rows have data and this element was removed from the original analysis of the data
data2 = data2.drop('payer_code', axis = 1) 
#Remove 'payer_code' as a majority of rows are missing data and this field is not relevant to readmission

In [232]:
data2['time_in_hospital'].mean()

4.379363801087175

In [152]:
data2 = data2.reset_index(drop=True) #Reset index

In [153]:
#Convert select columns to floats to normalize below
data2['time_in_hospital'] = data2['time_in_hospital'].astype(float)
data2['num_lab_procedures'] = data2['num_lab_procedures'].astype(float)
data2['num_procedures'] = data2['num_procedures'].astype(float)
data2['num_medications'] = data2['num_medications'].astype(float)
data2['number_outpatient'] = data2['number_outpatient'].astype(float)
data2['number_emergency'] = data2['number_emergency'].astype(float)
data2['number_inpatient'] = data2['number_inpatient'].astype(float)
data2['number_diagnoses'] = data2['number_diagnoses'].astype(float)


In [156]:
from sklearn.preprocessing import StandardScaler

In [157]:
#Get dummy variables for 'race', dropping 'race_?'
data2 = pd.get_dummies(data2, columns=['race']).drop('race_?', axis=1)

In [158]:
#Get dummy variables for 'age', dropping 'age_[0-10)'
data2 = pd.get_dummies(data2, columns=['age']).drop('age_[0-10)', axis=1)

In [162]:
#Inflate importance of number inpatient
data2['number_inpatient'] = data2['number_inpatient']**2
data2['num_lab_procedures'] = data2['num_lab_procedures']**2
data2['num_medications'] = data2['num_medications']**2
data2['number_diagnoses'] = data2['number_diagnoses']**2
data2['time_in_hospital'] = data2['time_in_hospital']**2
data2['num_procedures'] = data2['num_procedures']**2
data2['number_outpatient'] = data2['number_outpatient']**2
data2['number_emergency'] = data2['number_emergency']**2

In [164]:
data2['time_in_hospital'] = StandardScaler().fit_transform(data2['time_in_hospital'])



In [165]:
data2['num_lab_procedures'] = StandardScaler().fit_transform(data2['num_lab_procedures'])
data2['num_procedures'] = StandardScaler().fit_transform(data2['num_procedures'])
data2['num_medications'] = StandardScaler().fit_transform(data2['num_medications'])
data2['number_outpatient'] = StandardScaler().fit_transform(data2['number_outpatient'])
data2['number_emergency'] = StandardScaler().fit_transform(data2['number_emergency'])
data2['number_inpatient'] = StandardScaler().fit_transform(data2['number_inpatient'])
data2['number_diagnoses'] = StandardScaler().fit_transform(data2['number_diagnoses'])



In [166]:
pd.DataFrame(StandardScaler().fit_transform(data2['number_inpatient'])).describe()



Unnamed: 0,0
count,99340.0
mean,-1.466411e-14
std,1.000005
min,-0.2239301
25%,-0.2239301
50%,-0.2239301
75%,-0.1112213
max,49.48064


In [168]:
data2['readmitted'] = data2['readmitted'].replace('NO', 0) #Turn readmitted column to a boolean 
data2['readmitted'] = data2['readmitted'].replace('>30', 1)
data2['readmitted'] = data2['readmitted'].replace('<30', 1)

In [169]:
data3 = data2.sample(frac=1).reset_index(drop=True) #shuffle order of rows

In [170]:
data3 = data3.drop_duplicates('patient_nbr') #remove duplicate patient records

In [171]:
data3 = data3.reset_index(drop=True) #reset index

In [172]:
#Change letters in diagnosis columns to numbers so they can be converted to floats
data3['diag_1'] = data3['diag_1'].apply(lambda x: 9999 if x[0][0] == 'V' else x)
data3['diag_2'] = data3['diag_2'].apply(lambda x: 9999 if x[0][0] == 'V' else x)
data3['diag_3'] = data3['diag_3'].apply(lambda x: 9999 if x[0][0] == 'V' else x)

In [177]:
data3['diag_1'] = data3['diag_1'].astype(str)
data3['diag_3'] = data3['diag_3'].astype(str)
data3['diag_2'] = data3['diag_2'].astype(str)

In [174]:
data3['diag_1'] = data3['diag_1'].apply(lambda x: 9999 if x[0][0] == 'E' else x)

In [175]:
data3['diag_2'] = data3['diag_2'].apply(lambda x: 9999 if x[0][0] == 'E' else x)
data3['diag_3'] = data3['diag_3'].apply(lambda x: 9999 if x[0][0] == 'E' else x)

In [178]:
data3['diag_2'] = data3['diag_2'].apply(lambda x: 9999 if x[0][0] == '?' else x)
data3['diag_3'] = data3['diag_3'].apply(lambda x: 9999 if x[0][0] == '?' else x)

In [179]:
data3['diag_1'] = data3['diag_1'].replace('?', 9999)
data3['diag_2'] = data3['diag_2'].replace('?', 9999)
data3['diag_3'] = data3['diag_3'].replace('?', 9999)

In [180]:
#Convert diagnosis columns to floats
data3['diag_1'] = data3['diag_1'].astype(float)

In [181]:
data3['diag_2'] = data3['diag_2'].astype(float)
data3['diag_3'] = data3['diag_3'].astype(float)

In [182]:
#Decode diagnosis columns
data3.loc[(data3['diag_1'] >= 390) & (data3['diag_1'] <=459), 'diag_1'] = 'Circulatory'
data3.loc[(data3['diag_1'] >= 460) & (data3['diag_1'] <=519), 'diag_1'] = 'Respiratory'
data3.loc[(data3['diag_1'] >= 520) & (data3['diag_1'] <=579), 'diag_1'] = 'Digestive'
data3.loc[(data3['diag_1'] >= 800) & (data3['diag_1'] <=999), 'diag_1'] = 'Injury'
data3.loc[(data3['diag_1'] >= 710) & (data3['diag_1'] <=739), 'diag_1'] = 'Musculoskeletal'
data3.loc[(data3['diag_1'] >= 580) & (data3['diag_1'] <=629), 'diag_1'] = 'Genitourinary'
data3.loc[(data3['diag_1'] >= 140) & (data3['diag_1'] <=239), 'diag_1'] = 'Neoplasm'
data3.loc[(data3['diag_1'] >= 280) & (data3['diag_1'] <=359), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 630) & (data3['diag_1'] <=679), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 360) & (data3['diag_1'] <=389), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 740) & (data3['diag_1'] <=759), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 780) & (data3['diag_1'] <=784), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 1) & (data3['diag_1'] <=139), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 790) & (data3['diag_1'] <=799), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 680) & (data3['diag_1'] <=709), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 240) & (data3['diag_1'] <=249), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 251) & (data3['diag_1'] <=279), 'diag_1'] = 'Other'
data3.loc[(data3['diag_1'] >= 250) & (data3['diag_1'] <=250.99), 'diag_1'] = 'Diabetes'
data3['diag_1'] = data3['diag_1'].replace(785, 'Circulatory')
data3['diag_1'] = data3['diag_1'].replace(786, 'Respiratory')
data3['diag_1'] = data3['diag_1'].replace(787, 'Circulatory')
data3['diag_1'] = data3['diag_1'].replace(788, 'Genitourinary')
data3['diag_1'] = data3['diag_1'].replace(9999, 'Other')
data3['diag_1'] = data3['diag_1'].replace(789, 'Other')


In [183]:
data3.loc[(data3['diag_2'] >= 390) & (data3['diag_2'] <=459), 'diag_2'] = 'Circulatory'
data3.loc[(data3['diag_2'] >= 460) & (data3['diag_2'] <=519), 'diag_2'] = 'Respiratory'
data3.loc[(data3['diag_2'] >= 520) & (data3['diag_2'] <=579), 'diag_2'] = 'Digestive'
data3.loc[(data3['diag_2'] >= 800) & (data3['diag_2'] <=999), 'diag_2'] = 'Injury'
data3.loc[(data3['diag_2'] >= 710) & (data3['diag_2'] <=739), 'diag_2'] = 'Musculoskeletal'
data3.loc[(data3['diag_2'] >= 580) & (data3['diag_2'] <=629), 'diag_2'] = 'Genitourinary'
data3.loc[(data3['diag_2'] >= 140) & (data3['diag_2'] <=239), 'diag_2'] = 'Neoplasm'
data3.loc[(data3['diag_2'] >= 280) & (data3['diag_2'] <=359), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 630) & (data3['diag_2'] <=679), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 360) & (data3['diag_2'] <=389), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 740) & (data3['diag_2'] <=759), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 780) & (data3['diag_2'] <=784), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 1) & (data3['diag_2'] <=139), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 790) & (data3['diag_2'] <=799), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 680) & (data3['diag_2'] <=709), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 240) & (data3['diag_2'] <=249), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 251) & (data3['diag_2'] <=279), 'diag_2'] = 'Other'
data3.loc[(data3['diag_2'] >= 250) & (data3['diag_2'] <=250.99), 'diag_2'] = 'Diabetes'
data3['diag_2'] = data3['diag_2'].replace(785, 'Circulatory')
data3['diag_2'] = data3['diag_2'].replace(786, 'Respiratory')
data3['diag_2'] = data3['diag_2'].replace(787, 'Circulatory')
data3['diag_2'] = data3['diag_2'].replace(788, 'Genitourinary')
data3['diag_2'] = data3['diag_2'].replace(9999, 'Other')
data3['diag_2'] = data3['diag_2'].replace(789, 'Other')

In [184]:
data3.loc[(data3['diag_3'] >= 390) & (data3['diag_3'] <=459), 'diag_3'] = 'Circulatory'
data3.loc[(data3['diag_3'] >= 460) & (data3['diag_3'] <=519), 'diag_3'] = 'Respiratory'
data3.loc[(data3['diag_3'] >= 520) & (data3['diag_3'] <=579), 'diag_3'] = 'Digestive'
data3.loc[(data3['diag_3'] >= 800) & (data3['diag_3'] <=999), 'diag_3'] = 'Injury'
data3.loc[(data3['diag_3'] >= 710) & (data3['diag_3'] <=739), 'diag_3'] = 'Musculoskeletal'
data3.loc[(data3['diag_3'] >= 580) & (data3['diag_3'] <=629), 'diag_3'] = 'Genitourinary'
data3.loc[(data3['diag_3'] >= 140) & (data3['diag_3'] <=239), 'diag_3'] = 'Neoplasm'
data3.loc[(data3['diag_3'] >= 280) & (data3['diag_3'] <=359), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 630) & (data3['diag_3'] <=679), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 360) & (data3['diag_3'] <=389), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 740) & (data3['diag_3'] <=759), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 780) & (data3['diag_3'] <=784), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 1) & (data3['diag_3'] <=139), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 790) & (data3['diag_3'] <=799), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 680) & (data3['diag_3'] <=709), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 240) & (data3['diag_3'] <=249), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 251) & (data3['diag_3'] <=279), 'diag_3'] = 'Other'
data3.loc[(data3['diag_3'] >= 250) & (data3['diag_3'] <=250.99), 'diag_3'] = 'Diabetes'
data3['diag_3'] = data3['diag_3'].replace(785, 'Circulatory')
data3['diag_3'] = data3['diag_3'].replace(786, 'Respiratory')
data3['diag_3'] = data3['diag_3'].replace(787, 'Circulatory')
data3['diag_3'] = data3['diag_3'].replace(788, 'Genitourinary')
data3['diag_3'] = data3['diag_3'].replace(9999, 'Other')
data3['diag_3'] = data3['diag_3'].replace(789, 'Other')

In [185]:
print data3['diag_1'].value_counts()
print data3['diag_2'].value_counts()
print data3['diag_3'].value_counts()

Circulatory        21306
Other              12382
Respiratory         9505
Digestive           6356
Diabetes            5654
Injury              4803
Musculoskeletal     3984
Genitourinary       3497
Neoplasm            2500
Name: diag_1, dtype: int64
Circulatory        22180
Other              18498
Diabetes            9391
Respiratory         7001
Genitourinary       5440
Digestive           2694
Injury              1826
Neoplasm            1649
Musculoskeletal     1308
Name: diag_2, dtype: int64
Other              21225
Circulatory        21101
Diabetes           12429
Respiratory         4684
Genitourinary       4148
Digestive           2401
Injury              1415
Musculoskeletal     1378
Neoplasm            1206
Name: diag_3, dtype: int64


In [186]:
#Get dummy variables for all three diagnosis columns
data3 = pd.get_dummies(data3, columns=['diag_1']).drop('diag_1_Other', axis=1)
data3 = pd.get_dummies(data3, columns=['diag_2']).drop('diag_2_Other', axis=1)
data3 = pd.get_dummies(data3, columns=['diag_3']).drop('diag_3_Other', axis=1)

In [190]:
data3['admission_type_id'].value_counts()

Emergency        35799
Elective         13795
Urgent           12800
NotMapped         7567
Trauma Center       18
Newborn              8
Name: admission_type_id, dtype: int64

In [188]:
#Combine all admission type IDs that are 'not available'
data3['admission_type_id'] = data3['admission_type_id'].replace(5, 8)
data3['admission_type_id'] = data3['admission_type_id'].replace(6, 8)

In [189]:
#Decode admission type ids
data3['admission_type_id'] = data3['admission_type_id'].replace(1, 'Emergency')
data3['admission_type_id'] = data3['admission_type_id'].replace(2, 'Urgent')
data3['admission_type_id'] = data3['admission_type_id'].replace(3, 'Elective')
data3['admission_type_id'] = data3['admission_type_id'].replace(4, 'Newborn')
data3['admission_type_id'] = data3['admission_type_id'].replace(7, 'Trauma Center')
data3['admission_type_id'] = data3['admission_type_id'].replace(8, 'NotMapped')

In [191]:
#Get dummy variables for Admission Type ID drop 'Not Mapped'
data3 = pd.get_dummies(data3, columns=['admission_type_id']).drop('admission_type_id_NotMapped', axis=1)

In [194]:
data3['medical_specialty'].value_counts()

Unknown                   33647
InternalMedicine          10701
Family/GeneralPractice     5116
Emergency/Trauma           4448
Cardiology                 4183
Surgery                    3802
Orthopedics                2110
Other                      1188
Nephrology                  808
OB/GYN                      684
Psychiatry                  680
Pulmonology                 645
Urology                     516
Gastroenterology            376
Cancer                      354
Endocrinology               244
Neurology                   176
Otolaryngology              112
Podiatry                     60
InfectiousDiseases           42
Osteopath                    38
Ophthalmology                34
Rheumatology                 14
AllergyandImmunology          7
Neurophysiology               1
Proctology                    1
Name: medical_specialty, dtype: int64

In [193]:
#Clean and combine Medical Speciality fields
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-General', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Cardiovascular/Thoracic', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Vascular', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Neuro', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Thoracic', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Cardiovascular', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('SurgicalSpecialty', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Colon&Rectal', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Pediatric', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Maxillofacial', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgeon', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-Plastic', 'Surgery')
data3['medical_specialty'] = data3['medical_specialty'].replace('Surgery-PlasticwithinHeadandNeck', 'Surgery')

data3['medical_specialty'] = data3['medical_specialty'].replace('ObstetricsandGynecology', 'OB/GYN')
data3['medical_specialty'] = data3['medical_specialty'].replace('Obsterics&Gynecology-GynecologicOnco', 'OB/GYN')
data3['medical_specialty'] = data3['medical_specialty'].replace('Obstetrics', 'OB/GYN')
data3['medical_specialty'] = data3['medical_specialty'].replace('Gynecology', 'OB/GYN')
data3['medical_specialty'] = data3['medical_specialty'].replace('Perinatology', 'OB/GYN')

data3['medical_specialty'] = data3['medical_specialty'].replace('?', 'Unknown')
data3['medical_specialty'] = data3['medical_specialty'].replace('PhysicianNotFound', 'Unknown')

data3['medical_specialty'] = data3['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('OutreachServices', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Dentistry', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Speech', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Resident', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Hospitalist', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Anesthesiology', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('DCPTEAM', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Radiologist', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Radiology', 'Other')
data3['medical_specialty'] = data3['medical_specialty'].replace('Anesthesiology-Pediatric', 'Other')

data3['medical_specialty'] = data3['medical_specialty'].replace('Psychiatry', 'Psychiatry')
data3['medical_specialty'] = data3['medical_specialty'].replace('Psychology', 'Psychiatry')
data3['medical_specialty'] = data3['medical_specialty'].replace('Psychiatry-Child/Adolescent', 'Psychiatry')
data3['medical_specialty'] = data3['medical_specialty'].replace('Psychiatry-Addictive', 'Psychiatry')

data3['medical_specialty'] = data3['medical_specialty'].replace('Oncology', 'Cancer')
data3['medical_specialty'] = data3['medical_specialty'].replace('Hematology/Oncology', 'Cancer')
data3['medical_specialty'] = data3['medical_specialty'].replace('Hematology', 'Cancer')
data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-Hematology-Oncology', 'Cancer')

data3['medical_specialty'] = data3['medical_specialty'].replace('Orthopedics-Reconstructive', 'Orthopedics')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-EmergencyMedicine', 'Emergency/Trauma')
data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-CriticalCare', 'Emergency/Trauma')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-Endocrinology', 'Endocrinology')

data3['medical_specialty'] = data3['medical_specialty'].replace('Cardiology-Pediatric', 'Cardiology')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-AllergyandImmunology', 'AllergyandImmunology')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics', 'Family/GeneralPractice')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-Pulmonology', 'Pulmonology')

data3['medical_specialty'] = data3['medical_specialty'].replace('Endocrinology-Metabolism', 'Endocrinology')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-Neurology', 'Neurology')

data3['medical_specialty'] = data3['medical_specialty'].replace('Pathology', 'InfectiousDiseases')
data3['medical_specialty'] = data3['medical_specialty'].replace('Pediatrics-InfectiousDiseases', 'InfectiousDiseases')

In [195]:
#Get dummy variables for Medical Speciality drop 'Proctology'
data3 = pd.get_dummies(data3, columns=['medical_specialty']).drop('medical_specialty_Proctology', axis=1)

In [196]:
#Drop encourter ID from data set
data3 = data3.drop('encounter_id', axis=1)

In [199]:
data3['discharge_disposition_id'].value_counts()

Home                           52586
Other facility                 11951
Unknown                         3109
Hospital                        1909
Left against Medical Advice      416
outpatient services               16
Name: discharge_disposition_id, dtype: int64

In [198]:
#Decode and clean up Discharge ID column
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(16, 'outpatient services')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(17, 'outpatient services')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(12, 'outpatient services')

data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(25, 'Unknown')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(26, 'Unknown')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(18, 'Unknown')

data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(2, 'Hospital')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(9, 'Hospital')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(10, 'Hospital')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(23, 'Hospital')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(28, 'Hospital')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(29, 'Hospital')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(15, 'Hospital')

data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(1, 'Home')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(6, 'Home')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(8, 'Home')

data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(7, 'Left against Medical Advice')

data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(3, 'Other facility')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(4, 'Other facility')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(5, 'Other facility')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(22, 'Other facility')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(24, 'Other facility')
data3['discharge_disposition_id'] = data3['discharge_disposition_id'].replace(27, 'Other facility')


In [200]:
data3.readmitted.value_counts()

0    45921
1    24066
Name: readmitted, dtype: int64

In [201]:
#Get dummy variables for Discharge ID drop 'Outpatient Services'
data3 = pd.get_dummies(data3, columns=['discharge_disposition_id']).drop('discharge_disposition_id_outpatient services', axis=1)

In [205]:
data3['admission_source_id'].value_counts()

Emergency Room     37509
Doctor Referral    22493
Unknown             5028
Transfer            4942
Other                 10
Recent birth           5
Name: admission_source_id, dtype: int64

In [204]:
data3['admission_source_id'] = data3['admission_source_id'].replace(9, 'Unknown')
data3['admission_source_id'] = data3['admission_source_id'].replace(15, 'Unknown')
data3['admission_source_id'] = data3['admission_source_id'].replace(17, 'Unknown')
data3['admission_source_id'] = data3['admission_source_id'].replace(20, 'Unknown')
data3['admission_source_id'] = data3['admission_source_id'].replace(21, 'Unknown')

data3['admission_source_id'] = data3['admission_source_id'].replace(11, 'Recent birth')
data3['admission_source_id'] = data3['admission_source_id'].replace(12, 'Recent birth')
data3['admission_source_id'] = data3['admission_source_id'].replace(13, 'Recent birth')
data3['admission_source_id'] = data3['admission_source_id'].replace(14, 'Recent birth')

data3['admission_source_id'] = data3['admission_source_id'].replace(1, 'Doctor Referral')
data3['admission_source_id'] = data3['admission_source_id'].replace(2, 'Doctor Referral')
data3['admission_source_id'] = data3['admission_source_id'].replace(3, 'Doctor Referral')

data3['admission_source_id'] = data3['admission_source_id'].replace(7, 'Emergency Room')

data3['admission_source_id'] = data3['admission_source_id'].replace(4, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(5, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(6, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(10, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(18, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(19, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(22, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(25, 'Transfer')
data3['admission_source_id'] = data3['admission_source_id'].replace(26, 'Transfer')

data3['admission_source_id'] = data3['admission_source_id'].replace(8, 'Other')

In [206]:
data3['admission_source_id'].value_counts()

Emergency Room     37509
Doctor Referral    22493
Unknown             5028
Transfer            4942
Other                 10
Recent birth           5
Name: admission_source_id, dtype: int64

In [207]:
#Get dummy variables for Admission Source ID drop 'Other'
data3 = pd.get_dummies(data3, columns=['admission_source_id']).drop('admission_source_id_Other', axis=1)


In [210]:
#Get dummy variables for 'max_glu_serum'
data3 = pd.get_dummies(data3, columns=['max_glu_serum'])

In [211]:
#Get Dummy variables
data3 = pd.get_dummies(data3, columns=['A1Cresult'])
data3 = pd.get_dummies(data3, columns=['metformin'])
data3 = pd.get_dummies(data3, columns=['repaglinide'])
data3 = pd.get_dummies(data3, columns=['nateglinide'])
data3 = pd.get_dummies(data3, columns=['glimepiride'])

In [212]:
#Get Dummy variables
data3 = pd.get_dummies(data3, columns=['glipizide'])
data3 = pd.get_dummies(data3, columns=['glyburide'])
data3 = pd.get_dummies(data3, columns=['pioglitazone'])
data3 = pd.get_dummies(data3, columns=['rosiglitazone'])
data3 = pd.get_dummies(data3, columns=['insulin'])
data3 = pd.get_dummies(data3, columns=['glyburide-metformin'])

In [213]:
data3.columns

Index([u'patient_nbr', u'gender', u'time_in_hospital', u'num_lab_procedures',
       u'num_procedures', u'num_medications', u'number_outpatient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses',
       ...
       u'rosiglitazone_Steady', u'rosiglitazone_Up', u'insulin_Down',
       u'insulin_No', u'insulin_Steady', u'insulin_Up',
       u'glyburide-metformin_Down', u'glyburide-metformin_No',
       u'glyburide-metformin_Steady', u'glyburide-metformin_Up'],
      dtype='object', length=139)

In [214]:
#Save Data3
with open('data4.pkl', 'w') as picklefile:
    pickle.dump(data3, picklefile)

Column Status:

**patient_nbr - removed duplicates**

**gender - Male 0, Female 1, removed Other/Unknown**

**admission_type_id - Dummy variable created, removed Not Mapped**

**discharge_disposition_id - cleaned with Dummy variables created, remove Outpatient Services**

**admission_source_id - cleaned with Dummy variables created, remove Other**

**time_in_hospital - normalized**

**medical_specialty - cleaned with Dummy variables created, removed Proctology**

**num_lab_procedures - normalized**

**num_procedures - normalized**

**num_medications - normalized**

**number_outpatient - normalized**

**number_emergency - normailzed**

**number_inpatient - normalized**

**diag_1 - Dummy variable created, removed Other**

**diag_2 - Dummy variable created, removed Other**

**diag_3 - Dummy variable created, removed Other**

**number_diagnoses - normalized**

**max_glu_serum  - Dummy variable created**

**A1Cresult  - Dummy variable created**

**metformin  - Dummy variable created**

**repaglinide  - Dummy variable created**

**nateglinide  - Dummy variable created**

**glimepiride  - Dummy variable created**

**glipizide  - Dummy variable created**

**glyburide  - Dummy variable created**

**pioglitazone  - Dummy variable created**

**rosiglitazone  - Dummy variable created**

**insulin  - Dummy variable created**

**glyburide-metformin  - Dummy variable created**

**change - No 0, Change 1**

**diabetesMed - No 0, Yes 1**

**readmitted - changed to boolean with No 0, Yes 1**

**race_AfricanAmerican race_Asian race_Caucasian race_Hispanic race_Other - Dummy variable created, removed ?**

**age_[10-20) age_[20-30) age_[30-40) age_[40-50) age_[50-60) age_[60-70) age_[70-80) age_[80-90) age_[90-100) - Dummy variable created, removed [0-10)**