In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the cleaned diabetes dataset from the working directory.
# NOTE(review): later previews and the dtypes listing show an 'Unnamed: 0' int64 column,
# which looks like a saved index being read back as data. Consider index_col=0, but the
# positional slice columns[15:23] used for MedColumns would then have to be adjusted.
diabetes01 = pd.read_csv("0406_diabetes_clean.csv")

In [6]:
# Medical specialty is '?' (missing) for 47,015 rows, but many of the values that do exist can be consolidated:
diabetes01['medical_specialty'].value_counts()

?                                    47015
InternalMedicine                     13576
Emergency/Trauma                      7327
Family/GeneralPractice                6954
Cardiology                            5147
Surgery-General                       2930
Nephrology                            1507
Orthopedics                           1347
Orthopedics-Reconstructive            1147
Radiologist                           1096
Psychiatry                             820
Pulmonology                            815
Urology                                633
ObstetricsandGynecology                632
Surgery-Cardiovascular/Thoracic        615
Gastroenterology                       530
Surgery-Vascular                       516
Surgery-Neuro                          420
PhysicalMedicineandRehabilitation      382
Oncology                               315
Pediatrics                             201
Neurology                              194
Hematology/Oncology                    179
Endocrinolo

In [7]:
# Collapse sub-specialties into their parent by dropping everything from the first '-' onward
# (e.g. 'Surgery-General' -> 'Surgery', 'Orthopedics-Reconstructive' -> 'Orthopedics').
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].str.replace('-.*$', '', regex=True)

In [8]:
# Merge spelling variants and closely related specialties under a single label.
# No target label is itself a key, so one dict-based replace gives the same result
# as applying the substitutions one after another.
specialty_aliases = {
    'Hematology/Oncology': 'Oncology',
    'ObstetricsandGynecology': 'OBGYN',
    'Gynecology': 'OBGYN',
    'Obsterics&Gynecology': 'OBGYN',
    'Obsterics': 'OBGYN',
    'Neurophysiology': 'Neurology',
    'Surgeon': 'Surgery',
}
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace(specialty_aliases)

In [9]:
# Fold every specialty that occurs MIN_SPECIALTY_COUNT times or fewer into one 'Other' bucket.
MIN_SPECIALTY_COUNT = 500

valueseries = diabetes01['medical_specialty'].value_counts()

# Select the rare labels with a boolean filter rather than valueseries[i]: an integer key
# on a label-indexed Series relies on deprecated positional fallback.
rare_specialties = valueseries[valueseries <= MIN_SPECIALTY_COUNT].index

# One vectorised pass instead of rewriting the whole column once per rare label.
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].mask(
    diabetes01['medical_specialty'].isin(rare_specialties), 'Other')

In [10]:
# We are left with 15 categories (including the '?' placeholder), which may add value to our analysis:
diabetes01['medical_specialty'].value_counts()

?                         47015
InternalMedicine          13576
Emergency/Trauma           7327
Family/GeneralPractice     6954
Cardiology                 5152
Surgery                    4793
Orthopedics                2494
Other                      2245
Nephrology                 1507
Radiologist                1096
Psychiatry                  827
Pulmonology                 815
OBGYN                       709
Urology                     633
Gastroenterology            530
Name: medical_specialty, dtype: int64

In [12]:
# Category counts for diag_3_cat:
diabetes01['diag_3_cat'].value_counts()

Circulatory        28938
Diabetes           16522
Neoplasms          15106
Other              14500
Respiratory         6853
Genitourinary       6285
Digestive           3780
Musculoskeletal     1845
Injury              1844
Name: diag_3_cat, dtype: int64

In [13]:
# Category counts for diag_2_cat:
diabetes01['diag_2_cat'].value_counts()

Circulatory        30398
Neoplasms          15819
Diabetes           11776
Other              11603
Respiratory        10140
Genitourinary       7977
Digestive           3967
Injury              2286
Musculoskeletal     1707
Name: diag_2_cat, dtype: int64

In [14]:
# Category counts for diag_1_cat:
diabetes01['diag_1_cat'].value_counts()

Circulatory        28888
Respiratory        13511
Neoplasms          10500
Other               9682
Digestive           9045
Diabetes            7870
Injury              6590
Genitourinary       4870
Musculoskeletal     4717
Name: diag_1_cat, dtype: int64

In [16]:
# We want to combine the three diagnosis columns, but ALSO keep diag_1 on its own as the
# primary diagnosis, so copy it into a separate column that is dummified separately later:
diabetes01['primarydiag'] = diabetes01['diag_1_cat'].copy()

In [20]:
#Let's combine diagnosis (diag_1, diag_2, diag_3) into dummy variables and add them. We will do this for diag and diabfeature:

#This is a function to do this
def createcombineddummies(df, c1, c2, c3=None, c4=None, c5=None, prefix=''):
    '''Build one set of 0/1 indicator columns from two to five categorical columns.

    Each named column is one-hot encoded with the same ``prefix`` and the encodings
    are merged, so a category's indicator is 1 when that category occurs in ANY of
    the given columns for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns.
    c1, c2 : column labels
        The first two columns to combine (required).
    c3, c4, c5 : column labels, optional
        Further columns to combine; ``None`` entries are skipped.
    prefix : str
        Prefix passed to ``pd.get_dummies``; result columns are named
        ``<prefix>_<category>``.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicators, one column per category seen in any of the input
        columns, indexed like ``df``.
    '''
    columns = [col for col in (c1, c2, c3, c4, c5) if col is not None]

    combined = None
    for col in columns:
        # dtype=int keeps the indicators summable whatever get_dummies' default dtype is.
        indicators = pd.get_dummies(df[col], prefix=prefix, dtype=int)
        if combined is None:
            combined = indicators
        else:
            # fill_value=0: a category missing from one column counts as 0 there,
            # instead of turning the whole indicator column into NaN.
            combined = combined.add(indicators, fill_value=0)

    # A category repeated across columns (e.g. two respiratory diagnoses) still counts once.
    return combined.clip(upper=1).astype(int)

In [21]:
# Build the combined diagnosis dummies from diag_1_cat, diag_2_cat and diag_3_cat
# (columns are prefixed 'diag') and see how they look:
diagDummy = createcombineddummies(diabetes01, 'diag_1_cat', 'diag_2_cat', 'diag_3_cat', prefix='diag')

In [22]:
# Preview the combined dummies. This looks good, so next we combine it with diabetes01 and remove diag_1-3:
diagDummy.head(10)

Unnamed: 0,diag_Circulatory,diag_Diabetes,diag_Digestive,diag_Genitourinary,diag_Injury,diag_Musculoskeletal,diag_Neoplasms,diag_Other,diag_Respiratory
0,0,1,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0,1,0
2,1,1,0,0,0,0,0,1,0
3,0,1,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,1,0
6,1,1,0,0,0,0,0,0,1
7,1,0,0,0,0,0,0,1,0
8,1,0,0,0,0,0,1,0,1
9,1,1,0,0,1,0,0,0,0


In [23]:
# Swap the three raw diagnosis-category columns for the combined diag_* indicator columns.
raw_diag_columns = ['diag_1_cat', 'diag_2_cat', 'diag_3_cat']
diabetes02 = pd.concat([diabetes01.drop(columns=raw_diag_columns), diagDummy], axis=1)

In [25]:
# Preview diabetes02 after swapping the raw diagnosis columns for the combined dummies:
diabetes02.head()

Unnamed: 0.1,Unnamed: 0,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,primarydiag,diag_Circulatory,diag_Diabetes,diag_Digestive,diag_Genitourinary,diag_Injury,diag_Musculoskeletal,diag_Neoplasms,diag_Other,diag_Respiratory
0,0,Caucasian,Female,[10-20),3,?,59,0,18,0,...,Neoplasms,0,1,0,0,0,0,1,0,0
1,1,AfricanAmerican,Female,[20-30),2,?,11,5,13,2,...,Other,0,1,0,0,0,0,0,1,0
2,2,Caucasian,Male,[30-40),2,?,44,1,16,0,...,Other,1,1,0,0,0,0,0,1,0
3,3,Caucasian,Male,[40-50),1,?,51,0,8,0,...,Neoplasms,0,1,0,0,0,0,1,0,0
4,4,Caucasian,Male,[50-60),3,?,31,6,16,0,...,Circulatory,1,1,0,0,0,0,0,0,0


### make age numeric 

In [27]:
# Age is recorded as ten-year brackets; encode each bracket as its ordinal
# (1 = '[0-10)', 2 = '[10-20)', ..., 10 = '[90-100)') so it can be used as a number.
AGE_BRACKET_CODES = {'[{}-{})'.format(low, low + 10): low // 10 + 1 for low in range(0, 100, 10)}

# Values that are not bracket labels pass through unchanged, so re-running this cell is
# harmless. pd.to_numeric then guarantees a numeric column and raises on any unexpected
# label, rather than relying on replace() to downcast an object column implicitly.
diabetes02['age'] = pd.to_numeric(
    diabetes02['age'].map(lambda bracket: AGE_BRACKET_CODES.get(bracket, bracket)))

In [28]:
# Preview diabetes02 with age recoded to 1-10:
diabetes02.head()

Unnamed: 0.1,Unnamed: 0,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,primarydiag,diag_Circulatory,diag_Diabetes,diag_Digestive,diag_Genitourinary,diag_Injury,diag_Musculoskeletal,diag_Neoplasms,diag_Other,diag_Respiratory
0,0,Caucasian,Female,2,3,?,59,0,18,0,...,Neoplasms,0,1,0,0,0,0,1,0,0
1,1,AfricanAmerican,Female,3,2,?,11,5,13,2,...,Other,0,1,0,0,0,0,0,1,0
2,2,Caucasian,Male,4,2,?,44,1,16,0,...,Other,1,1,0,0,0,0,0,1,0
3,3,Caucasian,Male,5,1,?,51,0,8,0,...,Neoplasms,0,1,0,0,0,0,1,0,0
4,4,Caucasian,Male,6,3,?,31,6,16,0,...,Circulatory,1,1,0,0,0,0,0,0,0


## dummify response variable 
readmitted = yes -> 1
readmitted = no -> 0

In [74]:
# Encode the response variable as 0/1.
readmitted_codes = {'No': 0, 'Yes': 1}
diabetes02['readmitted'] = diabetes02['readmitted'].replace(readmitted_codes)

In [75]:
# Class balance of the encoded response variable:
diabetes02['readmitted'].value_counts()

0    84650
1    11023
Name: readmitted, dtype: int64

##  create 3 slightly different DF's to test in our modeling. 
One which dummifies all 4 medication categories ("No", "Steady", "Up", "Down"), 
one which says if the patient is taking that medication at all ("Yes", "No"), 
and another one which says whether the patient has had any change in that medication ("Up" or "Down" → 1; "No" or "Steady" → 0). 

We are also adding another column (to all 3 DF's!) named "diabchange", indicating if there was any change at all to any diabetic medicine (1 = yes, 0 = no)

In [77]:
# Start from three independent copies of diabetes02, one per medication-encoding scheme:
DiabetesAllDummy, DiabetesTakingMed, DiabetesAnyChange = (
    diabetes02.copy() for copy_number in range(3)
)

In [78]:
#Function to replace a DF with dummy variables:
def ReplaceWithDummies(df, dummylist):
    '''Return a copy of ``df`` with each column in ``dummylist`` replaced by dummy columns.

    For every listed column the most frequent value is used as the reference level:
    its dummy column is dropped, the remaining ``<column>_<value>`` dummies are
    appended on the right, and the original column is removed. ``df`` itself is
    left untouched.
    '''
    encoded = df.copy()
    for column in dummylist:
        frequencies = encoded[column].value_counts().sort_values(ascending=False)
        reference_level = frequencies.index[0]

        indicator_cols = pd.get_dummies(encoded[column], prefix=column)
        # Drop the reference level so the remaining dummies are not collinear.
        indicator_cols = indicator_cols.drop(columns=column + "_" + str(reference_level))

        encoded = pd.concat([encoded.drop(columns=column), indicator_cols], axis=1)
    return encoded

In [79]:
# Columns to be dummified.
# The medication columns are selected by POSITION (columns 15 through 22 of diabetes02),
# so this selection changes silently if the column order of diabetes02 changes.
MedColumns = diabetes02.columns[15:23].tolist()

OtherDummyColumns = ['race', 'gender', 'dischargeDisposition', 'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed',
                     'admissionType', 'admissionSource', 'medical_specialty', 'primarydiag']

# The "AllDummy" frame dummifies both groups; the other two frames only use OtherDummyColumns.
AllDummyColumns = OtherDummyColumns + MedColumns

In [80]:
#Make all dummy DFs (all columns for "AllDummy"; only the non-Med columns for TakingMed and AnyChange,
#whose medication columns are recoded to 0/1 separately)
DiabetesAllDummy = ReplaceWithDummies(DiabetesAllDummy, AllDummyColumns)
DiabetesTakingMed = ReplaceWithDummies(DiabetesTakingMed, OtherDummyColumns)
DiabetesAnyChange = ReplaceWithDummies(DiabetesAnyChange, OtherDummyColumns)

In [81]:
# TakingMed frame: 0 when the medication is not taken ('No'), 1 for any other status.
taking_med_codes = {'No': 0, 'Steady': 1, 'Down': 1, 'Up': 1}
for med in MedColumns:
    DiabetesTakingMed[med] = DiabetesTakingMed[med].replace(taking_med_codes)

In [82]:
# AnyChange frame: 1 when the dose went 'Up' or 'Down', 0 for 'No' and 'Steady'.
dose_change_codes = {'No': 0, 'Steady': 0, 'Down': 1, 'Up': 1}
for med in MedColumns:
    DiabetesAnyChange[med] = DiabetesAnyChange[med].replace(dose_change_codes)

In [83]:
# New feature "diabchange": 1 if ANY of the MedColumns medications was changed, else 0.
# Row-wise sum of the 0/1 change flags, capped at 1 (skipna=False mirrors plain addition).
changed_med_count = DiabetesAnyChange[MedColumns].sum(axis=1, skipna=False)
DiabetesAnyChange['diabchange'] = changed_med_count.clip(upper=1)

In [84]:
# Preview the AnyChange frame, now ending with the diabchange column:
DiabetesAnyChange.head()

Unnamed: 0.1,Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,medical_specialty_Urology,primarydiag_Diabetes,primarydiag_Digestive,primarydiag_Genitourinary,primarydiag_Injury,primarydiag_Musculoskeletal,primarydiag_Neoplasms,primarydiag_Other,primarydiag_Respiratory,diabchange
0,0,2,3,59,0,18,0,0,0,9,...,0,0,0,0,0,0,1,0,0,1
1,1,3,2,11,5,13,2,0,1,6,...,0,0,0,0,0,0,0,1,0,0
2,2,4,2,44,1,16,0,0,0,7,...,0,0,0,0,0,0,0,1,0,1
3,3,5,1,51,0,8,0,0,0,5,...,0,0,0,0,0,0,1,0,0,0
4,4,6,3,31,6,16,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# Copy the diabchange column computed on DiabetesAnyChange onto the other two frames:
for target_frame in (DiabetesAllDummy, DiabetesTakingMed):
    target_frame['diabchange'] = DiabetesAnyChange['diabchange']

In [86]:
# Preview the AllDummy frame (medication columns expanded into <med>_<status> dummies):
DiabetesAllDummy.head()

Unnamed: 0.1,Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,pioglitazone_Down,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_Down,rosiglitazone_Steady,rosiglitazone_Up,insulin_Down,insulin_Steady,insulin_Up,diabchange
0,0,2,3,59,0,18,0,0,0,9,...,0,0,0,0,0,0,0,0,1,1
1,1,3,2,11,5,13,2,0,1,6,...,0,0,0,0,0,0,0,0,0,0
2,2,4,2,44,1,16,0,0,0,7,...,0,0,0,0,0,0,0,0,1,1
3,3,5,1,51,0,8,0,0,0,5,...,0,0,0,0,0,0,0,1,0,0
4,4,6,3,31,6,16,0,0,0,9,...,0,0,0,0,0,0,0,1,0,0


In [87]:
# Inspect the column dtypes of the AllDummy frame:
DiabetesAllDummy.dtypes

Unnamed: 0                                                                        int64
age                                                                               int64
time_in_hospital                                                                  int64
num_lab_procedures                                                                int64
num_procedures                                                                    int64
num_medications                                                                   int64
number_outpatient                                                                 int64
number_emergency                                                                  int64
number_inpatient                                                                  int64
number_diagnoses                                                                  int64
readmitted                                                                        int64
diag_Circulatory                

In [88]:
# Preview the TakingMed frame:
DiabetesTakingMed.head()

Unnamed: 0.1,Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,medical_specialty_Urology,primarydiag_Diabetes,primarydiag_Digestive,primarydiag_Genitourinary,primarydiag_Injury,primarydiag_Musculoskeletal,primarydiag_Neoplasms,primarydiag_Other,primarydiag_Respiratory,diabchange
0,0,2,3,59,0,18,0,0,0,9,...,0,0,0,0,0,0,1,0,0,1
1,1,3,2,11,5,13,2,0,1,6,...,0,0,0,0,0,0,0,1,0,0
2,2,4,2,44,1,16,0,0,0,7,...,0,0,0,0,0,0,0,1,0,1
3,3,5,1,51,0,8,0,0,0,5,...,0,0,0,0,0,0,1,0,0,0
4,4,6,3,31,6,16,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0


In [89]:
# Check that every column of all three frames is numeric by converting column by column;
# pd.to_numeric raises if a column holds a value it cannot parse.
for frame in (DiabetesAllDummy, DiabetesTakingMed, DiabetesAnyChange):
    for column in list(frame.columns):
        frame[column] = pd.to_numeric(frame[column])
# Reaching this point means all three frames converted cleanly.

In [91]:

# Give each frame a fresh 0..n-1 index before writing it out.
for frame in (DiabetesAllDummy, DiabetesTakingMed, DiabetesAnyChange):
    frame.index = list(range(len(frame)))

In [92]:
# Write the three frames to CSV for further analysis.
# to_csv is called with its defaults, so the index is written as the first column.
for frame, csv_name in (
    (DiabetesAllDummy, 'DiabetesAllDummyF.csv'),
    (DiabetesTakingMed, 'DiabetesTakingMedF.csv'),
    (DiabetesAnyChange, 'DiabetesAnyChangeF.csv'),
):
    frame.to_csv(csv_name)