In [34]:
import numpy as np
import sklearn
from sklearn import preprocessing
import pandas as pd

In [35]:
#data = pandas.read_csv('dataset_diabetes/diabetic_data.csv', ',')

In [36]:
# Load the encounter table, using the first CSV column (encounter_id) as the index.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv with
# index_col=0 is its replacement. from_csv's parse_dates=True default is not carried
# over because the index holds integer encounter ids, not dates.
df = pd.read_csv('dataset_diabetes/diabetic_data.csv', index_col=0)

In [37]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0_level_0,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,...,No,No,No,No,No,No,No,No,No,NO
149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,...,No,Up,No,No,No,No,No,Ch,Yes,>30
64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,...,No,No,No,No,No,No,No,No,Yes,NO
500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,...,No,Up,No,No,No,No,No,Ch,Yes,NO
16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Preprocessing

In [38]:
# Distinct values of the readmission column.
df['readmitted'].unique()

array(['NO', '>30', '<30'], dtype=object)

In [39]:
# transform readmitted categories into numerical values
#readmittedEncoder = preprocessing.LabelEncoder()
#readmittedEncoder.fit(df.readmitted)
#df.readmitted = readmittedEncoder.transform(df.readmitted)

In [40]:
# Distinct values of the gender column before encoding.
df['gender'].unique()

array(['Female', 'Male', 'Unknown/Invalid'], dtype=object)

In [41]:
# Replace the gender strings with integer codes. LabelEncoder numbers the sorted
# class labels, so Female -> 0, Male -> 1, Unknown/Invalid -> 2.
# The fitted encoder is kept so the codes can be mapped back to labels later.
genderEncoder = preprocessing.LabelEncoder()
df['gender'] = genderEncoder.fit_transform(df['gender'])

In [42]:
# Distinct age brackets before they are converted to numbers.
df['age'].unique()

array(['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)',
       '[60-70)', '[70-80)', '[80-90)', '[90-100)'], dtype=object)

In [43]:
# transforms range string into a numerical value in the middle of the range
def rangeStringToMiddle(rangeString):
    """Return the integer midpoint of a bracketed range string such as '[70-80)'.

    The first and last characters (the brackets) are discarded and the remainder is
    split on '-' into a lower and an upper bound.

    The midpoint is computed with floor division so the result is an int on every
    Python version: '[70-80)' -> 75 and '[75-100)' -> 87. This matches the values
    recorded in this notebook; plain `/` would yield 75.0 / 87.5 on Python 3.

    Parameters
    ----------
    rangeString : str
        Range in the form '<open bracket><low>-<high><close bracket>'.

    Returns
    -------
    int
        (low + high) // 2
    """
    # Method call instead of str.split(...) so any string type (e.g. Python 2
    # unicode) is accepted.
    bounds = rangeString[1:-1].split('-')  # drop the brackets, then split the bounds
    return (int(bounds[0]) + int(bounds[1])) // 2

In [44]:
# Quick sanity check of the helper on one age bracket.
rangeStringToMiddle(rangeString='[70-80)')

75

In [45]:
# Collapse each age bracket into a single number (the middle of the bracket).
# Note: this overwrites the string column, so the cell can only be run once per load.
df['age'] = df['age'].map(rangeStringToMiddle)

In [46]:
# Distinct weight brackets before they are converted to numbers.
df['weight'].unique()

array(['?', '[75-100)', '[50-75)', '[0-25)', '[100-125)', '[25-50)',
       '[125-150)', '[175-200)', '[150-175)', '>200'], dtype=object)

In [47]:
# transforms a weight range string into a weight in the middle of the range ('?' -> 0, '>200' -> 200)
def weightRangeToWeight(weightRange):
    """Convert a weight bracket string into a single number.

    Two values are not bracketed ranges and are mapped directly:
    '?' (weight not recorded) becomes 0 and '>200' becomes 200.
    Every other value is handed to rangeStringToMiddle.
    """
    sentinelWeights = (('?', 0), ('>200', 200))
    for sentinel, weight in sentinelWeights:
        if weightRange == sentinel:
            return weight
    return rangeStringToMiddle(weightRange)

In [48]:
# Collapse each weight bracket into a single number ('?' becomes 0, '>200' becomes 200).
# Note: this overwrites the string column, so the cell can only be run once per load.
df['weight'] = df['weight'].map(weightRangeToWeight)

In [49]:
# Distinct weight values after the conversion to numbers.
df['weight'].unique()

array([  0,  87,  62,  12, 112,  37, 137, 187, 162, 200])

In [50]:
# Distinct values of the race column.
df['race'].unique()

array(['Caucasian', 'AfricanAmerican', '?', 'Other', 'Asian', 'Hispanic'], dtype=object)

In [51]:
# Distinct admission type ids.
df['admission_type_id'].unique()

array([6, 1, 2, 3, 4, 5, 8, 7])

In [52]:
# Distinct admission source ids.
df['admission_source_id'].unique()

array([ 1,  7,  2,  4,  5,  6, 20,  3, 17,  8,  9, 14, 10, 22, 11, 25, 13])

In [53]:
# Distinct payer codes.
df['payer_code'].unique()

array(['?', 'MC', 'MD', 'HM', 'UN', 'BC', 'SP', 'CP', 'SI', 'DM', 'CM',
       'CH', 'PO', 'WC', 'OT', 'OG', 'MP', 'FR'], dtype=object)

In [54]:
# Distinct discharge disposition ids.
df['discharge_disposition_id'].unique()

array([25,  1,  3,  6,  2,  5, 11,  7, 10,  4, 14, 18,  8, 13, 12, 16, 17,
       22, 23,  9, 20, 15, 24, 28, 19, 27])

In [55]:
# Distinct medical specialties.
df['medical_specialty'].unique()

array(['Pediatrics-Endocrinology', '?', 'InternalMedicine',
       'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
       'Orthopedics', 'Gastroenterology',
       'Surgery-Cardiovascular/Thoracic', 'Nephrology',
       'Orthopedics-Reconstructive', 'Psychiatry', 'Emergency/Trauma',
       'Pulmonology', 'Surgery-Neuro',
       'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology',
       'Pediatrics', 'Hematology/Oncology', 'Otolaryngology',
       'Surgery-Colon&Rectal', 'Pediatrics-CriticalCare', 'Endocrinology',
       'Urology', 'Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology',
       'Neurology', 'Anesthesiology-Pediatric', 'Radiology',
       'Pediatrics-Hematology-Oncology', 'Psychology', 'Podiatry',
       'Gynecology', 'Oncology', 'Pediatrics-Neurology', 'Surgery-Plastic',
       'Surgery-Thoracic', 'Surgery-PlasticwithinHeadandNeck',
       'Ophthalmology', 'Surgery-Pediatric',
       'Pediatrics-EmergencyMedicine', 'PhysicalMedicineandRehabilit

In [56]:
# One-hot encode each categorical column and append the indicator columns to df.
# Indicator columns are named '<column>_<value>' (e.g. 'readmitted_NO'); the
# original columns stay in the frame.
# Note: re-running this cell joins columns that already exist, so run it once per load.
categoricalColumns = [
    'readmitted',
    'race',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'payer_code',
    'medical_specialty',
]
for columnName in categoricalColumns:
    # The default prefix_sep='_' yields the same '<column>_<value>' names.
    oneHotColumns = pd.get_dummies(df[columnName], prefix=columnName)
    df = df.join(oneHotColumns)

In [57]:
# Preview the first five rows after preprocessing and one-hot encoding.
df.head(n=5)

Unnamed: 0_level_0,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,medical_specialty_Surgery-General,medical_specialty_Surgery-Maxillofacial,medical_specialty_Surgery-Neuro,medical_specialty_Surgery-Pediatric,medical_specialty_Surgery-Plastic,medical_specialty_Surgery-PlasticwithinHeadandNeck,medical_specialty_Surgery-Thoracic,medical_specialty_Surgery-Vascular,medical_specialty_SurgicalSpecialty,medical_specialty_Urology
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2278392,8222157,Caucasian,0,5,0,6,25,1,1,?,...,0,0,0,0,0,0,0,0,0,0
149190,55629189,Caucasian,0,15,0,1,1,7,3,?,...,0,0,0,0,0,0,0,0,0,0
64410,86047875,AfricanAmerican,0,25,0,1,1,7,2,?,...,0,0,0,0,0,0,0,0,0,0
500364,82442376,Caucasian,1,35,0,1,1,7,2,?,...,0,0,0,0,0,0,0,0,0,0
16680,42519267,Caucasian,1,45,0,1,1,7,1,?,...,0,0,0,0,0,0,0,0,0,0


# Training

In [65]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [66]:
# One-vs-rest wrapper around a linear discriminant analysis base estimator.
clf = OneVsRestClassifier(estimator=LinearDiscriminantAnalysis())

In [67]:
# Model inputs: the three numeric columns produced during preprocessing.
featureColumns = ['age', 'weight', 'gender']
# Targets: the one-hot indicator columns generated from 'readmitted'.
labelColumns = ['readmitted_<30', 'readmitted_>30', 'readmitted_NO']

# Pull both selections out of the frame as plain NumPy arrays.
features = df.loc[:, featureColumns].values
labels = df.loc[:, labelColumns].values

In [68]:
# Fit on the full data set; the returned estimator is displayed as the cell output.
clf.fit(X=features, y=labels)

OneVsRestClassifier(estimator=LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001),
          n_jobs=1)

In [69]:
# Cross-validated score of the classifier, one value per fold.
# cv is pinned to 3 because the library default changed from 3 to 5 folds in
# scikit-learn 0.22; the three scores recorded for this cell came from 3 folds.
cross_val_score(clf, features, labels, cv=3, n_jobs=1)

array([ 0.54133011,  0.48528978,  0.55489063])