In [1]:
from IPython.display import Image
import numpy as np
import os
import pandas as pd

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline

In [3]:
# Seed NumPy's global random generator so that steps drawing from it repeat across runs.
np.random.seed(42)
# None lifts the cap on the number of columns pandas prints for a frame.
pd.set_option('display.max_columns', None)

In [4]:
# Data files are looked up in a 'dataset_diabetes' folder under the current working directory.
dataDirectory = os.path.join(os.getcwd(), 'dataset_diabetes')

In [5]:
# Read the training split, then work on a copy so `trainingSet` keeps the data exactly as loaded.
trainingSet = pd.read_csv(os.path.join(dataDirectory, 'diabetic_data_train.csv'))
diabetes = trainingSet.copy()

In [6]:
# Preview the first rows of the data as loaded.
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,81844290,94788,Caucasian,Female,[70-80),?,1,1,7,4,?,InternalMedicine,48,0,11,0,0,0,276,402,428,9,,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,396159158,135023315,Caucasian,Male,[50-60),?,1,1,7,1,BC,?,42,0,5,0,0,0,427,250,278,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,>30
2,31258956,18397782,Caucasian,Male,[80-90),?,1,1,7,4,?,?,44,0,10,0,0,0,599,788,599,7,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,210691074,67509558,Caucasian,Male,[80-90),?,1,3,7,3,MC,?,54,0,8,0,0,0,331,309,331,8,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
4,104902980,23272362,AfricanAmerican,Female,[70-80),?,1,11,7,11,MC,Nephrology,35,3,23,0,0,1,38,486,403,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO


In [7]:
# find column numbers using column names of a Pandas DataFrame
def find_column_numbers(dataFrame, columnNamesList):
    """Return the positional index of each named column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose ``columns`` index is searched.
    columnNamesList : iterable of column labels
        Labels to look up.

    Returns
    -------
    list
        One ``columns.get_loc`` result per label, in the order the labels
        were given.
    """
    locate = dataFrame.columns.get_loc
    return list(map(locate, columnNamesList))

In [8]:
# Transformer for diagnoses columns
from sklearn.base import BaseEstimator, TransformerMixin
class RenameDiagnosesColumns(BaseEstimator, TransformerMixin):
    """Replace raw ICD-9 diagnosis codes with one of nine integer group codes.

    Group codes written by ``transform``:

        1  circulatory      390-459, 785
        2  respiratory      460-519, 786
        3  digestive        520-579, 787
        4  injury           800-999
        5  diabetes         250 and its decimal sub-codes (250.xx)
        6  musculoskeletal  710-739
        7  genitourinary    580-629, 788
        8  neoplasms        140-239
        9  other            anything else, including non-string cells

    Parameters
    ----------
    columnNumbers : list of int
        Positional indices of the diagnosis columns inside the 2-D array
        passed to ``transform``.
    """

    def __init__(self, columnNumbers):
        self.circulatory = self.int_to_str_list(390, 460) + ['785']
        self.respiratory = self.int_to_str_list(460, 520) + ['786']
        self.digestive = self.int_to_str_list(520, 580) + ['787']
        self.injury = self.int_to_str_list(800, 1000)
        self.musculoskeletal = self.int_to_str_list(710, 740)
        self.genitourinary = self.int_to_str_list(580, 630) + ['788']
        self.neoplasms = self.int_to_str_list(140, 240)
        # Group codes 1-8. Kept as a public attribute; transform does not read it.
        self.notOthers = [1, 2, 3, 4, 5, 6, 7, 8]
        self.columnNumbers = columnNumbers

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def int_to_str_list(self, l, h):
        """Return the integers in ``[l, h)`` as a list of decimal strings."""
        return [str(code) for code in range(l, h)]

    def _group_lookup(self):
        """Build a dict mapping each listed ICD-9 code string to its group code."""
        groups = (
            (1, self.circulatory),
            (2, self.respiratory),
            (3, self.digestive),
            (4, self.injury),
            (6, self.musculoskeletal),
            (7, self.genitourinary),
            (8, self.neoplasms),
        )
        lookup = {}
        for group_code, icd_codes in groups:
            for icd_code in icd_codes:
                # First group listed wins if a code were ever present in two lists.
                lookup.setdefault(icd_code, group_code)
        return lookup

    def _group_of(self, value, lookup):
        """Return the group code (1-9) for a single cell value."""
        if not isinstance(value, str):
            # Missing values (NaN/None) and other non-text cells count as "other".
            return 9
        # ICD-9 groups are decided by the part before the decimal point,
        # so '250.83' is looked up as '250'.
        base = value.strip().split('.')[0]
        if base in lookup:
            return lookup[base]
        if base.isdecimal() and int(base) == 250:
            return 5
        return 9

    def transform(self, X):
        """Return a copy of ``X`` with the diagnosis columns replaced by group codes.

        Parameters
        ----------
        X : 2-D array-like
            Data whose columns at ``columnNumbers`` hold ICD-9 codes as strings.

        Returns
        -------
        numpy.ndarray
            Object-dtype array shaped like ``X``. The selected columns hold the
            integers 1-9; all other columns are unchanged. ``X`` itself is
            not modified.
        """
        # An object-dtype copy stores the integer codes as integers (a string
        # array would turn them back into text) and leaves the caller's data alone.
        X = np.array(X, dtype=object)
        lookup = self._group_lookup()
        for col in self.columnNumbers:
            # Each value is classified exactly once. Writing integer codes back
            # between several passes would mix ints with the remaining strings,
            # which NumPy coerces to strings, so a later integer membership test
            # would reject every value.
            X[:, col] = [self._group_of(value, lookup) for value in X[:, col]]
        return X

In [9]:
# Constant-strategy imputer: every cell equal to the string '?' is replaced by NaN.
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values = '?', fill_value = np.nan, strategy = 'constant')

In [10]:
# Clean the working frame in one chained expression, without in-place mutation:
#   1. turn the '?' placeholder into NaN (np.nan -- the np.NaN alias was removed in NumPy 2.0),
#   2. drop the columns that are not used as features,
#   3. drop rows where race or any of the three diagnosis codes is missing.
diabetes = (
    diabetes
    .replace('?', np.nan)
    .drop(columns=['weight', 'payer_code', 'encounter_id', 'patient_nbr', 'medical_specialty',
                   'admission_source_id', 'admission_type_id', 'number_outpatient', 'number_emergency'])
    .dropna(subset=['race', 'diag_1', 'diag_2', 'diag_3'])
)

In [13]:
# diabetes.head()

In [14]:
# Separate the target from the features: keep a copy of 'readmitted', then drop it from the frame.
readmittedLabels = diabetes['readmitted'].copy()
diabetes = diabetes.drop('readmitted', axis=1)
# Label Transforms
# Encode the string classes as integers; this encoder is kept separate from the
# one used for the categorical feature columns.
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
readmittedLabelsEncoded = labelEncoder.fit_transform(readmittedLabels)

In [15]:
# Inspect the integer-encoded target vector.
readmittedLabelsEncoded

array([2, 1, 2, ..., 2, 1, 2])

In [16]:
# Preview the feature frame after the column/row drops and the removal of the target.
diabetes.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Caucasian,Female,[70-80),1,4,48,0,11,0,276,402,428,9,,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Male,[50-60),1,1,42,0,5,0,427,250,278,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No
2,Caucasian,Male,[80-90),1,4,44,0,10,0,599,788,599,7,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Male,[80-90),3,3,54,0,8,0,331,309,331,8,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes
4,AfricanAmerican,Female,[70-80),11,11,35,3,23,1,38,486,403,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No


In [17]:
#get numerical and categorical columns
def get_num_and_cat_columns(dataFrame):
    """Split a DataFrame's column labels into numeric and categorical groups.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    numerical_ix : pandas.Index
        Labels of all columns with a numeric dtype.
    categorical_ix : pandas.Index
        Labels of all columns with object, bool or category dtype.
    """
    # 'number' matches every integer and floating width (int32, float32, ...),
    # not only the 64-bit ones, so narrower numeric columns are not silently
    # left out of both groups.
    numerical_ix = dataFrame.select_dtypes(include='number').columns
    categorical_ix = dataFrame.select_dtypes(include=['object', 'bool', 'category']).columns
    return numerical_ix, categorical_ix

In [18]:
# Positions of the three diagnosis columns within the feature frame;
# RenameDiagnosesColumns addresses its columns by position, not by name.
diagnosesColumnNames = ['diag_1', 'diag_2', 'diag_3']
columnNumbers = find_column_numbers(diabetes, diagnosesColumnNames)

In [19]:
# Split the column labels by dtype, then take the diagnosis columns out of the
# categorical list so they can be treated separately from the other categoricals.
num_attribs, cat_attribs = get_num_and_cat_columns(diabetes)
cat_attribs = [x for x in list(cat_attribs) if x not in diagnosesColumnNames]

In [20]:
# Encoder for the categorical feature columns (the target uses its own `labelEncoder`).
le = LabelEncoder()

In [21]:
# Integer-encode each categorical column with its own encoder and keep the fitted
# encoders by column name. Refitting a single shared encoder would overwrite its
# classes_ on every iteration, leaving no way to apply the same mapping to new
# data or to decode the integers afterwards.
labelEncodersByColumn = {}
for col in cat_attribs:
    le = LabelEncoder()
    diabetes[col] = le.fit_transform(diabetes[col])
    labelEncodersByColumn[col] = le

In [22]:
# Two-step pipeline: the '?' -> NaN imputer, then the diagnosis-code grouping
# applied at the positions in `columnNumbers`.
pipeline1 = Pipeline([
    # NOTE(review): '?' is already replaced with NaN in the cleaning cell, so this
    # step presumably changes no values here -- confirm whether it is still needed.
    ('imputer', imputer), # adding this for no reason at all
    ('dt', RenameDiagnosesColumns(columnNumbers))
])

In [23]:
# ColumnTransformer gives every entry its columns from the ORIGINAL frame and
# concatenates the outputs side by side:
#   * 'pipe1' receives all columns and returns them with diag_* grouped;
#   * 'ohe' one-hot encodes the categorical columns plus the raw diag_* codes.
# Because 'pipe1' already claims every column, remainder='passthrough' has
# nothing left to pass through.
# NOTE(review): 'ohe' never sees the grouped diagnosis codes produced by 'pipe1',
# and each encoded column is also present un-encoded through 'pipe1' -- confirm
# that this duplication is intended.
full_pipeline = ColumnTransformer([
    ('pipe1', pipeline1, list(diabetes)),
    # Categories first met at transform time (e.g. a diagnosis code absent from
    # the fitted rows) are encoded as all zeros instead of raising an error.
    ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_attribs + diagnosesColumnNames)
], remainder='passthrough')

In [24]:
# Fit every preprocessing step on the feature frame and build the model matrix.
diabetesPrepared = full_pipeline.fit_transform(diabetes)

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
# Fit ordinary least squares on the integer-encoded class labels, i.e. the class
# codes are treated as a numeric target.
lin_reg = LinearRegression()
lin_reg.fit(diabetesPrepared, readmittedLabelsEncoded)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
# Spot check on the first five training rows: run them through the already-fitted
# preprocessing (transform, not fit_transform) and print the raw predictions.
some_data = diabetes.iloc[:5]
some_labels = readmittedLabelsEncoded[:5]
some_data_prepared = full_pipeline.transform(some_data)
print('Predictions: ', lin_reg.predict(some_data_prepared))

Predictions:  [1.42527356 1.70249949 1.45377887 1.54554608 1.45854716]


In [28]:
# Encoded true labels of the same five rows, for comparison with the predictions.
print('Labels: ', list(some_labels))

Labels:  [2, 1, 2, 2, 2]


In [29]:
# obviously linear regression doesn't work since the labels are categorical

In [31]:
from sklearn.metrics import mean_squared_error
# Error is measured on the same rows the model was fitted on (training error).
diabetes_predictions = lin_reg.predict(diabetesPrepared)
lin_mse = mean_squared_error(readmittedLabelsEncoded, diabetes_predictions)
# Keep the root in its own name so that lin_mse really holds the MSE, matching
# the tree_mse / tree_rmse naming used for the decision tree.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.6459575823502438

In [32]:
# Same features and target as the linear model, fitted with a decision tree
# regressor left at its default hyper-parameters.
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(diabetesPrepared, readmittedLabelsEncoded)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [33]:
# RMSE of the tree on the rows it was trained on. This is training error, so it
# says nothing about how the tree performs on unseen data.
diabetes_predictions = tree_reg.predict(diabetesPrepared)
tree_mse = mean_squared_error(readmittedLabelsEncoded, diabetes_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0