In [151]:
from IPython.display import Image
import numpy as np
import os
import pandas as pd

In [152]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline

In [153]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(42)
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [154]:
# Dataset folder, resolved against the current working directory
# (the 'dataset_diabetes' folder is expected to sit next to wherever the kernel was started).
dataDirectory = os.path.join(os.getcwd(), 'dataset_diabetes')

In [155]:
# Load the training split, then separate the feature columns from the 'readmitted' target column.
trainingSet = pd.read_csv(os.path.join(dataDirectory, 'diabetic_data_train.csv'))
# Features: every column except the target. drop() returns a new frame, so trainingSet is untouched.
diabetes = trainingSet.drop('readmitted', axis=1)
# Target labels, one per row of trainingSet.
readmittedLabels = trainingSet['readmitted']

In [156]:
# Label Transforms: turn the 'readmitted' class labels into integer codes.
# The fitted encoder is kept in `labelEncoder` so the codes can be mapped back to the original labels.
# NOTE(review): LabelEncoder is already imported in the imports cell; this repeated import is redundant.
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
readmittedLabelsEncoded = labelEncoder.fit_transform(readmittedLabels)

In [157]:
# Translate DataFrame column names into their positional indices
def find_column_numbers(dataFrame, columnNamesList):
    """Return the position of each named column, in the order the names are given.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose ``columns`` index is searched.
    columnNamesList : iterable of str
        Column names to look up.

    Returns
    -------
    list
        One ``dataFrame.columns.get_loc(name)`` result per requested name.

    Raises
    ------
    KeyError
        If a requested name is not a column of ``dataFrame``.
    """
    positions = []
    for columnName in columnNamesList:
        positions.append(dataFrame.columns.get_loc(columnName))
    return positions

In [158]:
# Transformer for diagnoses columns
from sklearn.base import BaseEstimator, TransformerMixin
class RenameDiagnosesColumns(BaseEstimator, TransformerMixin):
    """Collapse raw ICD-9 diagnosis codes into nine integer diagnosis groups.

    Group numbers written into the diagnosis columns:
    1 Circulatory, 2 Respiratory, 3 Digestive, 4 Injury, 5 Diabetes
    (code 250 and its 250.xx sub-codes), 6 Musculoskeletal, 7 Genitourinary,
    8 Neoplasms, 9 everything else (V/E codes, unlisted codes, missing values).

    Parameters
    ----------
    columnNumbers : list of int
        Positional indices of the columns of ``X`` that hold diagnosis codes.
    """
    def __init__(self, columnNumbers):
        # Code lists per group, stored as strings because the raw codes are strings.
        self.circulatory = self.int_to_str_list(390,460) + ['785']
        self.respiratory = self.int_to_str_list(460,520) + ['786']
        self.digestive = self.int_to_str_list(520,580) + ['787']
        self.injury = self.int_to_str_list(800,1000)
        self.musculoskeletal = self.int_to_str_list(710,740)
        self.genitourinary = self.int_to_str_list(580,630) + ['788']
        self.neoplasms = self.int_to_str_list(140, 240)
        # Group numbers that are not the catch-all group 9 (kept for backward compatibility).
        self.notOthers = [1, 2, 3, 4, 5, 6, 7, 8]
        self.columnNumbers = columnNumbers
    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self
    def int_to_str_list(self, l, h):
        """Return the integers in ``[l, h)`` as a list of strings."""
        return [str(number) for number in range(l, h)]
    def _build_lookup(self):
        """Return a dict mapping each listed code string to its group number."""
        groups = (
            (1, self.circulatory),
            (2, self.respiratory),
            (3, self.digestive),
            (4, self.injury),
            (5, ['250']),
            (6, self.musculoskeletal),
            (7, self.genitourinary),
            (8, self.neoplasms),
        )
        return {code: number for number, codes in groups for code in codes}
    def _categorize(self, value, lookup):
        """Return the group number (1-9) for one raw diagnosis value."""
        # Classify on the part before the decimal point so sub-codes such as
        # '250.83' fall into their parent group. str() keeps NaN/None and
        # numeric inputs from raising; anything unknown falls into group 9.
        head = str(value).strip().split('.')[0]
        return lookup.get(head, 9)
    def transform(self, X):
        """Return a copy of ``X`` with the diagnosis columns replaced by group numbers.

        Every value is classified exactly once from its raw code, so a raw code
        such as '8' can never be mistaken for an already-assigned group number.

        Parameters
        ----------
        X : 2-D array-like
            Data whose columns listed in ``columnNumbers`` hold raw diagnosis codes.

        Returns
        -------
        numpy.ndarray
            Object-dtype array of the same shape; diagnosis columns hold ints 1-9.
        """
        lookup = self._build_lookup()
        # Object-dtype copy: leaves the caller's data untouched and stores the
        # group numbers as real ints instead of letting NumPy coerce them to strings.
        X = np.array(X, dtype=object)
        for col in self.columnNumbers:
            X[:, col] = [self._categorize(value, lookup) for value in X[:, col]]
        return X

In [159]:
# Imputer configured to replace the literal '?' placeholder with NaN (constant strategy).
# NOTE(review): the cleaning cell also replaces '?' with NaN directly on the DataFrame, so this
# imputer presumably finds nothing to replace when it runs inside the pipeline — confirm it is needed.
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values = '?', fill_value = np.nan, strategy = 'constant')

In [160]:
# Clean the feature frame in one chain (no in-place mutation).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): rows are dropped here, but readmittedLabels / readmittedLabelsEncoded were built
# from the unfiltered frame — confirm they are re-aligned (e.g. via diabetes.index) before training.
diabetes = (
    diabetes
    # replace ? with NaN
    .replace('?', np.nan)
    # drop irrelevant columns
    .drop(['weight', 'payer_code', 'encounter_id', 'patient_nbr', 'medical_specialty',
           'admission_source_id', 'admission_type_id', 'number_outpatient', 'number_emergency'],
          axis=1)
    # drop rows with missing values
    .dropna(subset=['race', 'diag_1', 'diag_2', 'diag_3'])
)

In [161]:
# Split a DataFrame's column labels into numerical and categorical groups by dtype
def get_num_and_cat_columns(dataFrame):
    """Return ``(numerical_columns, categorical_columns)`` for ``dataFrame``.

    Numerical columns are those with dtype int64 or float64; categorical
    columns are those with dtype object, bool or category. Columns of any
    other dtype appear in neither result.

    Returns
    -------
    tuple of pandas.Index
        Column labels of the numerical subset, then of the categorical subset.
    """
    numericalFrame = dataFrame.select_dtypes(include=['int64', 'float64'])
    categoricalFrame = dataFrame.select_dtypes(include=['object', 'bool', 'category'])
    return numericalFrame.columns, categoricalFrame.columns

In [162]:
# The three diagnosis columns and their positional indices within `diabetes`.
# The positions are computed after the cleaning cell dropped columns, so they refer to the cleaned frame.
diagnosesColumnNames = ['diag_1', 'diag_2', 'diag_3']
columnNumbers = find_column_numbers(diabetes, diagnosesColumnNames)

In [163]:
# Column labels split by dtype; the diagnosis columns are handled separately,
# so they are removed from the categorical list.
num_attribs, cat_attribs = get_num_and_cat_columns(diabetes)
cat_attribs = [
    columnName
    for columnName in cat_attribs
    if columnName not in diagnosesColumnNames
]

In [164]:
# Encoder instance used to integer-encode the categorical feature columns.
le = LabelEncoder()

In [165]:
# Integer-encode every non-diagnosis categorical column, overwriting the column in `diabetes`.
# NOTE(review): the single `le` instance is re-fitted on each column, so after the loop it only
# holds the classes of the last column; the other encodings cannot be inverted or reapplied to new data.
for col in cat_attribs:
    diabetes[col] = le.fit_transform(diabetes[col])

In [166]:
# Two-step pipeline: '?' imputation followed by regrouping of the diagnosis columns.
# columnNumbers are positions within the full `diabetes` frame, so this pipeline must receive all of its columns.
pipeline1 = Pipeline([
    ('imputer', imputer), # by the author's own note, this step serves no purpose here
    ('dt', RenameDiagnosesColumns(columnNumbers))
])

In [168]:
# Combine the preprocessing steps into one ColumnTransformer.
# NOTE(review): ColumnTransformer runs each transformer on the ORIGINAL input columns and
# concatenates the outputs side by side; it does not chain them. As written:
#   - 'pipe1' receives every column and emits all of them (diagnosis columns regrouped),
#   - 'ohe' one-hot encodes the raw, un-grouped diagnosis codes plus the categorical columns,
#   - remainder='passthrough' has nothing left to pass, because 'pipe1' already selects every column.
# Each categorical/diagnosis column therefore appears twice in the output and the diagnosis grouping
# never reaches the encoder — confirm whether the grouping was meant to feed the one-hot encoder.
full_pipeline = ColumnTransformer([
    ('pipe1', pipeline1, list(diabetes)),
    ('ohe', OneHotEncoder(), cat_attribs + diagnosesColumnNames)
], remainder='passthrough')

In [169]:
# Fit the full preprocessing pipeline on the cleaned training features and transform them in one step.
diabetesPrepared = full_pipeline.fit_transform(diabetes)

(78446, 2290)