In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(100)

from collections import defaultdict

In [2]:
# sklearn imports
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# other stats/math imports
import math
from scipy.stats import chi2_contingency

In [5]:
diabetes = pd.read_csv("diabetic_data.csv", delimiter=None) 
diabetes = pd.DataFrame(diabetes)

In [6]:
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [25]:
{x: sum(np.isnan(diabetes[x]))/len(diabetes) for x in diabetes.columns if type(diabetes[x][0])!=str}
# checking for NA values in numeric diabetes columns

{'encounter_id': 0.0,
 'patient_nbr': 0.0,
 'admission_type_id': 0.0,
 'discharge_disposition_id': 0.0,
 'admission_source_id': 0.0,
 'time_in_hospital': 0.0,
 'num_lab_procedures': 0.0,
 'num_procedures': 0.0,
 'num_medications': 0.0,
 'number_outpatient': 0.0,
 'number_emergency': 0.0,
 'number_inpatient': 0.0,
 'number_diagnoses': 0.0,
 'disposition_boolean': 0.0}

In [32]:
{x: len(diabetes[x].unique()) for x in diabetes.columns if type(diabetes[x][0])!=str}

{'encounter_id': 101766,
 'patient_nbr': 71518,
 'admission_type_id': 8,
 'discharge_disposition_id': 26,
 'admission_source_id': 17,
 'time_in_hospital': 14,
 'num_lab_procedures': 118,
 'num_procedures': 7,
 'num_medications': 75,
 'number_outpatient': 39,
 'number_emergency': 33,
 'number_inpatient': 21,
 'number_diagnoses': 16,
 'disposition_boolean': 2}

In [36]:
patientNums = diabetes["patient_nbr"].unique()

In [37]:
patientNums[0]

8222157

In [39]:
diabetes[diabetes["patient_nbr"]==patientNums[1]]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,disposition_boolean
1,149190,55629189,Caucasian,Female,[10-20),[75-100),1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,0


In [26]:
numUnique = {x: len(diabetes[x].unique()) for x in diabetes.columns if type(diabetes[x][0])==str}
numUnique

{'race': 6,
 'gender': 3,
 'age': 10,
 'weight': 9,
 'payer_code': 18,
 'medical_specialty': 73,
 'diag_1': 7,
 'diag_2': 7,
 'diag_3': 7,
 'max_glu_serum': 4,
 'A1Cresult': 4,
 'metformin': 4,
 'repaglinide': 4,
 'nateglinide': 4,
 'chlorpropamide': 4,
 'glimepiride': 4,
 'acetohexamide': 2,
 'glipizide': 4,
 'glyburide': 4,
 'tolbutamide': 2,
 'pioglitazone': 4,
 'rosiglitazone': 4,
 'acarbose': 4,
 'miglitol': 4,
 'troglitazone': 2,
 'tolazamide': 3,
 'insulin': 4,
 'glyburide-metformin': 4,
 'glipizide-metformin': 2,
 'glimepiride-pioglitazone': 2,
 'metformin-rosiglitazone': 2,
 'metformin-pioglitazone': 2,
 'change': 2,
 'diabetesMed': 2,
 'readmitted': 3}

In [7]:
diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [27]:
# unique readmitted values
diabetes['readmitted'].unique()

array(['NO', '>30', '<30'], dtype=object)

In [28]:
feat_num

['num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses']

In [30]:
jerrydata = diabetes[diabetes['readmitted'] != 'NO']

In [31]:
jerrydata

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,disposition_boolean
1,149190,55629189,Caucasian,Female,[10-20),[75-100),1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,0
5,35754,82637451,Caucasian,Male,[50-60),[75-100),2,1,2,3,...,Steady,No,No,No,No,No,No,Yes,>30,0
7,63768,114882984,Caucasian,Male,[70-80),[75-100),1,1,7,5,...,No,No,No,No,No,No,No,Yes,>30,0
10,28236,89869032,AfricanAmerican,Female,[40-50),[75-100),1,1,7,9,...,Steady,No,No,No,No,No,No,Yes,>30,0
11,36900,77391171,AfricanAmerican,Male,[60-70),[75-100),2,1,4,7,...,Steady,No,No,No,No,No,Ch,Yes,<30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,443842016,183087545,Caucasian,Female,[70-80),[75-100),1,1,7,9,...,Steady,No,No,No,No,No,Ch,Yes,>30,0
101755,443842022,188574944,Other,Female,[40-50),[75-100),1,1,7,14,...,Up,No,No,No,No,No,Ch,Yes,>30,0
101756,443842070,140199494,Other,Female,[60-70),[75-100),1,1,7,2,...,Steady,No,No,No,No,No,No,Yes,>30,0
101760,443847176,50375628,AfricanAmerican,Female,[60-70),[75-100),1,1,7,6,...,Down,No,No,No,No,No,Ch,Yes,>30,0


## Split data

In [8]:
# Xtr, Xte, Ytr, Yte = train_test_split(X, y, test_size=0.33, random_state=1)

# Data Preprocessing
## Feature Engineering
### Numerical Features 
In this dataset the feature names make numerical value features self-evident. Each column with numerical features starts with "num_..."
### Categorical Features
Essentially every feature that is numerical can be considered categorical but it is not as simple as that. 
1) 2 features are patient ID features: ['encounter_id', 'patient_nbr']. It does not make sense to include them (unless we are considering a personalized Machine Learning model) <br>
2) It also does not make sense to include the target feature which is also categorical: ['readmitted'] <br>
3) Certain features have numerical values that represent categories, such as: ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']. This is something we will investigate further. 


In [9]:
# list column names of features that consist of numeric values
# (in this dataset the feature names make numerical value features self-evident)
feat_num = ['num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

# list column names of features that consist of categorical values
feat_cat = ['race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty', 'diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']

### Feature Scaling

As a first step, we will only normalize the numerical features. Later on, we will consider normalizing all features (for instance, if we use a multivariate feature selection model such as Lasso)

In [10]:
# convert numerical features from strings to floats
for f in feat_num:
    diabetes[f] = pd.to_numeric(diabetes[f],errors= 'coerce')
    
scaler = StandardScaler()
scaler.fit(diabetes[feat_num])
feat_numSc = scaler.transform(diabetes[feat_num])
# np.array(Xsc)
# Xsc.mean(axis = 0)

In [11]:
diabetes[feat_num]

Unnamed: 0,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,41,0,1,0,0,0,1
1,59,0,18,0,0,0,9
2,11,5,13,2,0,1,6
3,44,1,16,0,0,0,7
4,51,0,8,0,0,0,5
...,...,...,...,...,...,...,...
101761,51,0,16,0,0,0,9
101762,33,3,18,0,0,1,9
101763,53,0,9,1,0,0,13
101764,45,2,21,0,0,1,9


#### Count distinct values for Categorical Features

In [12]:
cat_count = defaultdict(int)
for f in feat_cat:
    cat_count[f] = len(diabetes[f].value_counts())
cat_count

defaultdict(int,
            {'race': 6,
             'gender': 3,
             'age': 10,
             'weight': 10,
             'admission_type_id': 8,
             'discharge_disposition_id': 26,
             'admission_source_id': 17,
             'time_in_hospital': 14,
             'payer_code': 18,
             'medical_specialty': 73,
             'diag_1': 717,
             'diag_2': 749,
             'diag_3': 790,
             'max_glu_serum': 4,
             'A1Cresult': 4,
             'metformin': 4,
             'repaglinide': 4,
             'nateglinide': 4,
             'chlorpropamide': 4,
             'glimepiride': 4,
             'acetohexamide': 2,
             'glipizide': 4,
             'glyburide': 4,
             'tolbutamide': 2,
             'pioglitazone': 4,
             'rosiglitazone': 4,
             'acarbose': 4,
             'miglitol': 4,
             'troglitazone': 2,
             'tolazamide': 3,
             'examide': 1,
             'citogl

In [13]:
# The features "examide" and "citoglipton" have only one value through so they can be dropped from consideration
diabetes = diabetes.drop(['examide', 'citoglipton'], axis = 1)

In [14]:
# List of all medication features after removing  "examide" and "citoglipton"
medications = list(diabetes.columns)[24:45]

### Exploring some categorical features
We picked some features we thought would be relevant to look into further <br>

**discharge_disposition_id**: From the ID mapping that UCI ML Repository shared with us, some categories here relate to death or terminally ill facilities. Any patient that falls into these categories should possibly not be considered in our predictions because there is no way they can be readmitted. If we were to consider them, we would possibly be biasing our predictions towards "NO" readmission, which would be incorrect. Nonetheless, we might want to consider some patients who had multiple re-admissions and hence we will not completely eliminate all patients that fall in the death/hospice categories

In [15]:
## drop rows where discharge_disposition_id indicates death or hospice
# diabetes = diabetes.drop(diabetes[diabetes.discharge_disposition_id.isin([11,13,14,19,20,21])].index)
## OR, Create a Boolean for patients that died/went to hospice vs that didn't
diabetes['disposition_boolean'] = np.where((diabetes['discharge_disposition_id'].isin([11,13,14,19,20,21])),1,0)
diabetes['discharge_disposition_id'].value_counts()
feat_cat.append('disposition_boolean')

**Diagnosis Features - diag_1, diag_2, diag_3**: 
- Each of the 3 features containts 700+ categories of type string <br> 
- Some of these categories are essentially numbers (floats) while others are hard strings <br>
- We convert all the strings that can be converted into floats, and coerce the others into 'nan' <br>
- Any unknowns (?) and non-float diagnisis (ex. V50) are then categorized as "Other" 

In [16]:
def diag_cat(diag_feat):
    diabetes[diag_feat] = pd.to_numeric(diabetes[diag_feat],errors= 'coerce')
    diabetes[diag_feat] = diabetes[diag_feat].fillna(0)
    
    for ind in range(len(diabetes)):
        if diabetes[diag_feat][ind] == 'nan':
            diabetes[diag_feat][ind] = "Other"
        elif round(diabetes[diag_feat][ind]) in [250,251]:
            diabetes[diag_feat][ind] = "Diabetes"
        elif diabetes[diag_feat][ind] in range(390,460) or diabetes[diag_feat][ind] == 785:
            diabetes[diag_feat][ind] = "Circulatory"
        elif diabetes[diag_feat][ind] in range(460,520) or diabetes[diag_feat][ind] == 786:
            diabetes[diag_feat][ind] = "Respiratory"
        elif diabetes[diag_feat][ind] in range(520,580) or diabetes[diag_feat][ind] == 787:
            diabetes[diag_feat][ind] = "Digestive"
        elif diabetes[diag_feat][ind] in range(800,1000):
            diabetes[diag_feat][ind] = "Injury"
        elif diabetes[diag_feat][ind] in range(710,740):
            diabetes[diag_feat][ind] = "Musculoskeletel"
        else:
            diabetes[diag_feat][ind] = "Other"

In [17]:
diag_feat = ['diag_1', 'diag_2', 'diag_3']

for f in diag_feat:
    diag_cat(f)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes[diag_feat][ind] = "Diabetes"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes[diag_feat][ind] = "Other"


In [18]:
diabetes['diag_1'].value_counts()

Circulatory        30437
Other              26728
Respiratory        14423
Digestive           9475
Diabetes            8772
Injury              6974
Musculoskeletel     4957
Name: diag_1, dtype: int64

### Investigating Multiple Readmissions

In [19]:
## Count the number of multiple readmissions for a single patient
unique_patients = diabetes['patient_nbr'].value_counts().to_frame()
unique_patients["index"] = unique_patients.index
len(unique_patients[unique_patients["patient_nbr"] > 1])

16773

### Investigating Missing Values (?) 
Features with missing values: <br>
**Weight** - replaced it with the mode <br>
**Race** - replaced it with "UNK" <br>
**Medical Speciality** - replaced it with "UNK" <br>
**Payer Code** - replaced it with "UNK" <br>

(**diag_1, diag_2, diag_3** also had missing values but those have already been handled above)

In [20]:
for col in diabetes.columns:
    if diabetes[col].dtype == object:
        count = diabetes[col][diabetes[col] == '?'].count()
        if count > 0:
            print(col, count)

race 2273
weight 98569
payer_code 40256
medical_specialty 49949


In [21]:
## Weights: Because most weights are missing, we replace the ? with most common weight
diabetes['weight'] = np.where((diabetes['weight'] == "?"),"[75-100)",diabetes['weight'])
## Race: replace with UNK
diabetes['race'] = np.where((diabetes['race'] == "?"),"UNK",diabetes['race'])
## Medical Speciality: replace with UNK
diabetes['medical_specialty'] = np.where((diabetes['medical_specialty'] == "?"),"UNK",diabetes['medical_specialty'])
## payer_code: replace with UNK
diabetes['payer_code'] = np.where((diabetes['race'] == "?"),"UNK",diabetes['payer_code'])

## Feature Selection - Categorical Features 

#### Step 1: Investigate the value count for each medication
If we realize that hardly anyone was prescribed that medication, it is perhaps a good idea to not consider it in our analysis

In [22]:
med_count = defaultdict(list)

for med in medications:
    # count the number of Nos, Ups, Downs, and Steadys for each medication
    med_count[med].append(list(diabetes[med].value_counts()))
    
    # if the number of Nos is > 100K, disregard the medication for now 
    if med_count[med][0][0] > 100000:
        med_count.pop(med)
med_count
meds_new = list(med_count)

### Method 1: Chi Square

Lets first try to find any relations between the medication features <br>

#### Approach 1: Cross Tabulation
$D$ = Number of Medication features <br>
Null Hypothesis ($H_O$): Features are independent - there is no relationship between features $x^i$ and $x^j$ where $i, j$ $\in$ $(1,...,D)$ <br>
Alternate Hypothesis ($H_1$): Features are independent - there is a relationship between features <br>
Let's consider p-value for $H_O$ = .05 $\Rightarrow$ if p-value for a relation is < .05, then we fail to reject $H_O$ <br>

As we can see, none of our p-values are greater than the significance level, so we fail to reject the null hypothesis for any of them. Thus, we continue to consider all the medication features to be independent from each other. 

In [23]:
# chi_p = np.array([[0]*7 for _ in range(7)])
chi_p = [[0]*7 for _ in range(7)]
for med1 in medications_new:
    for med2 in meds_new:
        chi_p[meds_new.index(med1)][meds_new.index(med2)] = chi2_contingency(pd.crosstab(diabetes[med1], diabetes[med2]))[1]
chi_p = np.array(chi_p)   
chi_p       

NameError: name 'medications_new' is not defined

#### Approach 2: Ordinal / One Hot Encoding

In [None]:
# encode some catagorical features in the input data
def prepare_inputs(Xtr, Xte):
    ordEnc = OrdinalEncoder()
    ordEnc.fit(Xte)
    XtrEnc = ordEnc.transform(Xtr)
    XteEnc = ordEnc.transform(Xte)
    return XtrEnc, XteEnc
 
# encode the target feature (categorical)
def prepare_targets(Ytr, Yte):
    labEnc = LabelEncoder()
    labEnc.fit(Ytr)
    YtrEnc = labEnc.transform(Ytr)
    YteEnc = le.transform(Yte)
    return YtrEnc, YteEnc
 
# feature selection
## concern - this can only work if we only have categorical features 
def select_features(Xtr, Xte, Ytr):
    featSel = SelectKBest(score_func=chi2, k='all')
    featSel.fit(Xtr, Ytr)
    XtrSel = featSel.transform(Xtr)
    XteSel = featSel.transform(Xte)
    return XtrSel, XteSel, featSel

## Feature Selection - Numerical Features

### Method 1: Variance Threshhold
This method is quick way to eliminate features that have hardly any variance amongst their values

In [24]:
for f in feat_num:
    print(np.var(np.array(diabetes[f])))
#     if np.var(np.array(diabetes[f])) < .9:
#         print(f)

387.07672627733086
2.909748857193247
66.05668337435635
1.6059450439727272
0.8657701347600352
1.5948080179968538
3.7387727814545464
