# Purpose: Create clusters of patients based on all of the complete data in the dataset. 

**These clusters will be used when inputing missing data.** 


In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

## 1. Load and Inspect Data 

In [43]:
# Load data -  merged training and test data - selected columns with bins
# Added column for hospital bins (binned on # of patients/hospital) 'hospital_traffic_bin'
with open('df_hospital_bins2.pickle', 'rb') as read_file:
    df_bins = pickle.load(read_file)
df_bins.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,patients_per_hospital,hospital_traffic_bin
0,66154,25312,118,0.0,68.0,22.73,0,Caucasian,M,180.3,...,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,4333,5
1,114252,59342,81,0.0,77.0,27.42,0,Caucasian,F,160.0,...,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,580,1
2,119783,50777,118,0.0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,4333,5
3,79267,46918,118,0.0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,4333,5
4,92056,34377,33,0.0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,323,1


In [27]:
# Check for missing data
df_bins['patients_per_hospital'].isna().sum()

0

In [29]:
df_bins['hospital_traffic_bin'].isna().sum()

0

In [28]:
df_bins.describe()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,...,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,patients_per_hospital
count,131021.0,131021.0,131021.0,91713.0,125139.0,126506.0,131021.0,128937.0,131021.0,131021.0,...,117896.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,130127.0,131021.0
mean,65528.171194,65527.414643,106.099259,0.086302,62.009965,29.113833,0.188588,169.604665,662.060212,0.833905,...,0.042732,0.00103,0.016084,0.216312,0.013602,0.025675,0.007308,0.004188,0.020857,1527.399249
std,37832.833351,37831.357384,63.493396,0.280811,16.797485,8.266435,0.391183,10.834117,304.224462,2.503083,...,0.223015,0.032074,0.1258,0.411731,0.115833,0.158164,0.085176,0.064581,0.142905,1104.98213
min,1.0,1.0,1.0,0.0,16.0,14.844926,0.0,137.2,82.0,-82.028472,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,32763.0,32764.0,49.0,0.0,52.0,23.598616,0.0,162.5,427.0,0.045139,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,673.0
50%,65527.0,65529.0,112.0,0.0,64.0,27.573696,0.0,170.1,653.0,0.154861,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1260.0
75%,98294.0,98290.0,165.0,0.0,75.0,32.8125,0.0,177.8,969.0,0.422917,...,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2470.0
max,131051.0,131051.0,204.0,1.0,89.0,67.81499,1.0,195.59,1111.0,175.627778,...,0.97,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4333.0


##  2. Select Variables to use for Clustering

It's important for these variables to: 
* Have complete information for the training set and test set 
* Missing <1% of the training dataset is probably okay, but we'll have to make predictions for all of the test set - so I think we need info for all test set in order to cluster them.  

Some variables that seem good to include, but have questionable levels of missing data include: 
* gender - 0.03/0.04
* d1_heartrate_max
* d1_sysbp_max
* icu_admit_source
* ethnicity - 1.45 / 1.91
* weight - 2.97 /1.94
* bmi - 3.74/ 2.76 
* age - 4.61 / 4.21 


In [74]:
# Drop the columns that contain missing information (any missing info)
df_no_nulls = df_bins.dropna(axis=1)
df_no_nulls.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,elective_surgery,icu_id,icu_stay_type,icu_type,pre_icu_los_days,readmission_status,apache_post_operative,patients_per_hospital,hospital_traffic_bin
0,66154,25312,118,0,92,admit,CTICU,0.541667,0,0,4333,5
1,114252,59342,81,0,90,admit,Med-Surg ICU,0.927778,0,0,580,1
2,119783,50777,118,0,93,admit,Med-Surg ICU,0.000694,0,0,4333,5
3,79267,46918,118,1,92,admit,CTICU,0.000694,0,1,4333,5
4,92056,34377,33,0,91,admit,Med-Surg ICU,0.073611,0,0,323,1


In [31]:
df_no_nulls.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131021 entries, 0 to 131020
Data columns (total 12 columns):
encounter_id             131021 non-null int64
patient_id               131021 non-null int64
hospital_id              131021 non-null int64
elective_surgery         131021 non-null int64
icu_id                   131021 non-null int64
icu_stay_type            131021 non-null object
icu_type                 131021 non-null object
pre_icu_los_days         131021 non-null float64
readmission_status       131021 non-null int64
apache_post_operative    131021 non-null int64
patients_per_hospital    131021 non-null int64
hospital_traffic_bin     131021 non-null category
dtypes: category(1), float64(1), int64(8), object(2)
memory usage: 12.1+ MB


In [34]:
df_no_nulls.nunique()

encounter_id             131021
patient_id               131021
hospital_id                 204
elective_surgery              2
icu_id                      328
icu_stay_type                 3
icu_type                      8
pre_icu_los_days          11380
readmission_status            1
apache_post_operative         2
patients_per_hospital       187
hospital_traffic_bin          5
dtype: int64

### 2b. Remove additional columns we don't want to include for clustering:
* hospital_death -  we dont' want the target to impact the clusters
* patient and encounter ids - don't matter for clustering since they're all be unique
* readmission_status - all values are the same 
* patients_per_hospital - use hospital_traffic bins instead

In [44]:
df_clustering = df_bins[['elective_surgery', 'icu_stay_type', 'icu_type',
                         'pre_icu_los_days', 'apache_post_operative', 'hospital_traffic_bin']]

In [36]:
df_clustering.columns

Index(['elective_surgery', 'icu_stay_type', 'icu_type', 'pre_icu_los_days',
       'apache_post_operative', 'hospital_traffic_bin'],
      dtype='object')

In [37]:
df_clustering.describe()

Unnamed: 0,elective_surgery,pre_icu_los_days,apache_post_operative
count,131021.0,131021.0,131021.0
mean,0.188588,0.833905,0.205746
std,0.391183,2.503083,0.404247
min,0.0,-82.028472,0.0
25%,0.0,0.045139,0.0
50%,0.0,0.154861,0.0
75%,0.0,0.422917,0.0
max,1.0,175.627778,1.0


In [38]:
df_clustering.isna().sum()

elective_surgery         0
icu_stay_type            0
icu_type                 0
pre_icu_los_days         0
apache_post_operative    0
hospital_traffic_bin     0
dtype: int64

In [45]:
df_clustering.head()

Unnamed: 0,elective_surgery,icu_stay_type,icu_type,pre_icu_los_days,apache_post_operative,hospital_traffic_bin
0,0,admit,CTICU,0.541667,0,5
1,0,admit,Med-Surg ICU,0.927778,0,1
2,0,admit,Med-Surg ICU,0.000694,0,5
3,1,admit,CTICU,0.000694,1,5
4,0,admit,Med-Surg ICU,0.073611,0,1


In [40]:
df_clustering['icu_stay_type'].value_counts()

admit       125263
transfer      5140
readmit        618
Name: icu_stay_type, dtype: int64

In [41]:
df_clustering['icu_type'].value_counts()

Med-Surg ICU    71954
CCU-CTICU       11279
MICU            11007
Neuro ICU       10616
Cardiac ICU      8337
SICU             8305
CSICU            5021
CTICU            4502
Name: icu_type, dtype: int64

## 3. Expand Categorical Features into Dummy Variables
* 'icu_stay_type' - 3 categories
* 'icu_type' - 8 categories

In [47]:
# To select all columns leave the columns keyword empty; defaults to all  
df_clustering = pd.get_dummies(df_clustering, columns=['icu_stay_type','icu_type'], drop_first=True)
df_clustering.columns

Index(['elective_surgery', 'pre_icu_los_days', 'apache_post_operative',
       'hospital_traffic_bin', 'icu_stay_type_readmit',
       'icu_stay_type_transfer', 'icu_type_CSICU', 'icu_type_CTICU',
       'icu_type_Cardiac ICU', 'icu_type_MICU', 'icu_type_Med-Surg ICU',
       'icu_type_Neuro ICU', 'icu_type_SICU'],
      dtype='object')

In [48]:
df_clustering.head()

Unnamed: 0,elective_surgery,pre_icu_los_days,apache_post_operative,hospital_traffic_bin,icu_stay_type_readmit,icu_stay_type_transfer,icu_type_CSICU,icu_type_CTICU,icu_type_Cardiac ICU,icu_type_MICU,icu_type_Med-Surg ICU,icu_type_Neuro ICU,icu_type_SICU
0,0,0.541667,0,5,0,0,0,1,0,0,0,0,0
1,0,0.927778,0,1,0,0,0,0,0,0,1,0,0
2,0,0.000694,0,5,0,0,0,0,0,0,1,0,0
3,1,0.000694,1,5,0,0,0,1,0,0,0,0,0
4,0,0.073611,0,1,0,0,0,0,0,0,1,0,0


## 4. Cluster

**Training and test data used as one dataset**

The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a **high value indicates that the object is well matched** to its own cluster and poorly matched to neighboring clusters. - Wikipedia 

In [49]:
k_scores = {}
for n in range(3,30,2):
    num_clusters = n
    km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
    km.fit(df_clustering)
    cluster_labels = km.fit_predict(df_clustering)
    cluster_labels
    silhouette_avg = silhouette_score(df_clustering, cluster_labels)
    k_scores['n'] = silhouette_avg
    print("N: " + str(n) + ", Score: " + str(silhouette_avg))

N: 3, Score: 0.4045155382287674
N: 5, Score: 0.29925682559484185
N: 7, Score: 0.3042943664783366
N: 9, Score: 0.3308217203601151
N: 11, Score: 0.31985016284539336
N: 13, Score: 0.32039096410303264
N: 15, Score: 0.32241385307418646
N: 17, Score: 0.3532676169842306
N: 19, Score: 0.38538236512668633
N: 21, Score: 0.4001258557680569
N: 23, Score: 0.40333970651207635
N: 25, Score: 0.4145221953736163
N: 27, Score: 0.43167483467339596
N: 29, Score: 0.4320987612386619


started running at 3:28pm. Finished 4:55pm. Wayyy too long. :/ 

In [50]:
for n in range(27,38,4):
    num_clusters = n
    km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
    km.fit(df_clustering)
    cluster_labels = km.fit_predict(df_clustering)
    cluster_labels
    silhouette_avg = silhouette_score(df_clustering, cluster_labels)
    k_scores['n'] = silhouette_avg
    print("N: " + str(n) + ", Score: " + str(silhouette_avg))

N: 27, Score: 0.43167483467339596
N: 31, Score: 0.431825140712791
N: 35, Score: 0.44956554514712466


In [51]:
for n in range(35,51,4):
    num_clusters = n
    km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
    km.fit(df_clustering)
    cluster_labels = km.fit_predict(df_clustering)
    cluster_labels
    silhouette_avg = silhouette_score(df_clustering, cluster_labels)
    k_scores['n'] = silhouette_avg
    print("N: " + str(n) + ", Score: " + str(silhouette_avg))

N: 35, Score: 0.44956554514712466
N: 39, Score: 0.47819695576839255
N: 43, Score: 0.48611635453484286
N: 47, Score: 0.5152354192460147


Choosing 9 clusters. Don't want too many to have to use to fill in. 
Stands out a little more than the others. But mostly an arbitrary distinction. 

In [52]:
num_clusters = 9
km = KMeans(n_clusters=num_clusters,random_state=10,n_init=1) # n_init, number of times the K-mean algorithm will run
km.fit(df_clustering)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=9, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=10, tol=0.0001, verbose=0)

In [54]:
# Add labels to the dataframe 
df_clustering['clusters'] = km.labels_

In [55]:
df_clustering.head()

Unnamed: 0,elective_surgery,pre_icu_los_days,apache_post_operative,hospital_traffic_bin,icu_stay_type_readmit,icu_stay_type_transfer,icu_type_CSICU,icu_type_CTICU,icu_type_Cardiac ICU,icu_type_MICU,icu_type_Med-Surg ICU,icu_type_Neuro ICU,icu_type_SICU,clusters
0,0,0.541667,0,5,0,0,0,1,0,0,0,0,0,1
1,0,0.927778,0,1,0,0,0,0,0,0,1,0,0,0
2,0,0.000694,0,5,0,0,0,0,0,0,1,0,0,1
3,1,0.000694,1,5,0,0,0,1,0,0,0,0,0,1
4,0,0.073611,0,1,0,0,0,0,0,0,1,0,0,0


In [56]:
df_clustering['clusters'].value_counts()

0    50169
1    32357
7    19721
8    14464
4     8701
6     4018
2     1280
3      279
5       32
Name: clusters, dtype: int64

In [59]:
k_scores

{'n': 0.5152354192460147}

### Silhouette Scores
N: 3, Score: 0.4045155382287674
N: 5, Score: 0.29925682559484185
N: 7, Score: 0.3042943664783366
N: 9, Score: 0.3308217203601151
N: 11, Score: 0.31985016284539336
N: 13, Score: 0.32039096410303264
N: 15, Score: 0.32241385307418646
N: 17, Score: 0.3532676169842306
N: 19, Score: 0.38538236512668633
N: 21, Score: 0.4001258557680569
N: 23, Score: 0.40333970651207635
N: 25, Score: 0.4145221953736163
N: 27, Score: 0.43167483467339596
N: 29, Score: 0.4320987612386619

N: 27, Score: 0.43167483467339596
N: 31, Score: 0.431825140712791
N: 35, Score: 0.44956554514712466

N: 35, Score: 0.44956554514712466
N: 39, Score: 0.47819695576839255
N: 43, Score: 0.48611635453484286
N: 47, Score: 0.5152354192460147

In [63]:
# Didn't add to dictionary properly. Adding it now for purposes of graphing. 
# Not sure it's worth graphing. Will come back to if we decide to use these. 

k_scores = {3:.4045155382287674, 
            5: .29925682559484185,
            7:.3042943664783366
}

In [64]:
k_scores

{3: 0.4045155382287674, 5: 0.29925682559484185, 7: 0.3042943664783366}

# Summary:
Hmmm I'm not sure these clusters are cohesive enough to really base decisons on filling in missing data for. 

We'll try moving over to PPCA and see if that works better for filling in the missing data. 

It does mean this model will not be interpretable, however. 