Goal: Apply KModes clustering to discharge chapters, ICU, death, oxygen therapy and length of stay to find distinct patient clusters.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

import sys
!{sys.executable} -m pip install kmodes

from kmodes.kmodes import KModes

# Functions

In [None]:
def applyKModes(data, n_clusters = 6):
    """Cluster categorical data with KModes and print a summary per cluster.

    For every cluster the number of assigned rows is printed, followed by
    either "Non-severe cluster" (when every centroid value is falsy) or one
    "<column> - <value>" line for each truthy centroid value.

    Side effect: reseeds NumPy's global random state with 666 so repeated
    calls give the same clustering.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical features (boolean in this notebook), one row per patient.
    n_clusters : int, default 6
        Number of clusters to fit.

    Returns
    -------
    The fitted model's ``labels_``: the cluster label of each row of
    ``data``, in row order.
    """
    # data must be categorical
    np.random.seed(666)

    # do 10 initialisations and take the best one
    km = KModes(n_clusters=n_clusters, init='Huang', verbose=0, n_init=10)
    km.fit(data)

    centroids = km.cluster_centroids_
    labels = km.labels_
    label_array = np.asarray(labels)

    count = 0
    for i, cent in enumerate(centroids):
        # Count members from the labels themselves instead of assuming a
        # fixed number of rows; an empty cluster simply reports 0 patients.
        size = int(np.count_nonzero(label_array == i))
        print("\ncluster " + str(i) + ": " + str(size) + ' patients')

        if not any(cent):
            print("Non-severe cluster")
        else:
            # The centroid has exactly one value per input column, so pair
            # each value with its own column name to keep labels aligned.
            for col, value in zip(data.columns, cent):
                if value:
                    print(str(col) + ' - ' + str(value))
        count += size

    print('Final count: ' + str(count))

    return labels

# Data

In [None]:
# Load the discharge-chapter columns and binarise them:
# True when the value is at least 1, False otherwise.
disch_chaps = pd.read_csv('discharge_chapters_simple.csv', usecols = ['Blood/Immune', 'Circulatory', "Abnormal", "Musculoskeletal", "Genitourinary", "Nutritional", "Nervous", "Respiratory", "Digestive"])
# Vectorised comparison; gives the same boolean frame as the element-wise
# applymap it replaces (applymap is deprecated in recent pandas releases).
disch_chaps = disch_chaps >= 1
disch_chaps.head()

In [None]:
# Load the outcome and treatment columns; missing values are replaced by 0.
other_features = pd.read_csv(
    'combined_clean.csv',
    usecols = [
        'id', 'ICU', 'death',
        'Oxygen therapy - face mask', 'Oxygen therapy - high flow',
        'Oxygen therapy - ventilator', 'Oxygen therapy - intubation',
        'ECMO', 'Noninvasive ventilation', 'Invasive ventilation',
        'Methylprednisolone', 'Norepinephrine', 'los',
    ],
).fillna(0)
other_features.head()

# 1. Just discharge chapters

In [None]:
# Discharge chapters only, 6 clusters.
disch_6 = applyKModes(disch_chaps, n_clusters=6)

In [None]:
# Discharge chapters only, 8 clusters.
disch_8 = applyKModes(disch_chaps, n_clusters=8)

In [None]:
# Discharge chapters only, 10 clusters.
disch_10 = applyKModes(disch_chaps, n_clusters=10)

# 2. Discharge chapters, ICU & death

In [None]:
# Discharge chapters plus the ICU and death columns, all as booleans.
# astype(bool) applies the same truthiness test as the element-wise
# applymap it replaces (applymap is deprecated in recent pandas releases).
disch_poor = pd.concat([disch_chaps, other_features[['ICU', 'death']]], axis=1).astype(bool)
disch_poor.head()

In [None]:
# Discharge chapters, ICU and death, 6 clusters.
disch_poor_6 = applyKModes(disch_poor, n_clusters=6)

In [None]:
# Discharge chapters, ICU and death, 8 clusters.
disch_poor_8 = applyKModes(disch_poor, n_clusters=8)

In [None]:
# Discharge chapters, ICU and death, 10 clusters.
disch_poor_10 = applyKModes(disch_poor, n_clusters=10)

# 3. Discharge chapters, ICU, death & oxygen therapy

Let's categorise oxygen therapy in 3 possible ways: noninvasive, invasive or ECMO.

In [None]:
# One boolean column per therapy group: True when the sum of the group's
# component columns is positive.
oxygen_therapies = pd.DataFrame({
    'Oxygen therapy - noninvasive': (
        other_features['Oxygen therapy - face mask']
        + other_features['Oxygen therapy - high flow']
        + other_features['Noninvasive ventilation']
    ) > 0,
    'Oxygen therapy - invasive': (
        other_features['Oxygen therapy - intubation']
        + other_features['Oxygen therapy - ventilator']
        + other_features['Invasive ventilation']
    ) > 0,
})

In [None]:
# Discharge chapters, ICU, death, the two oxygen-therapy groups and ECMO,
# all as booleans. astype(bool) applies the same truthiness test as the
# element-wise applymap it replaces (applymap is deprecated in recent
# pandas releases).
disch_oxy = pd.concat([disch_poor, oxygen_therapies, other_features[['ECMO']]], axis=1).astype(bool)
disch_oxy.head()

In [None]:
# Discharge chapters, ICU, death and oxygen therapy, 6 clusters.
disch_oxy_6 = applyKModes(disch_oxy, n_clusters=6)

In [None]:
# Discharge chapters, ICU, death and oxygen therapy, 8 clusters.
disch_oxy_8 = applyKModes(disch_oxy, n_clusters=8)

In [None]:
# Discharge chapters, ICU, death and oxygen therapy, 10 clusters.
disch_oxy_10 = applyKModes(disch_oxy, n_clusters=10)

# 4. Discharge chapters, ICU, death, oxygen therapy & length of stay

In [None]:
# Make a length of stay dataframe with 2 flag columns (plus the id used to
# fill them in), every flag starting as False.
# Built in one step rather than writing through `.values`, which depends on
# that array being a writable view of the frame's data.
length_of_stay = pd.DataFrame({
    'id': other_features.id,
    'los:2-4weeks': False,
    'los:4weeks+': False,
})

In [None]:
# Flag stays of 14 up to (but not including) 28 days as 2-4 weeks, and stays
# of 28 days or more as 4 weeks+. Range comparisons are used so that
# fractional lengths between 14 and 28 are not left unflagged.
los = other_features.los
ids_2_to_4_weeks = other_features.id[(los >= 14) & (los < 28)]
ids_4_weeks_plus = other_features.id[los >= 28]

# Match on id, as before, but with one vectorised lookup per flag instead of
# one full-frame scan per patient.
length_of_stay.loc[length_of_stay.id.isin(ids_2_to_4_weeks), 'los:2-4weeks'] = True
length_of_stay.loc[length_of_stay.id.isin(ids_4_weeks_plus), 'los:4weeks+'] = True

length_of_stay.drop(columns = 'id', inplace = True)
length_of_stay.head()

In [None]:
# Append the two length-of-stay flags to the previous feature set, column-wise.
disch_los = pd.concat(
    [disch_oxy, length_of_stay],
    axis='columns',
)
disch_los.head()

In [None]:
# All features including length of stay, 6 clusters.
disch_los_6 = applyKModes(disch_los, n_clusters=6)

In [None]:
# All features including length of stay, 8 clusters.
disch_los_8 = applyKModes(disch_los, n_clusters=8)

In [None]:
# All features including length of stay, 10 clusters.
disch_los_10 = applyKModes(disch_los, n_clusters=10)

In [None]:
# Collect every set of cluster labels next to the patient ids and save them.
# The id column is selected explicitly: the `squeeze` argument of read_csv
# is not available in current pandas releases.
id_nums = pd.read_csv('discharge_chapters_simple.csv', usecols = ['id'])['id'].tolist()
kmodes_clusters = pd.DataFrame({'id':id_nums, 'disch_6': disch_6, 'disch_8': disch_8, 'disch_10':disch_10, 'disch_poor_6':disch_poor_6, 'disch_poor_8':disch_poor_8, 'disch_poor_10':disch_poor_10, 'disch_oxy_6':disch_oxy_6, 'disch_oxy_8':disch_oxy_8, 'disch_oxy_10':disch_oxy_10, 'disch_los_6':disch_los_6, 'disch_los_8':disch_los_8, 'disch_los_10':disch_los_10})
kmodes_clusters.to_csv('kmodes_clusters.csv')

In [None]:
# Preview the first five rows of the combined cluster assignments.
kmodes_clusters.head(n=5)