In [None]:
import pandas as pd
import numpy as np

import sys
!{sys.executable} -m pip install kmodes

from kmodes.kmodes import KModes

In [None]:
def applyKModes(data, n_clusters = 6):
    """Cluster a categorical DataFrame with k-modes and print a summary.

    For every cluster the number of member rows is printed, followed by
    either "Non-severe cluster" (when every centroid value is 0) or one
    line "<column> - <centroid value>" for each column whose centroid
    value is non-zero. A final line reports the total number of rows
    counted across all clusters.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical data, one row per observation. Column labels must be
        strings because they are concatenated into the printed summary.
    n_clusters : int, default 6
        Number of clusters passed to ``KModes``.

    Returns
    -------
    The ``labels_`` attribute of the fitted ``KModes`` model: one cluster
    label per row of ``data``, in row order.
    """
    #data must be categorical
    # Seed NumPy's global RNG so repeated calls give the same clustering.
    np.random.seed(666)

    #do 10 initialisations and take the best one
    km = KModes(n_clusters=n_clusters, init='Huang', verbose=0, n_init = 10)
    km.fit_predict(data)

    kmodes = km.cluster_centroids_
    labels = km.labels_

    # One pass over the labels yields every cluster's size; a cluster with no
    # members gets 0 instead of raising KeyError as a dict lookup would.
    sizes = np.bincount(np.asarray(labels, dtype=np.intp), minlength=kmodes.shape[0])

    count = 0
    for i, centroid in enumerate(kmodes):
        size = int(sizes[i])
        print("\ncluster " + str(i) + ": " + str(size) + ' patients')

        # Pair each centroid value with its own column. A separately built
        # name list drifts out of step with the centroid as soon as a column
        # has no non-zero value (or more than one), mislabelling the output.
        active = [col + ' - ' + str(value)
                  for col, value in zip(data.columns, centroid)
                  if value != 0]

        if active:
            for description in active:
                print(description)
        else:
            print("Non-severe cluster")

        count += size

    print('Final count: ' + str(count))

    return labels

In [None]:
# Discharge-code chapter columns read from the CSV for the first clustering layer.
DISCHARGE_CHAPTER_COLUMNS = [
    'Blood/Immune',
    'Circulatory',
    'Abnormal',
    'Musculoskeletal',
    'Genitourinary',
    'Nutritional',
    'Nervous',
    'Respiratory',
    'Digestive',
]

disch_chaps_raw = pd.read_csv('discharge_chapters_simple.csv', usecols=DISCHARGE_CHAPTER_COLUMNS)

# Binarise: 1 where the value is >= 1, otherwise 0 (missing values also become 0).
# Vectorised comparison replaces the element-wise DataFrame.applymap, which is
# deprecated in recent pandas releases.
disch_chaps = disch_chaps_raw.ge(1).astype('int64')
disch_chaps.head()

In [None]:
# Columns pulled from the combined file: patient id, outcome flags,
# the individual oxygen/ventilation indicators and length of stay.
other_feature_columns = [
    'id',
    'ICU',
    'death',
    'Oxygen therapy - face mask',
    'Oxygen therapy - high flow',
    'Oxygen therapy - ventilator',
    'Oxygen therapy - intubation',
    'ECMO',
    'Noninvasive ventilation',
    'Invasive ventilation',
    'los',
]

other_features_raw = pd.read_csv('combined_clean.csv', usecols=other_feature_columns)
# Missing entries are replaced by 0 in every column.
other_features = other_features_raw.fillna(0)
other_features.head()

In [None]:
# Collapse the six oxygen/ventilation indicators into two boolean flags:
# a flag is True when the sum of its three source columns is positive.
noninvasive_total = (
    other_features['Oxygen therapy - face mask']
    + other_features['Oxygen therapy - high flow']
    + other_features['Noninvasive ventilation']
)
invasive_total = (
    other_features['Oxygen therapy - intubation']
    + other_features['Oxygen therapy - ventilator']
    + other_features['Invasive ventilation']
)

oxygen_therapies = pd.DataFrame({
    'Oxygen therapy - noninvasive': noninvasive_total > 0,
    'Oxygen therapy - invasive': invasive_total > 0,
})

In [None]:
# One row per patient id with both length-of-stay flags initialised to False.
# The frame is built in a single constructor call: the scalar False values are
# broadcast over the index of other_features.id. Writing through
# `Series.values[:]` instead depends on the array being a writable view of the
# frame, which pandas does not guarantee (it is read-only under Copy-on-Write).
length_of_stay = pd.DataFrame({
    'id': other_features.id,
    'los:2-4weeks': False,
    'los:4weeks+': False,
})

In [None]:
# Flag each patient's length of stay: 14 <= los < 28 is "2-4 weeks",
# los >= 28 is "4 weeks+".
# Computed column-wise on the shared row index rather than looking every id up
# with a boolean scan per row (quadratic in the number of patients).
# NOTE(review): a half-open interval is used, so fractional stays such as 14.5
# now count as 2-4 weeks; the earlier membership test against range(14, 28)
# matched whole numbers only. Flags are also strictly per row, so this assumes
# ids are unique -- confirm against the data.
los = other_features['los']

# errors='ignore' keeps the cell re-runnable after 'id' has already been dropped.
length_of_stay = length_of_stay.drop(columns='id', errors='ignore').assign(**{
    'los:2-4weeks': (los >= 14) & (los < 28),
    'los:4weeks+': los >= 28,
})
length_of_stay.head()

In [None]:
# Second-layer feature table: outcome flags, the two oxygen-therapy flags and
# the two length-of-stay flags placed side by side.
outcome_flags = other_features[['ICU', 'death', 'ECMO']]
secondary_features = pd.concat(
    [outcome_flags, oxygen_therapies, length_of_stay],
    axis='columns',
)
secondary_features.head()


# 1. Cluster on discharge codes only

In [None]:
disch_3 = applyKModes(disch_chaps, 3)

In [None]:
disch_4 = applyKModes(disch_chaps, 4)

In [None]:
disch_6 = applyKModes(disch_chaps)

In [None]:
disch_8 = applyKModes(disch_chaps, 8)

# 2. Within each discharge cluster, cluster on the other features

## (a) disch_3

In [None]:
other_3_0 = secondary_features.loc[disch_3 == 0, :]
other_3_1 = secondary_features.loc[disch_3 == 1, :]
other_3_2 = secondary_features.loc[disch_3 == 2, :]

In [None]:
other_3_0_clusters = applyKModes(other_3_0, 3)

In [None]:
other_3_1_clusters = applyKModes(other_3_1, 4)

In [None]:
other_3_2_clusters = applyKModes(other_3_2, 2)

## (b) disch_4

In [None]:
other_4_0 = secondary_features.loc[disch_4 == 0, :]
other_4_1 = secondary_features.loc[disch_4 == 1, :]
other_4_2 = secondary_features.loc[disch_4 == 2, :]
other_4_3 = secondary_features.loc[disch_4 == 3, :]

In [None]:
#Nutritional
other_4_0_clusters = applyKModes(other_4_0, 3)

In [None]:
#respiratory
other_4_1_clusters = applyKModes(other_4_1, 2)

In [None]:
#circulatory
other_4_2_clusters = applyKModes(other_4_2, 2)

In [None]:
#non-severe
other_4_3_clusters = applyKModes(other_4_3, 2)

# Save clusters

In [None]:
# Build one combined label per patient for the 3-cluster solution:
# the disch_3 cluster number followed by a letter for the sub-cluster,
# e.g. '0a', '1c', '2b'.
id_nums = other_features.id.to_list()
sub_cluster_names = {0:'a', 1:'b', 2:'c', 3:'d'}

# Sub-cluster label arrays, indexed by their parent disch_3 cluster. Each array
# lists the rows of that parent cluster in original row order, so a running
# position per parent cluster is enough to walk them in step with disch_3.
sub_clusters_3 = [other_3_0_clusters, other_3_1_clusters, other_3_2_clusters]
positions_3 = [0] * len(sub_clusters_3)

sub_cluster_3_labels = []
# Iterate over the labels themselves rather than a hard-coded row count, so the
# cell keeps working when the number of patients changes.
for cluster in disch_3:
    cluster = int(cluster)
    sub_cluster = sub_clusters_3[cluster][positions_3[cluster]]
    sub_cluster_3_labels.append(str(cluster) + sub_cluster_names[sub_cluster])
    positions_3[cluster] += 1

# Every sub-cluster label must have been consumed exactly once.
assert positions_3 == [len(sub) for sub in sub_clusters_3]

count_0, count_1, count_2 = positions_3
print(count_0, count_1, count_2)

In [None]:
# Build one combined label per patient for the 4-cluster solution:
# the disch_4 cluster number followed by a letter for the sub-cluster,
# e.g. '0a', '1b', '3a'.
sub_cluster_names = {0:'a', 1:'b', 2:'c', 3:'d'}

# Sub-cluster label arrays, indexed by their parent disch_4 cluster. Each array
# lists the rows of that parent cluster in original row order, so a running
# position per parent cluster is enough to walk them in step with disch_4.
sub_clusters_4 = [other_4_0_clusters, other_4_1_clusters, other_4_2_clusters, other_4_3_clusters]
positions_4 = [0] * len(sub_clusters_4)

sub_cluster_4_labels = []
# Iterate over the labels themselves rather than a hard-coded row count, so the
# cell keeps working when the number of patients changes.
for cluster in disch_4:
    cluster = int(cluster)
    sub_cluster = sub_clusters_4[cluster][positions_4[cluster]]
    sub_cluster_4_labels.append(str(cluster) + sub_cluster_names[sub_cluster])
    positions_4[cluster] += 1

# Every sub-cluster label must have been consumed exactly once.
assert positions_4 == [len(sub) for sub in sub_clusters_4]

count_0, count_1, count_2, count_3 = positions_4
print(count_0, count_1, count_2, count_3)

In [None]:
# One row per patient: id, top-level cluster label and combined sub-cluster
# label for both the 3-cluster and the 4-cluster solutions.
layered_columns = {
    'id': id_nums,
    'disch_3': disch_3,
    'disch_3_sub_clusters': sub_cluster_3_labels,
    'disch_4': disch_4,
    'disch_4_sub_clusters': sub_cluster_4_labels,
}
layered_axes_clusters = pd.DataFrame(layered_columns)
layered_axes_clusters.head()

In [None]:
# Persist the cluster assignments. With to_csv's defaults the row index is
# written as the first, unnamed column.
layered_clusters_csv = 'layered_axes_clusters.csv'
layered_axes_clusters.to_csv(layered_clusters_csv)