## Setup & read in data

In [65]:
import pandas as pd
import sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt   

In [66]:
# Load the input tables. `all_data` is the dataset including imputed values for test results etc.
all_data = pd.read_csv('imputed_all_data.csv')
# NOTE(review): this line only binds a variable called `parse_dates`; it is not passed to
# either `pd.read_csv` call in this cell, so no date parsing is requested here.
# Presumably it was meant to be a `parse_dates=` keyword argument — confirm which file
# actually contains `admission_date_structured` before wiring it in.
parse_dates=['admission_date_structured']
# Table read from the DBSCAN output file (judging by the file name, the cluster labels).
labels = pd.read_csv('new_dbscan_9clus.csv')
combined_clean = pd.read_csv('combined_clean.csv')


In [67]:
# Remove the comorbidity indicator columns (and the 'Unnamed: 0' column) so they are
# not used as predictors.
# NOTE(review): `orig_data_and_labels_comorb` is not defined in any cell visible here;
# on a fresh kernel this cell depends on state created elsewhere — confirm where it is built.
orig_data_and_labels = orig_data_and_labels_comorb.drop(
    columns=[
        'Unnamed: 0',
        'morbidity_Diabetes',
        'morbidity_COPD',
        'morbidity_Hypertension',
        'morbidity_Heartdisease',
        'morbidity_Renaldisease',
        'morbidity_Tumor',
        'morbidity_Metabolicdisorders',
        'morbidity_Respiratorydiseases',
    ]
)

In [68]:
# Display the frame left after the comorbidity columns were dropped (rendered table follows).
orig_data_and_labels

Unnamed: 0,id,Height,Weight,smoking_structured,age,blood_sugar_d1_min,blood_sugar_d1_max,blood_sugar_d3_min,blood_sugar_d3_max,SystolicBP_d1_min,...,AlbumintoGlobulinRatio,Male,Consciousness,Temperature,Respiratoryrate,Redcelldistributionwidth,SystolicBP,DiastolicBP,Lymphocyte(%),cluster
0,100251,155.00000,60.000000,0,62.0,4.660000,4.660000,4.925083,5.956664,124.000000,...,1.382490,False,1.0,36.6,20.0,12.100000,134.500000,68.500000,23.384615,0
1,100358,164.75961,65.000000,0,62.0,5.793085,6.349398,5.576803,5.064091,115.000000,...,1.006937,True,1.0,37.5,24.0,11.900000,122.500000,80.000000,15.538462,0
2,101536,158.00000,50.000000,0,48.0,5.248720,6.629004,4.899923,5.805336,98.000000,...,1.437422,False,1.0,36.7,22.0,11.300000,108.000000,62.500000,46.326531,0
3,101289,165.00000,60.000000,0,50.0,4.458859,5.799471,3.930688,4.154902,118.000000,...,1.329695,False,1.0,36.5,20.0,13.600000,122.000000,73.000000,22.432432,0
4,100851,170.00000,70.000000,0,74.0,5.270000,5.270000,4.348679,5.624614,116.000000,...,1.283246,True,1.0,36.8,24.0,12.700000,121.500000,81.000000,27.342448,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2755,102301,175.00000,72.000000,0,67.0,4.110000,4.110000,5.335332,5.082172,114.000000,...,1.240000,True,1.0,36.5,28.0,14.600000,120.000000,74.500000,20.384615,8
2756,101680,164.75961,65.029389,1,77.0,5.680000,5.680000,6.280000,6.280000,98.000000,...,1.240000,True,0.0,37.2,18.0,16.400000,118.500000,63.500000,5.454545,8
2757,102631,168.00000,65.000000,0,75.0,8.380000,8.380000,6.140000,6.140000,130.000000,...,1.700000,True,1.0,36.3,23.0,12.500000,140.000000,70.000000,8.358209,8
2758,100166,164.75961,65.029389,0,69.0,4.133798,7.149667,11.590000,11.590000,121.000000,...,1.120000,True,1.0,38.2,23.0,11.700000,134.000000,83.000000,33.750000,8


### Predicting binary severity 

Here we redefine the clusters into 2 categories

In [69]:
# Collapse the numeric cluster ids into a binary target:
#   cluster 7                 -> 'seven'
#   clusters 0-6 and 8        -> 'remainder'
#   any other value           -> left unchanged
limited_clusters = True
if limited_clusters:
    # Work on a copy so the original frame keeps its numeric cluster ids.
    # Note: `data_and_labels` is only created when the flag above is True.
    data_and_labels = orig_data_and_labels.copy()
    data_and_labels['cluster'] = orig_data_and_labels['cluster'].apply(
        lambda x: 'seven' if x == 7
        else ('remainder' if x in (0, 1, 2, 3, 4, 5, 6, 8) else x))

In [70]:
# Check the class balance of the relabelled target.
data_and_labels.cluster.value_counts()

remainder    2732
seven          28
Name: cluster, dtype: int64

In [71]:
# Split the relabelled frame by class. The labels present in `cluster` are
# 'seven' and 'remainder'; the previous filter on 'five' matched no rows and
# therefore always produced an empty frame.
seven = data_and_labels[data_and_labels.cluster == 'seven']
remainder = data_and_labels[data_and_labels.cluster == 'remainder']
# Backward-compatible alias for any later cell that still refers to `five`
# (the minority-class subset); prefer `seven` in new code.
five = seven

In [72]:
from sklearn import preprocessing, model_selection

def preprocess(all_data):
    """Split a labelled frame into standardised features and cluster labels.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain a ``'cluster'`` column. Features are selected by position:
        every column except the first two and the last one.

    Returns
    -------
    (all_X, all_y) : tuple of pandas.DataFrame
        ``all_X`` holds the features scaled with ``StandardScaler`` (zero mean,
        unit variance per column) on a fresh 0..n-1 index.
        ``all_y`` is a single-column frame named ``'cluster'`` on the same
        0..n-1 index. No train/test split is performed here.
    """
    # Positional feature selection: skip the first two columns and the last one.
    # NOTE(review): this assumes the first two columns are identifiers and the last
    # column is the label — verify that no real feature sits in position 1.
    all_X = all_data.iloc[:, 2:-1]

    # Keep the labels as a one-column DataFrame on a clean 0..n-1 index, so that
    # positional row selection lines up with the rebuilt feature frame below.
    all_y = all_data[['cluster']].reset_index(drop=True)

    # Standardise the features. Rebuilding the frame from the scaled array already
    # yields a fresh 0..n-1 index, so no further index reset is needed.
    # NOTE(review): the scaler is fitted on every row passed in; if the result is
    # later split into folds, the scaling statistics include the validation rows.
    scaler = preprocessing.StandardScaler()
    all_X = pd.DataFrame(scaler.fit_transform(all_X), columns=all_X.columns)

    return (all_X, all_y)

In [73]:
all_X, all_y = preprocess(data_and_labels)

## Classification setup

In [51]:
# Set the class names. This module-level list is read by `performance_metrics`
# (as the `labels=` order of the confusion matrix) and is passed to the other helpers.
class_names = ['seven','remainder']

### setting up performance metrics calculations

In [52]:
from sklearn import metrics
#method to compute performance metrics 
def performance_metrics(y_true, y_pred):
    """Compute summary metrics and a row-normalised confusion matrix.

    Reads the module-level ``class_names`` list to fix the row/column order of
    the confusion matrix. Also prints scikit-learn's text classification report.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f_measure, confusion)`` where recall,
        precision and f-measure are the macro averages from the classification
        report, and ``confusion`` is the confusion matrix with every row divided
        by that row's total (rounded to 2 decimals). Rows whose total is zero
        (a class absent from ``y_true``) are returned as zeros.
    """
    # Raw counts, ordered according to class_names.
    cnf_matrix = metrics.confusion_matrix(y_true, y_pred, labels=class_names, sample_weight=None)

    metrics_dict = metrics.classification_report(y_true, y_pred, output_dict=True)
    avg_metrics = metrics_dict['macro avg']

    recall = avg_metrics['recall']
    precision = avg_metrics['precision']
    f_measure = avg_metrics['f1-score']
    acc = metrics_dict['accuracy']

    print(metrics.classification_report(y_true, y_pred))

    # Normalise each row by its own total. keepdims=True keeps the totals as a
    # column vector so the division broadcasts across rows; a flat vector would
    # instead divide column j by the total of row j.
    row_totals = cnf_matrix.sum(axis=1, keepdims=True)
    normalised = np.divide(
        cnf_matrix,
        row_totals,
        out=np.zeros(cnf_matrix.shape, dtype=float),
        where=row_totals != 0,  # leave all-zero rows at 0 instead of producing NaN
    )

    return (acc, recall, precision, f_measure,
            np.round(normalised, 2))


In [53]:
import seaborn as sn

#method to plot the confusion matrix 
def plot_confusion_matrix(confusion_matrix, class_names):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    confusion_matrix : array-like
        Square matrix with one row and one column per entry of ``class_names``.
    class_names : iterable
        Labels used for both the row index and the column headers.
    """
    tick_labels = list(class_names)
    matrix_frame = pd.DataFrame(confusion_matrix, index=tick_labels, columns=tick_labels)

    plt.figure()
    # Scale up the fonts used for the labels.
    sn.set(font_scale=1.4)
    sn.heatmap(matrix_frame, annot=True, annot_kws={"size": 16}, cmap="Blues")
    plt.show()

## k-fold cross validation

Here we use k-fold cross validation to evaluate the effectiveness of our models, where each 'fold' results in a slightly different train/test split, allowing us to see how the model behaves with different data. We can compute various metrics about the model performance, and from these results, we can choose the most effective classification model.

In [54]:
from sklearn import model_selection 
from sklearn import linear_model
from sklearn import ensemble

def cross_val(all_X, all_y, class_names, downsampling, upsampling, down_prop, up_prop, clf, printing,
              n_splits=3, random_state=None, export_tree=False):
    """Evaluate ``clf`` with shuffled, stratified k-fold cross-validation.

    For each fold the training part is optionally rebalanced (``downsample``
    first, then ``upsample``), ``clf`` is fitted on it in place, and the
    validation part is scored with ``performance_metrics``. The validation part
    is never resampled.

    Parameters
    ----------
    all_X : pandas.DataFrame
        Feature matrix, indexed positionally by the fold indices.
    all_y : pandas.DataFrame
        Labels with a ``cluster`` column, row-aligned with ``all_X``.
    class_names : list
        Class labels, forwarded to the resampling helpers and the plots.
    downsampling, upsampling : bool
        Whether to apply ``downsample`` / ``upsample`` to each training fold.
    down_prop, up_prop : float
        Proportions forwarded to ``downsample`` / ``upsample``.
    clf : estimator
        Object exposing ``fit`` and ``predict``; it is refitted on every fold.
    printing : bool
        If true, print per-fold class counts, metrics and a confusion-matrix plot.
    n_splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an int to make the splits repeatable.
    export_tree : bool, default False
        If true, write each tree in ``clf.estimators_`` to ``rotated_tree_eg.dot``
        (every tree is written to the same path, so only the last one remains).

    Returns
    -------
    tuple
        ``(X_training, y_training, X_validate, y_validate, y_predicted,
        y_true_val, cv_f_measure)``. All entries except the last come from the
        final fold only; ``cv_f_measure`` is the mean macro F-measure over folds.
    """
    print('cross-validating...')
    strat_k_fold = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True,
                                                   random_state=random_state)

    # Per-fold results; averaged after the loop to give the overall performance.
    accuracies = np.empty(n_splits)
    tp_rates = np.empty(n_splits)
    precisions = np.empty(n_splits)
    f_measures = np.empty(n_splits)

    # Split the data into training and validation sets for each fold.
    for fold, (train_indices, validation_indices) in enumerate(strat_k_fold.split(all_X, all_y)):
        X_training = all_X.iloc[train_indices]
        y_training = all_y.iloc[train_indices]
        X_validate = all_X.iloc[validation_indices]
        y_validate = all_y.iloc[validation_indices]

        # Rebalance the training part only.
        if downsampling:
            X_training, y_training = downsample(X_training, y_training, class_names, down_prop)
            if printing:
                print('value counts: ', y_training.value_counts())

        if upsampling:
            X_training, y_training = upsample(X_training, y_training, class_names, up_prop)
            if printing:
                print('value counts:', y_training.value_counts())

        # Fit the classifier and predict the held-out fold.
        clf.fit(X_training, y_training.values.ravel())
        y_predicted = clf.predict(X_validate)
        y_true_val = y_validate.cluster

        # Optionally export the trees of an ensemble classifier for inspection.
        if export_tree:
            from sklearn import tree

            for tree_in_forest in clf.estimators_:
                tree.export_graphviz(tree_in_forest, out_file="rotated_tree_eg.dot", filled=True,
                                     feature_names=X_training.columns, class_names=class_names,
                                     rotate=True)
        if printing:
            print(np.unique(y_predicted, return_counts=True))

        # See how the validation set performs.
        acc, recall, precision, f_measure, confusion_matrix = performance_metrics(y_true_val.values, y_predicted)

        if printing:
            # Print performance results for each fold.
            print("accuracy: ", acc)
            print("true positive rate / recall: ", recall)
            print("precision: ", precision)
            print("f measure: ", f_measure)
            plot_confusion_matrix(confusion_matrix, class_names)
            print("----------")

        # Store the results of this fold.
        accuracies[fold] = acc
        tp_rates[fold] = recall
        precisions[fold] = precision
        f_measures[fold] = f_measure

    # Average the scores over all folds.
    cv_acc = np.mean(accuracies)
    cv_tp = np.mean(tp_rates)
    cv_precision = np.mean(precisions)
    cv_f_measure = np.mean(f_measures)

    print("overall average performance:")
    print("accuracy: ", cv_acc)
    print("true positive rate/recall: ", cv_tp)
    print("precision: ", cv_precision)
    print("f measure: ", cv_f_measure)

    return X_training, y_training, X_validate, y_validate, y_predicted, y_true_val, cv_f_measure


Method to perform grid search to find the best parameters for the random forest classifier, uses cross-val


Had to implement this manually due to library constraints and difficulties with upsampling properly inside gridsearch

In [56]:
from sklearn import model_selection, pipeline, ensemble

def grid_search_rand_forest(data_X, data_y,class_names, downsampling, upsampling, down_prop, up_prop):
    """Exhaustively search random-forest hyper-parameters using ``cross_val``.

    Every combination in the grid below is scored by the mean macro F-measure
    returned by ``cross_val`` (which applies the requested down-/up-sampling to
    each training fold), and the best-scoring classifier is printed and returned.

    Parameters
    ----------
    data_X, data_y : pandas.DataFrame
        Features and labels, forwarded unchanged to ``cross_val``.
    class_names : list
        Class labels, forwarded to ``cross_val``.
    downsampling, upsampling : bool
        Resampling switches, forwarded to ``cross_val``.
    down_prop, up_prop : float
        Resampling proportions, forwarded to ``cross_val``.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier or None
        The classifier object with the highest cross-validated F-measure. It was
        last fitted on the training part of the final fold inside ``cross_val``,
        so refit it before using it for real predictions. ``None`` is returned
        only if no candidate produced a comparable (non-NaN) score.
    """
    import itertools

    # Grid for parameter optimisation. 'auto' is intentionally not listed for
    # max_features: for a random-forest classifier it meant the same as 'sqrt',
    # and recent scikit-learn releases reject it.
    param_grid = {
        'n_estimators': [5, 10, 15, 20, 30, 40, 60, 80],
        'max_depth': [5, 10, 15, None],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']}

    # Start below any possible score so the first candidate is always accepted
    # and `best_clf` cannot be left undefined when every score is 0.
    best_f = float('-inf')
    best_clf = None

    # Walk the grid in the same order as nested loops over the dict's keys.
    for values in itertools.product(*param_grid.values()):
        params = dict(zip(param_grid, values))
        clf = ensemble.RandomForestClassifier(**params)

        # Only the averaged F-measure (last item of the returned tuple) is needed.
        *_, f_measure = cross_val(data_X, data_y, class_names, downsampling, upsampling,
                                  down_prop, up_prop, clf, printing=False)

        if f_measure > best_f:
            best_f = f_measure
            best_clf = clf

    print(best_clf)

    return best_clf

## Rebalancing classes

### Downsampling

To address class imbalance by removing some examples in most numerous classes

In [57]:
def downsample(train_X, train_y, class_names, down_prop):
    """Shrink the most numerous class of a training set.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Training features.
    train_y : pandas.Series or single-column pandas.DataFrame
        Training labels, index-aligned with ``train_X``.
    class_names : list
        Unused; kept so the signature matches ``upsample``.
    down_prop : float
        Fraction of the largest class to keep; ``int(size * down_prop)`` rows
        are drawn without replacement using a fixed seed.

    Returns
    -------
    (X_train_down, y_train_down) : (pandas.DataFrame, pandas.Series)
        All rows of the other classes followed by the retained rows of the
        largest class; the original index labels are preserved.
    """
    # Accept labels given as a one-column frame as well as a Series.
    labels = train_y.iloc[:, 0] if isinstance(train_y, pd.DataFrame) else train_y

    train_data_and_labels = train_X.copy()
    train_data_and_labels['cluster'] = labels

    # value_counts() sorts by frequency, so the first entry is the largest class.
    most_numerous = train_data_and_labels.cluster.value_counts().index[0]
    is_majority = train_data_and_labels['cluster'] == most_numerous
    big_cluster = train_data_and_labels[is_majority]
    new_train = train_data_and_labels[~is_majority]

    # Keep only a proportion of the largest class (fixed seed for repeatability).
    reduced_cluster = big_cluster.sample(replace=False, n=int(len(big_cluster) * down_prop), random_state=1)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    train_down = pd.concat([new_train, reduced_cluster])
    y_train_down = train_down['cluster']
    # 'cluster' was added last, so dropping the final column leaves the features.
    X_train_down = train_down.iloc[:, :-1]

    return X_train_down, y_train_down

### Upsampling 

This improves the prediction accuracy for the classes containing fewer individuals.

In [58]:
#upsample the least numerous classes
def upsample(train_X, train_y, class_names, up_prop):
    """Oversample the smaller classes of a training set by resampling rows.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Training features.
    train_y : pandas.Series or single-column pandas.DataFrame
        Training labels, index-aligned with ``train_X``.
    class_names : list
        Classes to consider (expected to be unique). A class with no rows in
        ``train_y`` is skipped, since there is nothing to resample from.
    up_prop : float
        Fraction of the gap to the largest class to fill: each class receives
        ``int(up_prop * (largest_class_size - class_size))`` extra rows, drawn
        with replacement using a fixed seed.

    Returns
    -------
    (X_train_up, y_train_up) : (pandas.DataFrame, pandas.Series)
        The original rows followed by the resampled rows. Resampled rows keep
        their original index labels, so the index contains duplicates.
    """
    # Accept labels given as a one-column frame as well as a Series.
    labels = train_y.iloc[:, 0] if isinstance(train_y, pd.DataFrame) else train_y

    train_data_and_labels = train_X.copy()
    train_data_and_labels['cluster'] = labels

    # Size of the largest class in the training data.
    max_val = train_data_and_labels.cluster.value_counts().max()

    # Collect the extra rows and concatenate once; pd.concat replaces
    # DataFrame.append, which no longer exists in pandas 2.x.
    pieces = [train_data_and_labels]
    for cluster in class_names:
        cluster_vals = train_data_and_labels[train_data_and_labels.cluster == cluster]
        if cluster_vals.empty:
            # Sampling from an empty class would raise.
            continue
        num_extra_samples = int(up_prop * (max_val - len(cluster_vals)))
        if num_extra_samples <= 0:
            # Already the largest class (or nothing to add).
            continue
        pieces.append(cluster_vals.sample(replace=True, n=num_extra_samples, random_state=1))

    train_up = pd.concat(pieces)

    # 'cluster' was added last, so dropping the final column leaves the features.
    X_train_up = train_up.iloc[:, :-1]
    y_train_up = train_up['cluster']

    return X_train_up, y_train_up

Set parameters

In [62]:
# Classifier configuration. The grid search is left commented out here; uncomment the
# call below to run it and obtain `clf` from the search instead of the fixed settings.


# clf = grid_search_rand_forest(all_X, all_y, class_names, downsampling=True, upsampling=True, down_prop=0.6, up_prop=0.8)

# Settings used for the "5 vs remainder" task (kept for reference):
# clf= ensemble.RandomForestClassifier(criterion='entropy', max_depth=10, max_features='sqrt',min_samples_split=5, n_estimators=5)
# Settings used for the "7 vs remainder" task (active). No random_state is set, so
# repeated runs can give different forests.
clf= ensemble.RandomForestClassifier(criterion='entropy', max_depth=5, max_features='sqrt',n_estimators=20)


Now we perform the k-fold cross validation on data. Since our dataset is small, the predictions can vary a lot based on which data is used for training/testing, which is why k-fold cross-val is useful to get overall prediction metrics. 
If upsampling is true then the least numerous classes will be upsampled.

#### Binary class:

In [74]:
# Cross-validate the configured classifier on the binary task ('seven' vs 'remainder'),
# rebalancing each training fold by down-sampling and then up-sampling.
class_names = ['seven', 'remainder']
(X_training, y_training, X_validate, y_validate,
 y_predicted, y_true_val, cv_f_measure) = cross_val(
    all_X,
    all_y,
    class_names,
    downsampling=True,
    upsampling=True,
    down_prop=0.6,
    up_prop=0.8,
    clf=clf,
    printing=False,
)

cross-validating...
              precision    recall  f1-score   support

   remainder       0.99      0.99      0.99       911
       seven       0.40      0.44      0.42         9

    accuracy                           0.99       920
   macro avg       0.70      0.72      0.71       920
weighted avg       0.99      0.99      0.99       920

              precision    recall  f1-score   support

   remainder       0.99      1.00      1.00       911
       seven       0.57      0.44      0.50         9

    accuracy                           0.99       920
   macro avg       0.78      0.72      0.75       920
weighted avg       0.99      0.99      0.99       920

              precision    recall  f1-score   support

   remainder       0.99      1.00      1.00       910
       seven       0.57      0.40      0.47        10

    accuracy                           0.99       920
   macro avg       0.78      0.70      0.73       920
weighted avg       0.99      0.99      0.99       920
