In [1]:
import numpy as np
from tqdm import tqdm
import math
from time import time
from multiprocessing import Pool

In [2]:
# Location of the pre-built dataset archive (.npz) read by this notebook.
data_input_file = "/home/jsenadesouza/DA-healthy2patient/results/outcomes/dataset/dataset_demographics_poi.npz"

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only load archives from a trusted source.
tmp = np.load(data_input_file, allow_pickle=True)

# Unpack the arrays stored in the archive under the keys "X", "y" and "X_char".
X, y, X_char = (tmp[key] for key in ("X", "y", "X_char"))

# Column names for `y`, converted to a plain list so `.index()` can be used.
y_col_names = [*tmp["y_col_names"]]
print(y_col_names)

['heart_rate', 'heart_rate_class', 'temp', 'temp_class', 'lenght_of_stay', 'is_dead', 'pain_score', 'pain_score_class', 'sofa_score', 'sofa_score_class', 'map', 'map_class', 'braden_score', 'braden_score_class', 'spo2', 'spo2_class', 'cam', 'patient_id']


In [9]:
# Column indices into `y` / `y_col_names`. The names in the comments follow
# the column list printed by the data-loading cell.

# Columns holding raw (continuous) values.
regression_val = [
    0,   # heart_rate
    2,   # temp
    4,   # lenght_of_stay
    5,   # is_dead
    6,   # pain_score
    8,   # sofa_score
    10,  # map
    14,  # spo2
    16,  # cam
]

# Columns holding the discretised ("_class") counterparts; indices 4, 5 and 16
# have no "_class" column and appear in both lists.
class_val = [
    1,   # heart_rate_class
    3,   # temp_class
    4,   # lenght_of_stay
    5,   # is_dead
    7,   # pain_score_class
    9,   # sofa_score_class
    11,  # map_class
    15,  # spo2_class
    16,  # cam
]

# Every column except 12/13 (braden_score, braden_score_class) and 17 (patient_id).
_all = [*range(0, 12), 14, 15, 16]

In [10]:
def get_clinical_data(y, y_col_names, target_col_name):
    """Select the clinical-variable columns of ``y``, leaving out the target.

    Args:
        y: 2-D array indexed as ``y[:, column]``.
        y_col_names: list of column names for ``y``; looked up with ``.index``.
        target_col_name: name of the column being predicted. Both this column
            and the column named by the text before ``"_class"`` are excluded.

    Returns:
        ``np.float32`` array with the remaining candidate columns, in
        ascending column order.

    Raises:
        ValueError: if ``target_col_name`` or its ``"_class"``-stripped base
            name is not present in ``y_col_names``.
    """
    # Candidate columns. This local list is narrower than the module-level
    # `regression_val`: it does not contain indices 4 and 5.
    candidate_cols = (0, 2, 6, 8, 10, 14, 16)

    target_idx = y_col_names.index(target_col_name)
    base_idx = y_col_names.index(target_col_name.split("_class")[0])
    excluded = {target_idx, base_idx}

    kept_cols = [col for col in candidate_cols if col not in excluded]
    selected = y[:, kept_cols]

    print(f'Target = {y_col_names[target_idx]}')
    print("\nCLinical variables used:\n")
    for name in (y_col_names[col] for col in kept_cols):
        print(f"{name}")

    return selected.astype(np.float32)


Classify with an SVM using demographic features

In [15]:
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, recall_score
from scipy import stats as st

# 5-fold stratified cross-validation of a class-balanced SVM, followed by a
# mean +/- 90% Student-t confidence half-width summary of each metric.
#
# NOTE(review): `X_feat`, `clin_var` and `y_target` are not defined in this
# cell; they must already exist in the kernel (presumably `clin_var` comes from
# get_clinical_data -- confirm) for Restart & Run All to work.
# NOTE(review): `y` carries a 'patient_id' column. StratifiedKFold does not
# group by patient, so if one patient contributes several rows they can land in
# both train and test folds -- confirm whether a grouped split is needed.
# NOTE(review): features are passed to SVC unscaled; SVC is sensitive to
# feature scale -- consider standardising inside the CV loop.
skf = StratifiedKFold(n_splits=5)

clin_data = np.array(clin_var)
# NOTE(review): `char` keeps only the last column of X_char but is never used;
# the concatenation below takes the full `X_char`. Left unchanged so results
# stay the same -- confirm which of the two was intended.
char = np.expand_dims(np.array(X_char)[:, -1], axis=1)
X_data = np.concatenate([X_feat, clin_data, X_char], axis=1)


def _t_confidence_interval(scores, confidence=0.9):
    """Student-t confidence interval for the mean of ``scores``."""
    return st.t.interval(confidence, len(scores) - 1, loc=np.mean(scores), scale=st.sem(scores))


def _print_metric_summary(label, scores, interval):
    """Print ``label: mean ± half-width`` in percent for one metric."""
    mean_score = np.mean(scores)
    print('{}: {:.2f} ± {:.2f}'.format(label, mean_score * 100, abs(mean_score - interval[0]) * 100))


cum_acc, cum_recall, cum_f1 = [], [], []
for train_index, test_index in skf.split(X_data, y_target):
    clf = svm.SVC(class_weight='balanced')
    clf = clf.fit(X_data[train_index], y_target[train_index])
    y_pred = clf.predict(X_data[test_index])
    y_true = y_target[test_index]

    # Score each fold once and reuse the values for both storage and printing.
    fold_acc = accuracy_score(y_true, y_pred)
    # zero_division=0 matches recall_score below: a class that is never
    # predicted scores 0 without emitting an UndefinedMetricWarning.
    fold_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    fold_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)

    cum_acc.append(fold_acc)
    cum_f1.append(fold_f1)
    cum_recall.append(fold_recall)
    print(f" acc = {fold_acc}")
    print(f"f1 = {fold_f1}")
    print(f'recall = {fold_recall}')

ci_mean = _t_confidence_interval(cum_acc)
ci_f1 = _t_confidence_interval(cum_f1)
ci_recall = _t_confidence_interval(cum_recall)

_print_metric_summary('accuracy', cum_acc, ci_mean)
_print_metric_summary('recall', cum_recall, ci_recall)
_print_metric_summary('f1-score', cum_f1, ci_f1)


 acc = 0.40692307692307694
f1 = 0.36028807779427996
recall = 0.6509936667394628
 acc = 0.6992307692307692
f1 = 0.5373366501644307
recall = 0.7165538327145664
 acc = 0.6515384615384615
f1 = 0.44832638393823204
recall = 0.4907426633444075
 acc = 0.5976923076923077
f1 = 0.40561895095547695
recall = 0.4137423864894795
 acc = 0.7321016166281755
f1 = 0.44409082412663425
recall = 0.4337165588389579
accuracy: 61.75 ± 12.22
recall: 54.11 ± 12.89
f1-score: 43.91 ± 6.23


In [14]:
# Binarise length of stay (0 if <= 14, else 1) and report the class balance.
col_los = y_col_names.index('lenght_of_stay')
los_target = [0 if float(yy) <= 14 else 1 for yy in y[:, col_los]]

los_classes, los_counts = np.unique(los_target, return_counts=True)
print((los_classes, los_counts))

# Class proportions derived from the counts above instead of numbers copied by
# hand from a previous run, so they stay correct when the data changes.
los_total = int(los_counts.sum())
print(*(int(class_count) / los_total for class_count in los_counts), sep="\n")

(array([0, 1]), array([1388, 5111]))
0.2135713186644099
0.7864286813355901
