In [1]:
import numpy as np
from tqdm import tqdm
import math
from time import time
from multiprocessing import Pool

In [2]:
def magnitude(sample):
    """Return the Euclidean norm of the first three components of each row.

    Parameters
    ----------
    sample : iterable of indexable rows
        Every row must provide numeric entries at indices 0, 1 and 2.

    Returns
    -------
    list of float
        One magnitude per row, in input order.
    """
    return [
        math.sqrt(sum([row[0]**2, row[1]**2, row[2]**2]))
        for row in sample
    ]

def A(sample):
    """Average (A): mean of the per-column averages.

    Parameters
    ----------
    sample : 2-D NumPy array
        Indexed as ``sample[:, col]``; one column per axis.

    Returns
    -------
    NumPy scalar
        The mean of the column averages.
    """
    column_averages = [
        np.average(sample[:, idx]) for idx in range(sample.shape[1])
    ]
    return np.mean(column_averages)


def SD(sample):
    """Standard Deviation (SD): mean of the per-column standard deviations.

    Parameters
    ----------
    sample : 2-D NumPy array
        Indexed as ``sample[:, col]``; one column per axis.

    Returns
    -------
    NumPy scalar
        The mean of the column-wise ``np.std`` values.
    """
    column_stds = [
        np.std(sample[:, idx]) for idx in range(sample.shape[1])
    ]
    return np.mean(column_stds)


def AAD(sample):
    """Average Absolute Difference (AAD): mean absolute deviation per column,
    averaged over all columns.

    Parameters
    ----------
    sample : 2-D NumPy array
        Indexed as ``sample[:, col]``; one column per axis.

    Returns
    -------
    NumPy scalar
        Mean over columns of ``mean(|x - mean(x)|)``.
    """
    feat = []
    for col in range(0, sample.shape[1]):
        # Slice the column. The loop runs over ``shape[1]``, so indexing a row
        # (``sample[col, :]``) would mix axes and fail when there are more
        # columns than rows.
        data = sample[:, col]
        add = np.mean(np.absolute(data - np.mean(data)))
        feat.append(add)

    return np.mean(feat)


def ARA(sample):
    """Average Resultant Acceleration (ARA).

    For every row, take the square root of the sum of the squared column
    values, sqrt(x_i^2 + y_i^2 + z_i^2), then average those values over
    all rows.

    Parameters
    ----------
    sample : 2-D NumPy array
        Indexed as ``sample[:, col]``; one column per axis.

    Returns
    -------
    NumPy scalar
        The average resultant value.
    """
    squared = np.power(sample, 2)
    # Built-in sum starts at 0 and adds the squared columns left to right.
    sum_square = sum(squared[:, col] for col in range(0, squared.shape[1]))
    resultant = np.sqrt(sum_square)
    return np.mean([np.average(resultant)])

def COR(sample):
    """Mean Pearson correlation over every pair of distinct columns.

    Parameters
    ----------
    sample : 2-D NumPy array
        Indexed as ``sample[:, col]``; one column per axis.

    Returns
    -------
    NumPy scalar
        Mean of the pairwise correlation coefficients. A pair whose
        coefficient is NaN (e.g. one column is constant) contributes 0.
        With fewer than two columns there are no pairs and ``np.mean``
        of the empty list yields NaN.
    """
    feat = []
    for axis_i in range(0, sample.shape[1]):
        for axis_j in range(axis_i+1, sample.shape[1]):
            # corrcoef returns a 2x2 matrix; pull out the off-diagonal scalar
            # first so the NaN check is applied to one value, not to an array
            # (which cannot be used as a boolean condition).
            cor = np.corrcoef(sample[:, axis_i], sample[:, axis_j])[0][1]
            cor = 0 if np.isnan(cor) else cor
            feat.append(cor)

    return np.mean(feat)


def mag_mean(sample):
    """Return the mean of the per-row magnitudes produced by ``magnitude``."""
    return np.mean(magnitude(sample))

def mag_std(sample):
    """Return the standard deviation of the per-row magnitudes produced by ``magnitude``."""
    return np.std(magnitude(sample))


def feature_extraction(sample):
    """Build the feature vector for one sample.

    The returned 1-D array holds, in this order:

    1. mean of the row magnitudes,
    2. standard deviation of the row magnitudes,
    3. Average - A,
    4. Standard Deviation - SD,
    5. Average Absolute Difference - AAD,
    6. Average Resultant Acceleration - ARA.

    Parameters
    ----------
    sample : 2-D NumPy array
        Passed unchanged to ``magnitude``, ``A``, ``SD``, ``AAD`` and ``ARA``.

    Returns
    -------
    numpy.ndarray
        The six features stacked with ``np.hstack``.
    """
    mag = magnitude(sample)
    return np.hstack((
        np.mean(mag),
        np.std(mag),
        A(sample),
        SD(sample),
        AAD(sample),
        ARA(sample),
    ))

In [3]:
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
data_input_file = "/home/jsenadesouza/DA-healthy2patient/results/outcomes/dataset/dataset_demographics_poi.npz"
# allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code -- only load archives from a trusted source.
# The context manager closes the .npz archive once the arrays have been read,
# instead of leaving the file handle open for the lifetime of the kernel.
with np.load(data_input_file, allow_pickle=True) as tmp:
    X = tmp["X"]
    y = tmp['y']
    X_char = tmp['X_char']
    y_col_names = list(tmp['y_col_names'])
print(y_col_names)

['heart_rate', 'heart_rate_class', 'temp', 'temp_class', 'lenght_of_stay', 'is_dead', 'pain_score', 'pain_score_class', 'sofa_score', 'sofa_score_class', 'map', 'map_class', 'braden_score', 'braden_score_class', 'spo2', 'spo2_class', 'cam', 'patient_id']


In [4]:
# Drop singleton axes from X. The (0, 1, 2) permutation leaves the axis order
# unchanged, so this only requires the squeezed array to be 3-D.
X_trasp = np.squeeze(X).transpose((0, 1, 2))
print("Extracting Features")
start = time()
# Run feature_extraction on every entry along the first axis, spread over
# 20 worker processes.
with Pool(processes=20) as p:
    X_feat = p.map(feature_extraction, X_trasp)
end = time()
print(f"{end-start:.4} seconds passed.")

# p.map returns a list of per-sample feature vectors; stack them into one array.
X_feat = np.array(X_feat)

Extracting Features
23.21 seconds passed.


In [5]:
def get_clinical_data(y, y_col_names, target_col_name, regression_val=(0, 2, 6, 8, 10, 14, 16)):
    """Select the clinical-variable columns of ``y``, leaving out the target.

    Parameters
    ----------
    y : 2-D NumPy array
        Indexed as ``y[:, columns]``.
    y_col_names : list of str
        Column names of ``y``; must contain ``target_col_name`` and the same
        name with everything from ``"_class"`` onwards removed.
    target_col_name : str
        Name of the target column, e.g. ``"pain_score_class"``.
    regression_val : iterable of int, optional
        Candidate column indices of ``y`` to use as clinical variables. The
        default keeps the indices that were previously hard-coded in the
        function body, so existing calls behave as before.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with the candidate columns, minus the target column
        and its non-``_class`` counterpart.

    Raises
    ------
    ValueError
        If either target name is missing from ``y_col_names``.
    """
    col_target = y_col_names.index(target_col_name)
    col_target_reg = y_col_names.index(target_col_name.split("_class")[0])

    # Never feed the target (class or raw value) back in as an input variable.
    excluded = {col_target, col_target_reg}
    clin_var_idx = [int(idx) for idx in regression_val if int(idx) not in excluded]

    clin_var = y[:, clin_var_idx]

    print(f'Target = {y_col_names[col_target]}')
    print("\nCLinical variables used:\n")
    print(clin_var_idx)
    for idx in clin_var_idx:
        print(f"{y_col_names[idx]}")

    return clin_var.astype(np.float32)


Classify with an SVM using demographic features

In [7]:
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, recall_score
from scipy import stats as st

# 5-fold stratified cross-validation of an SVM on the pain-score class.
skf = StratifiedKFold(n_splits=5)
target_col_name = "pain_score_class"
col_idx_target = y_col_names.index(target_col_name)
y_target = y[:, col_idx_target]

# clin_data and POI are computed here but are not part of the model input:
# X_data below is built from X_char alone.
clin_data = get_clinical_data(y, y_col_names, target_col_name)
POI = np.expand_dims(np.array(X_char)[:, -1], axis=1)
X_data = np.concatenate([X_char], axis=1)

cum_acc, cum_recall, cum_f1 = [], [], []
for train_index, test_index in skf.split(X_data, y_target):
    clf = svm.SVC(class_weight='balanced')
    clf = clf.fit(X_data[train_index], y_target[train_index])
    y_pred = clf.predict(X_data[test_index])
    y_true = y_target[test_index]

    # Score the fold once and reuse the values for bookkeeping and printing.
    fold_acc = accuracy_score(y_true, y_pred)
    fold_f1 = f1_score(y_true, y_pred, average="macro")
    fold_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)

    cum_acc.append(fold_acc)
    cum_f1.append(fold_f1)
    cum_recall.append(fold_recall)
    print(f" acc = {fold_acc}")
    print(f"f1 = {fold_f1}")
    print(f'recall = {fold_recall}')

# 90% Student-t confidence intervals over the per-fold scores.
mean_acc = np.mean(cum_acc)
mean_f1 = np.mean(cum_f1)
mean_recall = np.mean(cum_recall)
ci_mean = st.t.interval(0.9, len(cum_acc) - 1, loc=mean_acc, scale=st.sem(cum_acc))
ci_f1 = st.t.interval(0.9, len(cum_f1) - 1, loc=mean_f1, scale=st.sem(cum_f1))
ci_recall = st.t.interval(0.9, len(cum_recall) - 1, loc=mean_recall, scale=st.sem(cum_recall))

# Report mean ± half-width of the interval, both as percentages.
print('accuracy: {:.2f} ± {:.2f}'.format(mean_acc * 100, abs(mean_acc - ci_mean[0]) * 100))
print('recall: {:.2f} ± {:.2f}'.format(mean_recall * 100, abs(mean_recall - ci_recall[0]) * 100))
print('f1-score: {:.2f} ± {:.2f}'.format(mean_f1 * 100, abs(mean_f1 - ci_f1[0]) * 100))


Target = pain_score_class

CLinical variables used:

[0, 2, 8, 10, 14, 16]
heart_rate
temp
sofa_score
map
spo2
cam
 acc = 0.5207692307692308
f1 = 0.4348027252850935
recall = 0.6930115745796025
 acc = 0.7338461538461538
f1 = 0.5608039494446093
recall = 0.7303778117492903
 acc = 0.6307692307692307
f1 = 0.43751149315458704
recall = 0.4795300387596899
 acc = 0.2776923076923077
f1 = 0.23016817486098945
recall = 0.24098491140642303
 acc = 0.2063125481139338
f1 = 0.20344051716600736
recall = 0.571843853820598
accuracy: 47.39 ± 21.56
recall: 54.31 ± 18.69
f1-score: 37.33 ± 14.49


In [None]:
#Please use the following code snippets for the corresponding tasks-  

#AUC- 
from sklearn.metrics import roc_curve, auc 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# NOTE(review): full_model, x_test, y_test and plt are not defined or imported
# anywhere in the visible notebook; they must be provided (plt is presumably
# matplotlib.pyplot) before this cell can run.
pred_score = full_model.predict(x_test)  #Sometimes full_model.predict_proba(x_test)
# roc_curve returns three arrays (fpr, tpr, thresholds); all three must be
# unpacked, otherwise the assignment raises ValueError.
# If .predict() returns only one value per sample, use pred_score instead of pred_score[:, 1].
fpr, tpr, roc_thresholds = roc_curve(y_test, pred_score[:,1])  #False positive Rate and True positive rate
roc_auc = auc(fpr, tpr)

#ROC curve 
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.title('Receiver operating characteristic', fontsize=15)
plt.legend(loc="lower right", prop={"size":14})
plt.show()

#Precision-Recall curve-   #if you need to visualize the Precision-Recall relationship. 
# precision_recall_curve also returns three arrays (precision, recall, thresholds).
prec, rec, pr_thresholds = precision_recall_curve(y_test, pred_score[:,1])  #same instruction as pred_score above
avg_prec = average_precision_score(y_test, pred_score[:,1])

#PR curve 
plt.figure()
lw = 2
plt.plot(rec, prec, color='darkorange',
         lw=lw, label='Average Precision = %0.2f' % avg_prec)
plt.plot([0, 1], [1, 0], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall', fontsize=15)
plt.ylabel('Precision', fontsize=15)
plt.title('Precision Recall characteristic', fontsize=15)
plt.legend(loc="lower left", prop={"size":14})
plt.show()

#Finding best Threshold-  #this threshold finding might help you in improving the values of F1-Score, Sensitivity, Specificity, etc. slightly. 
def Find_Optimal_Cutoff(target, predicted):
    """Find the ROC threshold at which sensitivity and specificity are closest.

    The cutoff returned is the ROC threshold where ``tpr - (1 - fpr)`` is
    nearest to zero, i.e. where the true-positive rate is closest to the
    true-negative rate. Note that this is a different criterion from
    maximizing Youden's J statistic (``tpr - fpr``).

    Parameters
    ----------
    target : array-like
        True binary labels, one per observation.

    predicted : array-like
        Predicted scores or probabilities, one per observation.

    Returns
    -------
    list
        One-element list holding the selected cutoff value (the first such
        threshold if several are equally close).
    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    # Distance of each ROC point from the sensitivity == specificity line.
    imbalance = np.abs(tpr - (1 - fpr))
    best = int(np.argmin(imbalance))

    return [float(threshold[best])]

threshold = Find_Optimal_Cutoff(y_test, pred_score[:,1])  #same instruction as pred_score above
print(threshold)

# Find_Optimal_Cutoff returns a one-element list; compare each score against
# the scalar inside it rather than against the list itself.
cutoff = threshold[0]
y_pred_2 = [1 if score > cutoff else 0 for score in pred_score[:,1]]  #same instruction as pred_score above. 

confusion_matrix(y_test, y_pred_2)

#if this confusion matrix is better you can calculate all the metrics (e.g., Sensitivity, Specificity, F1-score, Precision and NPV)
#based on y_pred_2. In that case you have to report in the paper that the threshold was tuned.
#NOTE(review): Find_Optimal_Cutoff selects the threshold where tpr - (1 - fpr) is closest to zero
#(sensitivity ~ specificity); that is not the same as maximizing the Youden index (tpr - fpr),
#so describe the criterion accordingly.

