In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
input_dir = '/data/datasets/ICU_Data/EHR_Data/truncated/2020-02-26/'
df = pd.read_csv(os.path.join(input_dir, 'encounters_0_trimmed.csv'))

In [3]:
data_input_file = "/home/jsenadesouza/DA-healthy2patient/results/outcomes/dataset/f10_t1800_outcomesscore_patientid_acc_30minmargin_measurednowcol_30min_10hz_filtered.npz"
tmp = np.load(data_input_file, allow_pickle=True)
X= tmp['X']
y = tmp['y']
y_col_names = list(tmp['y_col_names'])
col_idx = y_col_names.index('patient_id')
y_patient = np.array(y[:, col_idx])
y_target = np.unique(y_patient)

In [4]:
import math, os
import numpy as np
from scipy import signal
import pandas as pd
import resampy

##predefined filter coefficients, as found by Jan Brond
A_coeff = np.array(
    [1, -4.1637, 7.5712,-7.9805, 5.385, -2.4636, 0.89238, 0.06361, -1.3481, 2.4734, -2.9257, 2.9298, -2.7816, 2.4777,
     -1.6847, 0.46483, 0.46565, -0.67312, 0.4162, -0.13832, 0.019852])
B_coeff = np.array(
    [0.049109, -0.12284, 0.14356, -0.11269, 0.053804, -0.02023, 0.0063778, 0.018513, -0.038154, 0.048727, -0.052577,
     0.047847, -0.046015, 0.036283, -0.012977, -0.0046262, 0.012835, -0.0093762, 0.0034485, -0.00080972, -0.00019623])

def pptrunc(data, max_value):
    '''
    Saturate a vector such that no element's absolute value exceeds max_abs_value.
    Current name: absolute_saturate().
      :param data: a vector of any dimension containing numerical data
      :param max_value: a float value of the absolute value to not exceed
      :return: the saturated vector
    '''
    outd = np.where(data > max_value, max_value, data)
    return np.where(outd < -max_value, -max_value, outd)

def trunc(data, min_value):
  
    '''
    Truncate a vector such that any value lower than min_value is set to 0.
    Current name zero_truncate().
    :param data: a vector of any dimension containing numerical data
    :param min_value: a float value the elements of data should not fall below
    :return: the truncated vector
    '''

    return np.where(data < min_value, 0, data)

def runsum(data, length, threshold):
    '''
    Compute the running sum of values in a vector exceeding some threshold within a range of indices.
    Divides the data into len(data)/length chunks and sums the values in excess of the threshold for each chunk.
    Current name run_sum().
    :param data: a 1D numerical vector to calculate the sum of
    :param len: the length of each chunk to compute a sum along, as a positive integer
    :param threshold: a numerical value used to find values exceeding some threshold
    :return: a vector of length len(data)/length containing the excess value sum for each chunk of data
    '''
    
    N = len(data)
    cnt = int(math.ceil(N/length))

    rs = np.zeros(cnt)

    for n in range(cnt):
        for p in range(length*n, length*(n+1)):
            if p<N and data[p]>=threshold:
                rs[n] = rs[n] + data[p] - threshold

    return rs

def counts(data, filesf, B=B_coeff, A=A_coeff):
    '''
    Get activity counts for a set of accelerometer observations.
    First resamples the data frequency to 30Hz, then applies a Butterworth filter to the signal, then filters by the
    coefficient matrices, saturates and truncates the result, and applies a running sum to get the final counts.
    Current name get_actigraph_counts()
    :param data: the vertical axis of accelerometer readings, as a vector
    :param filesf: the number of observations per second in the file
    :param a: coefficient matrix for filtering the signal, as found by Jan Brond
    :param b: coefficient matrix for filtering the signal, as found by Jan Brond
    :return: a vector containing the final counts
    '''
    
    deadband = 0.068
    sf = 30
    peakThreshold = 2.13
    adcResolution = 0.0164
    integN = 10
    gain = 0.965

    #if filesf>sf:
    data = resampy.resample(np.asarray(data), filesf, sf)

    B2, A2 = signal.butter(4, np.array([0.01, 7])/(sf/2), btype='bandpass')
    dataf = signal.filtfilt(B2, A2, data)

    B = B * gain

    #NB: no need for a loop here as we only have one axis in array
    fx8up = signal.lfilter(B, A, dataf)

    fx8 = pptrunc(fx8up[::3], peakThreshold) #downsampling is replaced by slicing with step parameter

    return runsum(np.floor(trunc(np.abs(fx8), deadband)/adcResolution), integN, 0)

In [5]:
def POI(sample):
    """
    Calculate the percentage of time spent immobile in a window
    """
    def calc_mob_per_min(countx, county, countz):
        mob_per_min = []
        for i in range(0, len(countx), 60):
            countx_1m = np.mean(countx[i:i+60])
            county_1m = np.mean(county[i:i+60])
            countz_1m = np.mean(countz[i:i+60])
            mob_per_min.append(np.mean([countx_1m, county_1m, countz_1m]))
        return mob_per_min

    def percentagem_of_immobility(mob_per_min):
        mob_per_min = np.asarray(mob_per_min)
        inactivity_counts = (mob_per_min <= 4).sum() 
        return inactivity_counts/len(mob_per_min)

    # calculate counts per axis
    c1_1s = counts(sample[0], 10)
    c2_1s = counts(sample[1], 10)
    c3_1s = counts(sample[2], 10)
    mob_per_min = calc_mob_per_min(c1_1s, c2_1s, c3_1s)
    POI = percentagem_of_immobility(mob_per_min)
    return POI
    
        

In [6]:
patients_char = []
for patient_id in y_target:
    row = df[df['record_id'] == int(patient_id)]
    admit = datetime.strptime(row['admit_datetime'].values[0], '%Y-%m-%d %H:%M:%S')
    birth = datetime.strptime(row['birth_date'].values[0], '%Y-%m-%d')
    dischg = datetime.strptime(row['dischg_datetime'].values[0], '%Y-%m-%d %H:%M:%S')
    lenght_stay = abs((dischg - admit).days)
    age = int((admit - birth).days/365)
    gender = row['sex'].values[0]
    race = row['race'].values[0]
    height = row['height_cm'].values[0]
    weight = row['weight_kgs'].values[0]
    patients_char.append({'patient_id': int(patient_id), 'sex': gender, 'race': race, 'height_cm': height, 'age':age, 'weight_kgs':weight, 'lenght_stay':lenght_stay})

df_char = pd.DataFrame(data=patients_char)

variable = 'lenght_stay'
print(f'{df_char[variable].mean():.2f}({df_char[variable].std():.2f})')

21.85(18.06)


In [9]:
df_char.head()

Unnamed: 0,patient_id,sex,race,height_cm,age,weight_kgs,lenght_stay
0,100,1,1,172.720001,58,82.553744,5
1,101,1,1,180.339996,49,101.406161,23
2,102,1,1,190.5,63,102.001501,42
3,103,1,1,162.559998,80,60.781328,15
4,106,1,1,177.800003,45,77.989475,13


In [8]:
df_char.loc[df_char.sex == 'MALE', 'sex']= 0
df_char.loc[df_char.sex != 'MALE', 'sex']= 1
df_char.loc[df_char.race == 'BLACK', 'race']= 0
df_char.loc[df_char.race != 'BLACK', 'race']= 1

In [20]:
X_char = []
col_patient = y_col_names.index('patient_id')
col_target = y_col_names.index('braden_score_class')
for xx, sample in zip(X.squeeze(), y):
    char_pat = df_char[df_char["patient_id"] == int(sample[col_patient])]
    char_final = list(char_pat.loc[:, char_pat.columns != "patient_id"].values[0])
    xx = np.transpose(xx, (1,0))
    poi = POI(xx)

    char_final.append(poi)
    X_char.append(char_final)

In [22]:
out_file = "/home/jsenadesouza/DA-healthy2patient/results/outcomes/dataset/dataset_demographics_poi.npz"
np.savez(out_file, X=tmp['X'], y=tmp['y'], y_col_names=tmp['y_col_names'], X_char=X_char)

In [43]:
X_char = []
col_patient = y_col_names.index('patient_id')
col_target = y_col_names.index('braden_score_class')
for sample in y:
    char_pat = df_char[df_char["patient_id"] == int(sample[col_patient])]
    char_final = char_pat.loc[:, char_pat.columns != "patient_id"].values[0]
    X_char.append(char_final)

In [40]:
np.unique(np.array(X_char)[:, -1], return_counts=True)

(array([0.        , 0.03333333, 0.06666667, 0.1       , 0.13333333,
        0.16666667, 0.2       , 0.23333333, 0.26666667, 0.3       ,
        0.33333333, 0.36666667, 0.4       , 0.43333333, 0.46666667,
        0.5       , 0.53333333, 0.56666667, 0.6       , 0.63333333,
        0.66666667, 0.7       , 0.73333333, 0.76666667, 0.8       ,
        0.83333333, 0.86666667, 0.9       , 0.93333333, 0.96666667,
        1.        ]),
 array([  41,   19,   29,   24,   22,   28,   25,   15,   26,   25,   30,
          30,   34,   35,   41,   42,   68,   65,   76,   83,   82,  117,
         123,  151,  203,  217,  210,  314,  406,  476, 3442]))

Classify with SVM using demographics features 

In [42]:
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, recall_score
from scipy import stats as st

skf = StratifiedKFold(n_splits=5)

y_braden =  y[:,col_target]
X_data = np.array(X_char)[:, 0:6]
cum_acc, cum_recall,cum_f1 = [], [], []
for train_index, test_index in skf.split(X_data, y_braden):
    clf = svm.SVC()
    clf = clf.fit(X_data[train_index], y_braden[train_index])
    y_pred = clf.predict(X_data[test_index])
    y_true = y_braden[test_index]
    cum_acc.append(accuracy_score(y_true, y_pred))
    cum_f1.append(f1_score(y_true, y_pred, average="macro"))
    cum_recall.append(recall_score(y_true, y_pred, average="macro", zero_division=0))
    print(f" acc = {accuracy_score(y_true, y_pred)}")
    print(f"f1 = {f1_score(y_true, y_pred, average='macro')}")
    print(f'recall = {recall_score(y_true, y_pred, average="macro", zero_division=0)}')
    
ci_mean = st.t.interval(0.9, len(cum_acc) - 1, loc=np.mean(cum_acc), scale=st.sem(cum_acc))
ci_f1 = st.t.interval(0.9, len(cum_f1) -1, loc=np.mean(cum_f1), scale=st.sem(cum_f1))
ci_recall = st.t.interval(0.9, len(cum_recall) -1, loc=np.mean(cum_recall), scale=st.sem(cum_recall))

print('accuracy: {:.2f} ± {:.2f}'.format(np.mean(cum_acc) * 100, abs(np.mean(cum_acc) - ci_mean[0]) * 100))
print('f1-score: {:.2f} ± {:.2f}'.format(np.mean(cum_f1) * 100, abs(np.mean(cum_f1) - ci_f1[0]) * 100))
print('recall: {:.2f} ± {:.2f}'.format(np.mean(cum_recall) * 100, abs(np.mean(cum_recall) - ci_recall[0]) * 100))

 acc = 0.6230769230769231
f1 = 0.38388625592417064
recall = 0.5
 acc = 0.6230769230769231
f1 = 0.38388625592417064
recall = 0.5
 acc = 0.25384615384615383
f1 = 0.25255648207202486
recall = 0.28110355253212393
 acc = 0.6230769230769231
f1 = 0.38388625592417064
recall = 0.5
 acc = 0.6104695919938414
f1 = 0.37906309751434036
recall = 0.4901112484548826
accuracy: 54.67 ± 15.62
f1-score: 35.67 ± 5.55
recall: 45.42 ± 9.24


In [32]:
y[:, col_patient]

array(['44', '44', '44', ..., '65', '65', '65'], dtype='<U18')

In [None]:
print("Extracting Features")
start = time()
with Pool(20) as p:
        X_feat = p.map(feature_extraction, X_trasp)
end = time()
print(f"{end-start:.4} seconds passed.")
#X_feat = np.array(feature_extraction(X))

In [None]:
from sklearn import preprocessing
def isfloat(num):
    try:
        float(num)
        return True
    except ValueError:
        return False

new_y = []
for col in range(y.shape[1]):
    if isfloat(y[0, col]):
        if len(new_y) == 0:
            new_y = y[:,col]
        else:
            new_y = np.vstack((new_y, y[:,col]))
    else:
        le = preprocessing.LabelEncoder()
        le.fit(y[:, col])
        transformed_col = le.transform(y[:, col])
        if len(new_y) == 0:
            new_y = transformed_col
        else:
            new_y = np.vstack((new_y, transformed_col))
new_y = np.transpose(np.array(new_y), (1,0)).astype(np.float32)