In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast
import collections

In [None]:
path = '../benchmark/RAW_DATA/PTBXL_RAW_temp/physionet.org/files/ptb-xl/1.0.3/'

# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]


In [2]:
def load_raw_data(df, sampling_rate, path):
    # files = os.listdir(path+'records100/00000/')
    # files_unique_name = list(set([f[:-4] for f in files]))
    data = [wfdb.rdsamp(path+f) for f in df.filename_lr]
    data = np.array([signal for signal, meta in data])
    return data


def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))


In [3]:
# load and convert annotation data
Y = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
Y = Y.iloc[:366]
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

# Load raw signal data
X = load_raw_data(Y, sampling_rate=100, path=path)

In [4]:
# Apply diagnostic superclass
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

# Split data into train and test
test_fold = 10
# Train
X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass
# Test
X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass

In [14]:
ls_y_train = y_train.tolist()
ls_y_train

[['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['MI'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 [],
 [],
 ['NORM'],
 [],
 ['NORM'],
 ['STTC'],
 [],
 ['NORM'],
 ['NORM'],
 ['STTC'],
 ['NORM'],
 ['STTC'],
 ['NORM'],
 ['HYP'],
 ['NORM'],
 ['CD'],
 ['NORM'],
 [],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['STTC', 'MI'],
 ['CD'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['CD', 'HYP'],
 ['NORM'],
 ['NORM'],
 ['STTC'],
 ['CD'],
 ['MI', 'CD'],
 ['NORM'],
 ['CD'],
 ['NORM'],
 ['STTC'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['MI', 'CD'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['STTC', 'CD'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['HYP'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['NORM'],
 ['STTC', 'CD'],
 ['MI', 'CD'],
 ['NOR

In [6]:
y_train[20:40]

ecg_id
22        [STTC]
23            []
24        [NORM]
25        [NORM]
26        [STTC]
27        [NORM]
28        [STTC]
29        [NORM]
30         [HYP]
31        [NORM]
32          [CD]
33        [NORM]
34            []
35        [NORM]
36        [NORM]
37        [NORM]
39    [STTC, MI]
41          [CD]
42        [NORM]
43        [NORM]
Name: diagnostic_superclass, dtype: object

In [7]:
collections.Counter([1,2,3,2,1])

Counter({1: 2, 2: 2, 3: 1})

In [8]:
dict_y_all = dict()
dict_y_train = dict()
keep_x_indices = []
for i, y in enumerate(ls_y_train):
    if len(y) > 0:
        if len(y)==1:
            if y[0] in dict_y_train.keys():
                dict_y_train[y[0]] += 1
            else:
                dict_y_train[y[0]] = 1
            keep_x_indices.append(i)
        else:
            y = '+'.join(y)
            if y in dict_y_all.keys():
                dict_y_all[y] += 1
            else:
                dict_y_all[y] = 1

In [9]:
len(keep_x_indices)

267

In [10]:
dict_y_train, dict_y_all

({'NORM': 192, 'MI': 16, 'STTC': 32, 'HYP': 5, 'CD': 22},
 {'STTC+MI': 7,
  'CD+HYP': 3,
  'MI+CD': 9,
  'STTC+CD': 5,
  'MI+HYP': 3,
  'STTC+MI+HYP': 4,
  'STTC+HYP': 5,
  'STTC+MI+CD': 2,
  'CD+NORM': 4,
  'STTC+MI+HYP+CD': 1,
  'STTC+CD+HYP': 1})

In [21]:
X_train.shape, y_train.shape

((318, 1000, 12), (318,))

In [23]:
keep_x = []
keep_y = []
for i in keep_x_indices:
    keep_x.append(X_train[i,:,:])
    keep_y.append(ls_y_train[i][0])

In [27]:
keep_x = np.array(keep_x)
keep_y = np.array(keep_y)
keep_x.shape, keep_y.shape

((267, 1000, 12), (267,))

In [25]:
keep_y

['NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'MI',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'STTC',
 'NORM',
 'NORM',
 'STTC',
 'NORM',
 'STTC',
 'NORM',
 'HYP',
 'NORM',
 'CD',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'CD',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'STTC',
 'CD',
 'NORM',
 'CD',
 'NORM',
 'STTC',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'HYP',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'NORM',
 'STTC',
 'NORM',
 'MI',
 'STTC',
 'STTC',
 'STTC',
 'NORM',
 'HYP',
 'NORM',
 'STTC',
 'NORM',
 'NORM',
 'NORM',
 'MI',
 'CD',
 'MI',
 'NORM',
 'NORM',
 'NORM',
 'MI

In [29]:
keep_y[keep_y=='NORM'] = 0
keep_y[keep_y=='CD'] = 1
keep_y[keep_y=='MI'] = 2
keep_y[keep_y=='HYP'] = 3
keep_y[keep_y=='STTC'] = 4
keep_y.astype(int)

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0,
       4, 0, 3, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 1, 0, 1, 0, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 4, 0, 2, 4, 4, 4, 0, 3, 0, 4, 0, 0, 0, 2, 1, 2,
       0, 0, 0, 2, 0, 4, 0, 0, 4, 1, 4, 4, 0, 0, 2, 0, 0, 0, 2, 0, 4, 0,
       1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 2, 4, 4, 0, 0, 0, 2, 0, 2, 0, 0, 4, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 3, 0, 0, 0,
       2, 4, 4, 0, 2, 0, 2, 4, 0, 4, 4, 0, 1, 0, 3, 0, 1, 0, 0, 4, 0, 0,
       4, 0, 1, 1, 1, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0,
       0, 0, 0])

In [7]:
from sklearn.preprocessing import StandardScaler
import numpy as np
scaler = StandardScaler()
x_train = np.load('../benchmark/RAW_DATA/PTBXL_SUPERCLASS/x_train.npy', allow_pickle=True)
y_train = np.load('../benchmark/RAW_DATA/PTBXL_SUPERCLASS/y_train.npy', allow_pickle=True)
x_test = np.load('../benchmark/RAW_DATA/PTBXL_SUPERCLASS/x_test.npy', allow_pickle=True)
y_test = np.load('../benchmark/RAW_DATA/PTBXL_SUPERCLASS/y_test.npy', allow_pickle=True)
# scaler.fit(x_train.reshape(-1, 1))


In [8]:
x_train.shape, x_test.shape

((14594, 1000, 12), (1650, 1000, 12))

In [9]:
import collections
collections.Counter(y_train), collections.Counter(y_test)

(Counter({0: 8157, 2: 2276, 4: 2158, 3: 479, 1: 1524}),
 Counter({0: 912, 2: 256, 1: 184, 4: 242, 3: 56}))