# Import modules and set data paths
---

In [1]:
import ast
import datetime as datetime
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# !pip install wfdb
import wfdb # Waveform Database Software Package (WFDB) for Python

In [2]:
# Directory the notebook kernel was started from (not referenced again in the visible cells).
my_path = os.getcwd()
# Project folder holding the derived label CSVs.
# NOTE: `path` is re-bound to `my_dir` in section (4), so its value depends on which cells have run.
path = '/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_XL'
# Root folder of the raw PTB-XL 1.0.1 download; the relative record paths in `filename_hr` are appended to it.
my_dir = '/home/ubuntu/dr-you-ecg-20220420_mount/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/'

# Preprocess
---
### (1) load data

In [3]:
# Read the shortened PTB-XL label sheet (21837 records) and discard the 'diagnosis' column.
Y = (
    pd.read_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_XL/PTB_label_short.csv')
    .drop(columns=['diagnosis'])
)
Y.head()

Unnamed: 0,ecg_id,patient_id,scp_codes,filename_hr
0,1,15709.0,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",records500/00000/00001_hr
1,2,13243.0,"{'NORM': 80.0, 'SBRAD': 0.0}",records500/00000/00002_hr
2,3,20372.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00003_hr
3,4,17014.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00004_hr
4,5,17448.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00005_hr


---
### (2) make the label table

In [4]:
# For each of the 21837 records, parse the 'scp_codes' dict literal and keep only the
# codes annotated with a likelihood of exactly 100.
# '/' in a code name is rewritten to '_' (the original author notes that '/' caused a syntax error).
# Records left with an empty list here are filtered out in the next cell (17903 remain).
scp_list = [
    [code.replace('/', '_') for code, likelihood in ast.literal_eval(raw_codes).items() if likelihood == 100]
    for raw_codes in Y['scp_codes']
]

Y['confirm_disease'] = scp_list

In [5]:
# Row positions of the records that kept at least one 100%-likelihood code.
full_index = [pos for pos, codes in enumerate(Y['confirm_disease']) if len(codes) != 0]

# Keep only those records and renumber the rows from 0.
df_Y = Y.loc[full_index].reset_index(drop=True)
df_Y

Unnamed: 0,ecg_id,patient_id,scp_codes,filename_hr,confirm_disease
0,1,15709.0,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",records500/00000/00001_hr,[NORM]
1,3,20372.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00003_hr,[NORM]
2,4,17014.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00004_hr,[NORM]
3,5,17448.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00005_hr,[NORM]
4,6,19005.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00006_hr,[NORM]
...,...,...,...,...,...
17898,21832,7954.0,"{'LAFB': 100.0, 'IVCD': 100.0, 'SR': 0.0}",records500/21000/21832_hr,"[LAFB, IVCD]"
17899,21833,17180.0,"{'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...",records500/21000/21833_hr,"[NDT, PVC]"
17900,21834,20703.0,"{'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}",records500/21000/21834_hr,[NORM]
17901,21836,8873.0,"{'NORM': 100.0, 'SR': 0.0}",records500/21000/21836_hr,[NORM]


In [6]:
# Collect the row labels of records that carry 'PMI' or 'STACH'; they are removed in a
# later cell because too few records carry these codes to train on (17898 records remain).
# PMI : (3199, 4558, 5950, 12360) / STACH : (10476)
# Each row is listed at most once, even if it carries both codes, so the later drop
# never sees a duplicate label.
EXCLUDED_CODES = {'STACH', 'PMI'}
r_list = [
    i for i, codes in enumerate(df_Y['confirm_disease'])
    if EXCLUDED_CODES.intersection(codes)
]
r_list

[3199, 4558, 5950, 10476, 12360]

In [7]:
# Inspect every record scheduled for removal; each one is followed by blank lines.
for row_pos in r_list:
    print(df_Y.iloc[row_pos], end='\n\n\n')

ecg_id                                                          3859
patient_id                                                    8132.0
scp_codes          {'PMI': 100.0, 'ASMI': 100.0, 'IVCD': 100.0, '...
filename_hr                                records500/03000/03859_hr
confirm_disease                                    [PMI, ASMI, IVCD]
Name: 3199, dtype: object


ecg_id                                                          5543
patient_id                                                    8963.0
scp_codes          {'PMI': 100.0, 'LMI': 50.0, 'SEHYP': 50.0, 'AB...
filename_hr                                records500/05000/05543_hr
confirm_disease                                                [PMI]
Name: 4558, dtype: object


ecg_id                                    7244
patient_id                             17069.0
scp_codes          {'PMI': 100.0, 'AFIB': 0.0}
filename_hr          records500/07000/07244_hr
confirm_disease                          [PMI]
Name: 5950, dtype: 

In [8]:
# Remove the records listed in r_list and renumber the rows from 0.
# WARNING: this cell is not idempotent. r_list holds row labels of the frame as it was
# BEFORE the reset_index below, so running the cell a second time would address
# different records. The assertion stops that from silently deleting unrelated rows.
codes_to_drop = df_Y.loc[r_list, 'confirm_disease']
assert all({'PMI', 'STACH'} & set(codes) for codes in codes_to_drop), (
    "r_list no longer points at PMI/STACH records - rebuild df_Y and r_list before dropping"
)

# One drop call for all labels: a single copy, and duplicate labels in r_list are harmless.
df_Y = df_Y.drop(index=r_list).reset_index(drop=True)
df_Y

Unnamed: 0,ecg_id,patient_id,scp_codes,filename_hr,confirm_disease
0,1,15709.0,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",records500/00000/00001_hr,[NORM]
1,3,20372.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00003_hr,[NORM]
2,4,17014.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00004_hr,[NORM]
3,5,17448.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00005_hr,[NORM]
4,6,19005.0,"{'NORM': 100.0, 'SR': 0.0}",records500/00000/00006_hr,[NORM]
...,...,...,...,...,...
17893,21832,7954.0,"{'LAFB': 100.0, 'IVCD': 100.0, 'SR': 0.0}",records500/21000/21832_hr,"[LAFB, IVCD]"
17894,21833,17180.0,"{'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...",records500/21000/21833_hr,"[NDT, PVC]"
17895,21834,20703.0,"{'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}",records500/21000/21834_hr,[NORM]
17896,21836,8873.0,"{'NORM': 100.0, 'SR': 0.0}",records500/21000/21836_hr,[NORM]


In [66]:
# save
#df_Y.to_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_table_220616.csv', index=False)

---
### (3) make the y_dataset (with sklearn.preprocessing.MultiLabelBinarizer)

In [None]:
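# import the table
# NOTE: after a CSV round-trip 'confirm_disease' is read back as a string (e.g. "['NORM']"),
# so it must be parsed (e.g. with ast.literal_eval) before it is used as a list of codes.
#df_Y=pd.read_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_table_220616.csv')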

In [9]:
# Plain Python list holding one list of codes per record; show the first five.
y_list = df_Y['confirm_disease'].tolist()
y_list[:5]

[['NORM'], ['NORM'], ['NORM'], ['NORM'], ['NORM']]

In [40]:
# One-hot encode the per-record code lists: one row per record, one 0/1 column per
# distinct code (column order is given by mlb.classes_).
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df_Y['confirm_disease'].to_numpy())

# The label matrix is positional, so it must have exactly one row per record in df_Y.
assert labels.shape[0] == len(df_Y), "expected one label row per record in df_Y"
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [41]:
# labels: (number of records, number of distinct codes)
labels.shape

(17898, 50)

In [45]:
# classes (50): the code each column of `labels` stands for, in column order
mlb.classes_ 

array(['1AVB', '2AVB', '3AVB', 'AFIB', 'AFLT', 'ALMI', 'AMI', 'ANEUR',
       'ASMI', 'BIGU', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'ILBBB', 'ILMI',
       'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'IPLMI',
       'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN',
       'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO_LAE', 'LMI', 'LNGQT', 'LPFB',
       'LVH', 'NDT', 'NORM', 'NST_', 'PAC', 'PACE', 'PSVT', 'PVC',
       'RAO_RAE', 'RVH', 'SEHYP', 'WPW'], dtype=object)

In [42]:
# save the y_dataset 
# np.save('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_Y_0616',labels)

In [46]:
# make a label table: record metadata followed by one 0/1 column per code
#
# `labels` is positional (row k belongs to the k-th record that was binarized), while
# inserting a pandas Series aligns on index labels. If df_Y was modified or re-indexed
# after `labels` was computed, the two silently disagree. Check the pairing first,
# then attach the metadata by position.
assert len(df_Y) == labels.shape[0], "df_Y and labels have different numbers of rows"
decoded_codes = mlb.inverse_transform(labels)
assert all(
    set(decoded) == set(listed)
    for decoded, listed in zip(decoded_codes, df_Y['confirm_disease'])
), "labels rows do not match df_Y['confirm_disease'] - re-run the binarizer cell"

df_ptb = pd.DataFrame(columns=mlb.classes_, data=labels)
# Each insert goes to position 0, so the final order is
# ecg_id, patient_id, confirm_disease, filename_hr, <code columns>.
for meta_col in ('filename_hr', 'confirm_disease', 'patient_id', 'ecg_id'):
    df_ptb.insert(0, meta_col, df_Y[meta_col].to_numpy())
df_ptb.head()

Unnamed: 0,ecg_id,patient_id,confirm_disease,filename_hr,1AVB,2AVB,3AVB,AFIB,AFLT,ALMI,...,NORM,NST_,PAC,PACE,PSVT,PVC,RAO_RAE,RVH,SEHYP,WPW
0,1,15709.0,[NORM],records500/00000/00001_hr,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,3,20372.0,[NORM],records500/00000/00003_hr,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,4,17014.0,[NORM],records500/00000/00004_hr,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,5,17448.0,[NORM],records500/00000/00005_hr,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,6,19005.0,[NORM],records500/00000/00006_hr,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [47]:
# save the label table
# df_ptb.to_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_multi_label_220616.csv', index=False)

In [49]:
# check: spot-check one row - the codes listed in confirm_disease should be the columns set to 1
df_ptb.loc[17000]

ecg_id                                 20749
patient_id                           19560.0
confirm_disease                [ASMI, ILBBB]
filename_hr        records500/20000/20749_hr
1AVB                                       0
2AVB                                       0
3AVB                                       0
AFIB                                       0
AFLT                                       0
ALMI                                       0
AMI                                        0
ANEUR                                      0
ASMI                                       0
BIGU                                       0
CLBBB                                      0
CRBBB                                      0
DIG                                        0
EL                                         0
ILBBB                                      0
ILMI                                       0
IMI                                        0
INJAL                                      0
INJAS     

---
### (4) make the x_dataset

In [None]:
def load_raw_data(df, sampling_rate, path):
    """Read the waveform of every record listed in ``df`` and stack them into one array.

    Parameters
    ----------
    df : pandas.DataFrame
        Record table. Must contain the column holding the relative record paths for the
        requested sampling rate: ``filename_lr`` for 100 Hz, ``filename_hr`` otherwise
        (this follows the PTB-XL file naming convention).
    sampling_rate : int
        Requested sampling rate in Hz. ``100`` selects ``filename_lr``; any other value
        (e.g. ``500``) selects ``filename_hr``.
    path : str
        Prefix that is concatenated directly in front of each relative record path, so
        it should end with a path separator.

    Returns
    -------
    numpy.ndarray
        The signal part of each ``wfdb.rdsamp`` result, stacked along a new first axis
        (one entry per record). Empty with shape ``(0,)`` when ``df`` has no rows.

    Raises
    ------
    KeyError
        If ``df`` lacks the filename column required for ``sampling_rate``.
    """
    # Honour the requested sampling rate instead of always reading the 500 Hz files.
    column = 'filename_lr' if sampling_rate == 100 else 'filename_hr'
    if column not in df.columns:
        raise KeyError(
            f"df has no '{column}' column required for sampling_rate={sampling_rate}"
        )

    # wfdb.rdsamp returns (signal, metadata); only the signal is kept, so the metadata
    # of thousands of records is not held in memory at the same time.
    signals = [wfdb.rdsamp(path + f)[0] for f in df[column]]
    return np.array(signals)

# Point `path` at the raw PTB-XL root (this re-binds the `path` set in the config cell)
# and request the 500 Hz recordings.
path = my_dir
sampling_rate = 500

# Load the raw signal of every record kept in df_Y.
X = load_raw_data(df=df_Y, sampling_rate=sampling_rate, path=path)

In [51]:
# Peek at the loaded signal array (NumPy abbreviates the repr of large arrays).
X 

array([[[-0.115, -0.05 ,  0.065, ..., -0.035, -0.035, -0.075],
        [-0.115, -0.05 ,  0.065, ..., -0.035, -0.035, -0.075],
        [-0.115, -0.05 ,  0.065, ..., -0.035, -0.035, -0.075],
        ...,
        [ 0.21 ,  0.205, -0.005, ...,  0.185,  0.17 ,  0.18 ],
        [ 0.21 ,  0.205, -0.005, ...,  0.185,  0.17 ,  0.18 ],
        [ 0.21 ,  0.205, -0.005, ...,  0.185,  0.17 ,  0.18 ]],

       [[-0.035, -0.07 , -0.035, ..., -0.1  , -0.075, -0.065],
        [-0.035, -0.07 , -0.035, ..., -0.1  , -0.075, -0.065],
        [-0.035, -0.07 , -0.035, ..., -0.1  , -0.075, -0.065],
        ...,
        [-0.04 , -0.18 , -0.14 , ..., -0.015,  0.02 ,  0.025],
        [-0.04 , -0.18 , -0.14 , ..., -0.015,  0.02 ,  0.025],
        [-0.04 , -0.18 , -0.14 , ..., -0.015,  0.02 ,  0.025]],

       [[-0.055, -0.155, -0.1  , ..., -0.305, -0.185, -0.175],
        [-0.055, -0.155, -0.1  , ..., -0.305, -0.185, -0.175],
        [-0.055, -0.155, -0.1  , ..., -0.305, -0.185, -0.175],
        ...,
        [ 0.

In [60]:
# Number of loaded records; should equal the number of rows in the label matrix.
len(X)

17898

In [61]:
# 8 lead and Padding
# Channel index : 0, 1, 2,   3,   4,   5,   6 ... 11
# PTB-XL lead   : I, II, III, aVR, aVL, aVF, V1 ... V6
#   (order as given in the PTB-XL description - TODO confirm against the 'sig_name'
#    metadata returned by wfdb.rdsamp)
# Channels 2-5 are dropped, leaving 8 leads: I, II and V1-V6.
LEAD_IDX = [0, 1, 6, 7, 8, 9, 10, 11]
# Zeros prepended on the time axis (records come out at 5120 samples, per the saved shape below).
PAD_FRONT = 120

assert X.ndim == 3 and X.shape[2] > max(LEAD_IDX), "X must be (records, samples, leads)"

# Select the leads and zero-pad the start of the time axis for all records in one step,
# instead of building a DataFrame per record.
X_dataset = np.pad(
    X[:, :, LEAD_IDX],
    ((0, 0), (PAD_FRONT, 0), (0, 0)),
    'constant',
    constant_values=0,
)

In [62]:
# (records, samples after padding, selected leads)
X_dataset.shape

(17898, 5120, 8)

In [64]:
# Save the x_dataset (padded 8-lead signals) to disk.
# np.save appends the '.npy' extension because the given file name does not end with it.
np.save('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_X_0616',X_dataset)