# Unzip the file
- [source] https://physionet.org/content/ptb-xl/1.0.1/
---

In [None]:
import zipfile

try:
    with zipfile.ZipFile("/home/ubuntu/dr-you-ecg-20220420_mount/PTB_XL.zip") as zf:
        zf.extractall(path = '/home/ubuntu/dr-you-ecg-20220420_mount')
        print("uncompress success")
except:
    print("uncompress fail")

In [2]:
my_path = os.getcwd()
my_dir = '/home/ubuntu/dr-you-ecg-20220420_mount/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/'

# Import module and data
---

In [9]:
import os
import numpy as np
import pandas as pd
import datetime as datetime

# !pip install wfdb
import wfdb # Waveform Database Software Package (WFDB) for Python
import ast

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/envs/tensorflow2_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

# Preprocess
---
### (1) load data

In [None]:
def load_raw_data(df, sampling_rate, path):
    data = [wfdb.rdsamp(path+f) for f in df.filename_hr]
    data = np.array([signal for signal, meta in data])
    return data

path = my_dir
sampling_rate=500

# load and convert annotation data
Y = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

# Load raw signal data
X = load_raw_data(Y, sampling_rate, path)

# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

In [63]:
Y.scp_codes

ecg_id
1                 {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                             {'NORM': 80.0, 'SBRAD': 0.0}
3                               {'NORM': 100.0, 'SR': 0.0}
4                               {'NORM': 100.0, 'SR': 0.0}
5                               {'NORM': 100.0, 'SR': 0.0}
                               ...                        
21833    {'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...
21834             {'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}
21835                           {'ISCAS': 50.0, 'SR': 0.0}
21836                           {'NORM': 100.0, 'SR': 0.0}
21837                           {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, Length: 21837, dtype: object

In [98]:
agg_df

Unnamed: 0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7
IMI,inferior myocardial infarction,1.0,,,MI,IMI,Myocardial Infarction,inferior myocardial infarction,161.0,,,
ASMI,anteroseptal myocardial infarction,1.0,,,MI,AMI,Myocardial Infarction,anteroseptal myocardial infarction,165.0,,,
LVH,left ventricular hypertrophy,1.0,,,HYP,LVH,Ventricular Hypertrophy,left ventricular hypertrophy,142.0,,C71076,
LAFB,left anterior fascicular block,1.0,,,CD,LAFB/LPFB,Intraventricular and intra-atrial Conduction d...,left anterior fascicular block,101.0,MDC_ECG_BEAT_BLK_ANT_L_HEMI,C62267,D3-33140
ISC_,non-specific ischemic,1.0,,,STTC,ISC_,Basic roots for coding ST-T changes and abnorm...,ischemic ST-T changes,226.0,,,


### ※ include and reset the index 
∵ some elements of index include string '/' and it makes an error

In [47]:
agg_df2 = agg_df.reset_index(level=0)
agg_df2 = agg_df2.rename(columns={'index':'diag'})
agg_df2

Unnamed: 0,diag,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
0,NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
1,NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
2,DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
3,LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
4,NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7
5,IMI,inferior myocardial infarction,1.0,,,MI,IMI,Myocardial Infarction,inferior myocardial infarction,161.0,,,
6,ASMI,anteroseptal myocardial infarction,1.0,,,MI,AMI,Myocardial Infarction,anteroseptal myocardial infarction,165.0,,,
7,LVH,left ventricular hypertrophy,1.0,,,HYP,LVH,Ventricular Hypertrophy,left ventricular hypertrophy,142.0,,C71076,
8,LAFB,left anterior fascicular block,1.0,,,CD,LAFB/LPFB,Intraventricular and intra-atrial Conduction d...,left anterior fascicular block,101.0,MDC_ECG_BEAT_BLK_ANT_L_HEMI,C62267,D3-33140
9,ISC_,non-specific ischemic,1.0,,,STTC,ISC_,Basic roots for coding ST-T changes and abnorm...,ischemic ST-T changes,226.0,,,


In [102]:
D_list = agg_df2['diag'].to_numpy()
D_list

array(['NDT', 'NST_', 'DIG', 'LNGQT', 'NORM', 'IMI', 'ASMI', 'LVH',
       'LAFB', 'ISC_', 'IRBBB', '1AVB', 'IVCD', 'ISCAL', 'CRBBB', 'CLBBB',
       'ILMI', 'LAO/LAE', 'AMI', 'ALMI', 'ISCIN', 'INJAS', 'LMI', 'ISCIL',
       'LPFB', 'ISCAS', 'INJAL', 'ISCLA', 'RVH', 'ANEUR', 'RAO/RAE', 'EL',
       'WPW', 'ILBBB', 'IPLMI', 'ISCAN', 'IPMI', 'SEHYP', 'INJIN',
       'INJLA', 'PMI', '3AVB', 'INJIL', '2AVB'], dtype=object)

In [107]:
# find which elements of index include string '/'
for i,D in enumerate(D_list):
    if "/" in D:
        print(D)
        print(i)

LAO/LAE
17
RAO/RAE
30


In [109]:
# replace '/' with '_'
D_list[17] = 'LAO_LAE'
D_list[30] = 'RAO_RAE'

In [117]:
# replace '/' index with '_' index
agg_df2['Diag'] = D_list
agg_df2 = agg_df2.drop(['diag'], axis=1)
agg_df2 = agg_df2.set_index("Diag")

agg_df2['Diagnosis'] = D_list
agg_df2

Unnamed: 0_level_0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code,Diagnosis
Diag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,,NDT
NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,,NST_
DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,,DIG
LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,,LNGQT
NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7,NORM
IMI,inferior myocardial infarction,1.0,,,MI,IMI,Myocardial Infarction,inferior myocardial infarction,161.0,,,,IMI
ASMI,anteroseptal myocardial infarction,1.0,,,MI,AMI,Myocardial Infarction,anteroseptal myocardial infarction,165.0,,,,ASMI
LVH,left ventricular hypertrophy,1.0,,,HYP,LVH,Ventricular Hypertrophy,left ventricular hypertrophy,142.0,,C71076,,LVH
LAFB,left anterior fascicular block,1.0,,,CD,LAFB/LPFB,Intraventricular and intra-atrial Conduction d...,left anterior fascicular block,101.0,MDC_ECG_BEAT_BLK_ANT_L_HEMI,C62267,D3-33140,LAFB
ISC_,non-specific ischemic,1.0,,,STTC,ISC_,Basic roots for coding ST-T changes and abnorm...,ischemic ST-T changes,226.0,,,,ISC_


In [116]:
agg_df2.index

Index(['NDT', 'NST_', 'DIG', 'LNGQT', 'NORM', 'IMI', 'ASMI', 'LVH', 'LAFB',
       'ISC_', 'IRBBB', '1AVB', 'IVCD', 'ISCAL', 'CRBBB', 'CLBBB', 'ILMI',
       'LAO_LAE', 'AMI', 'ALMI', 'ISCIN', 'INJAS', 'LMI', 'ISCIL', 'LPFB',
       'ISCAS', 'INJAL', 'ISCLA', 'RVH', 'ANEUR', 'RAO_RAE', 'EL', 'WPW',
       'ILBBB', 'IPLMI', 'ISCAN', 'IPMI', 'SEHYP', 'INJIN', 'INJLA', 'PMI',
       '3AVB', 'INJIL', '2AVB'],
      dtype='object', name='Diag')

---
### (2) make the diagnosis column

In [118]:
def function_diagnosis(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df2.index:
            tmp.append(agg_df2.loc[key].Diagnosis)
    return list(set(tmp))

# Apply diagnostic superclass
Y['diagnosis'] = Y.scp_codes.apply(function_diagnosis)

In [138]:
Y

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_superclass,diagnosis
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr,[NORM],[NORM]
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr,[NORM],[NORM]
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr,[NORM],[NORM]
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr,[NORM],[NORM]
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr,[NORM],[NORM]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21833,17180.0,67.0,1,,,1.0,2.0,AT-60 3,2001-05-31 09:14:35,ventrikulÄre extrasystole(n) sinustachykardie ...,...,", alles,",,,1ES,,7,records100/21000/21833_lr,records500/21000/21833_hr,[STTC],[NDT]
21834,20703.0,93.0,0,,,1.0,2.0,AT-60 3,2001-06-05 11:33:39,sinusrhythmus lagetyp normal qrs(t) abnorm ...,...,,,,,,4,records100/21000/21834_lr,records500/21000/21834_hr,[NORM],[NORM]
21835,19311.0,59.0,1,,,1.0,2.0,AT-60 3,2001-06-08 10:30:27,sinusrhythmus lagetyp normal t abnorm in anter...,...,", I-AVR,",,,,,2,records100/21000/21835_lr,records500/21000/21835_hr,[STTC],[ISCAS]
21836,8873.0,64.0,1,,,1.0,2.0,AT-60 3,2001-06-09 18:21:49,supraventrikulÄre extrasystole(n) sinusrhythmu...,...,,,,SVES,,8,records100/21000/21836_lr,records500/21000/21836_hr,[NORM],[NORM]


In [123]:
condition = Y['patient_id'] == 13958
Y[condition] 

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_superclass,diagnosis
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49,13958.0,23.0,0,,76.0,2.0,0.0,CS-12 E,1985-04-24 07:20:19,sinusrhythmus rechtstyp unspezifische intraven...,...,", I-V2,",,,,,9,records100/00000/00049_lr,records500/00000/00049_hr,[CD],[IVCD]


In [139]:
# save
Y.to_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_label_total.csv', index=False)

In [136]:
Y2 = Y.reset_index(level=0)

PTB_label_short = Y2[['ecg_id', 'patient_id', 'scp_codes', 'diagnosis', 'filename_hr']]
PTB_label_short

Unnamed: 0,ecg_id,patient_id,scp_codes,diagnosis,filename_hr
0,1,15709.0,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",[NORM],records500/00000/00001_hr
1,2,13243.0,"{'NORM': 80.0, 'SBRAD': 0.0}",[NORM],records500/00000/00002_hr
2,3,20372.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/00000/00003_hr
3,4,17014.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/00000/00004_hr
4,5,17448.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/00000/00005_hr
...,...,...,...,...,...
21832,21833,17180.0,"{'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...",[NDT],records500/21000/21833_hr
21833,21834,20703.0,"{'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}",[NORM],records500/21000/21834_hr
21834,21835,19311.0,"{'ISCAS': 50.0, 'SR': 0.0}",[ISCAS],records500/21000/21835_hr
21835,21836,8873.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/21000/21836_hr


In [137]:
# save
PTB_label_short.to_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_label_short.csv', index=False)

---
### (3) make the STEMI column

In [201]:
diagnosed = PTB_label_short['diagnosis']
diagnosed

np_diag = diagnosed.to_numpy()

0         [NORM]
1         [NORM]
2         [NORM]
3         [NORM]
4         [NORM]
          ...   
21832      [NDT]
21833     [NORM]
21834    [ISCAS]
21835     [NORM]
21836     [NORM]
Name: diagnosis, Length: 21837, dtype: object

In [203]:
STEMI = ['IMI', 'ASMI', 'ILMI', 'AMI', 'ALMI', 'LMI', 'IPLMI', 'IPMI', 'PMI']
STEMI_index = []

for i, elem in enumerate(np_diag):
    for e in elem:
        if e in STEMI:
            STEMI_index.append(i)
STEMI_index

[7,
 38,
 49,
 62,
 76,
 102,
 105,
 130,
 138,
 141,
 145,
 151,
 152,
 154,
 160,
 161,
 174,
 176,
 180,
 180,
 180,
 181,
 183,
 183,
 188,
 188,
 198,
 209,
 210,
 222,
 233,
 233,
 239,
 256,
 256,
 257,
 262,
 265,
 265,
 266,
 268,
 269,
 269,
 270,
 273,
 280,
 280,
 280,
 281,
 286,
 289,
 293,
 297,
 306,
 307,
 309,
 310,
 318,
 322,
 324,
 336,
 379,
 379,
 382,
 406,
 408,
 417,
 422,
 424,
 428,
 429,
 441,
 452,
 454,
 463,
 476,
 481,
 481,
 482,
 485,
 485,
 491,
 495,
 499,
 506,
 509,
 511,
 512,
 513,
 514,
 517,
 525,
 527,
 529,
 531,
 534,
 534,
 535,
 536,
 539,
 543,
 544,
 546,
 553,
 555,
 557,
 562,
 562,
 563,
 566,
 576,
 578,
 580,
 591,
 593,
 593,
 598,
 599,
 599,
 600,
 603,
 606,
 607,
 609,
 609,
 616,
 621,
 622,
 622,
 626,
 633,
 635,
 635,
 645,
 645,
 650,
 650,
 651,
 660,
 680,
 684,
 690,
 690,
 698,
 699,
 700,
 702,
 703,
 705,
 705,
 707,
 712,
 715,
 716,
 719,
 721,
 722,
 726,
 726,
 727,
 727,
 730,
 733,
 733,
 737,
 741,
 744,
 746

In [214]:
len(STEMI_index)

6473

In [207]:
PTB_label_short['STEMI'] = 0
PTB_label_short

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  PTB_label_short['STEMI'] = 0


Unnamed: 0,ecg_id,patient_id,scp_codes,diagnosis,filename_hr,STEMI
0,1,15709.0,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",[NORM],records500/00000/00001_hr,0
1,2,13243.0,"{'NORM': 80.0, 'SBRAD': 0.0}",[NORM],records500/00000/00002_hr,0
2,3,20372.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/00000/00003_hr,0
3,4,17014.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/00000/00004_hr,0
4,5,17448.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/00000/00005_hr,0
...,...,...,...,...,...,...
21832,21833,17180.0,"{'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...",[NDT],records500/21000/21833_hr,0
21833,21834,20703.0,"{'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}",[NORM],records500/21000/21834_hr,0
21834,21835,19311.0,"{'ISCAS': 50.0, 'SR': 0.0}",[ISCAS],records500/21000/21835_hr,0
21835,21836,8873.0,"{'NORM': 100.0, 'SR': 0.0}",[NORM],records500/21000/21836_hr,0


In [218]:
for index in AMI_index:
    PTB_label_short.loc[index, 'STEMI'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [223]:
# save
PTB_label_short.to_csv('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_STEMI_label.csv', index=False)

In [222]:
# check
condition = PTB_label_short['STEMI']==1
PTB_label_short[condition]

Unnamed: 0,ecg_id,patient_id,scp_codes,diagnosis,filename_hr,STEMI
7,8,11275.0,"{'IMI': 35.0, 'ABQRS': 0.0, 'SR': 0.0}",[IMI],records500/00000/00008_hr,1
38,39,13619.0,"{'IMI': 15.0, 'LNGQT': 100.0, 'NST_': 100.0, '...","[IMI, NST_, LNGQT, DIG]",records500/00000/00039_hr,1
49,50,16961.0,"{'LMI': 15.0, 'IVCD': 100.0, 'SR': 0.0}","[LMI, IVCD]",records500/00000/00050_hr,1
62,63,15265.0,"{'ASMI': 15.0, 'ABQRS': 0.0, 'SR': 0.0}",[ASMI],records500/00000/00063_hr,1
76,77,8555.0,"{'AMI': 50.0, 'IRBBB': 100.0, 'SR': 0.0}","[IRBBB, AMI]",records500/00000/00077_hr,1
...,...,...,...,...,...,...
21819,21820,18655.0,"{'IMI': 15.0, 'SR': 0.0}",[IMI],records500/21000/21820_hr,1
21823,21824,19977.0,"{'AMI': 15.0, 'ABQRS': 0.0, 'SR': 0.0}",[AMI],records500/21000/21824_hr,1
21825,21826,9178.0,"{'IMI': 80.0, 'ABQRS': 0.0, 'SARRH': 0.0}",[IMI],records500/21000/21826_hr,1
21826,21827,13862.0,"{'IMI': 100.0, 'ISCLA': 50.0, 'ABQRS': 0.0, 'S...","[IMI, ISCLA]",records500/21000/21827_hr,1


# Data Split
---

In [143]:
# Split data into train and test
test_fold = 10
# Train
x_train = X[np.where(Y.strat_fold != test_fold)]
y_train = PTB_label_short[(PTB_label_short.strat_fold != test_fold)].STEMI
# Test
x_test = X[np.where(Y.strat_fold == test_fold)]
y_test = PTB_label_short[PTB_label_short.strat_fold == test_fold].STEMI

In [141]:
x_train.shape

(19634, 5000, 12)

In [142]:
x_test.shape

(2203, 5000, 12)

---
### (1) x dataset
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
- I, II, III, aVL, aVR, aVF, V1-V6

In [169]:
# Trainset

x_trainset = []
for i in x_train:
    df_ecg = pd.DataFrame(i)
    nm = df_ecg[[0, 1, 6, 7, 8, 9, 10, 11]]
    nm_numpy = nm.to_numpy()
    pad_ecg = np.pad(nm,((120,0),(0,0)),'constant',constant_values=0) # Lead zero padded to 5120
    x_trainset.append(pad_ecg)
    
X_trainset = np.array(x_trainset)

In [172]:
X_trainset.shape

(19634, 5120, 8)

In [174]:
# Testset

x_testset = []
for i in x_test:
    df_ecg = pd.DataFrame(i)
    nm = df_ecg[[0, 1, 6, 7, 8, 9, 10, 11]]
    nm_numpy = nm.to_numpy()
    pad_ecg = np.pad(nm,((120,0),(0,0)),'constant',constant_values=0) # Lead zero padded to 5120
    x_testset.append(pad_ecg)
    
X_testset = np.array(x_testset)

In [175]:
X_testset.shape

(2203, 5120, 8)

In [177]:
# x_dataset 
np.save('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_x_train_0527',X_trainset)
np.save('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_x_test_0527',X_testset)

In [250]:
# y_dataset 
np.save('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_y_train_0527',y_train_npy)
np.save('/home/ubuntu/dr-you-ecg-20220420_mount/STEMI_JKL/PTB_y_test_0527',y_test_npy)