In [1]:
import librosa
import librosa.display
import os
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import datetime as dt
from keras import layers
from keras import models
from keras import optimizers
from sklearn.model_selection import train_test_split

# Restrict this process to the GPU with index 1.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# TF1-style session configuration: allow_growth asks TensorFlow to allocate GPU
# memory on demand rather than reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): `session` is not referenced again in the visible cells and is never
# registered with the Keras backend — confirm the option takes effect as intended.
session = tf.Session(config=config)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Seed every random number generator the notebook draws from to improve reproducibility:
#   - TensorFlow's graph-level seed,
#   - Python's `random` module (used for shuffling and for random window starts),
#   - NumPy's global RNG (used by scikit-learn when no random_state is given,
#     and by Keras when it derives seeds for weight initialisers).
tf.set_random_seed(1234)
random.seed(100)
np.random.seed(1234)

In [3]:
# define parameters
# Number of recordings per training step.
batch_size = 32
# Length (along the first axis of a mel array) of the window fed to the model.
minimum_len = 128
# Number of passes of the training loop.
epochs = 200
# NOTE(review): val_period is not referenced anywhere in the visible cells.
val_period = 10 # run validation once every how many epochs?
# Loss passed to model.compile.
loss_function = 'categorical_crossentropy'
# Activation of the final Dense layer.
activation_function = 'softmax'

### Getting file names

In [4]:
#currdir= os.getcwd()
# Root of the challenge data; every other directory is derived from it.
rootdir = '/home/taejoon/PhysioNetChallenge'
# Raw recordings (.mat) and their header files (.hea).
input_directory = os.path.join(rootdir, 'Training_WFDB')
# Pre-computed mel arrays (.npy), one per recording.
mel_name = 'Mel_data_20200402_128'
mel_directory = os.path.join(rootdir, mel_name)
# Model checkpoints are written here by the training loop.
results_directory = os.path.join(rootdir, 'results')
#save_directory = os.path.join(currdir, '')

# makedirs(exist_ok=True) creates missing parents and avoids the check-then-create race
# that a separate isdir()/mkdir() pair has.
for _directory in (input_directory, mel_directory, results_directory):
    os.makedirs(_directory, exist_ok=True)

In [5]:
# Find files: regular, non-hidden files in input_directory with a '.mat' extension.
# The extension check includes the dot so that names merely ending in "mat"
# (which the later '.mat' -> '.hea' substitution could not handle) are not picked up.
input_files = [
    f for f in os.listdir(input_directory)
    if os.path.isfile(os.path.join(input_directory, f))
    and not f.startswith('.')
    and f.lower().endswith('.mat')
]

In [6]:
# Sort the names so the file order does not depend on the order os.listdir returned them in.
input_file_names = sorted(input_files)
# NOTE(review): this displays the whole list; a slice would keep the output short.
input_file_names

['A0001.mat',
 'A0002.mat',
 'A0003.mat',
 'A0004.mat',
 'A0005.mat',
 'A0006.mat',
 'A0007.mat',
 'A0008.mat',
 'A0009.mat',
 'A0010.mat',
 'A0011.mat',
 'A0012.mat',
 'A0013.mat',
 'A0014.mat',
 'A0015.mat',
 'A0016.mat',
 'A0017.mat',
 'A0018.mat',
 'A0019.mat',
 'A0020.mat',
 'A0021.mat',
 'A0022.mat',
 'A0023.mat',
 'A0024.mat',
 'A0025.mat',
 'A0026.mat',
 'A0027.mat',
 'A0028.mat',
 'A0029.mat',
 'A0030.mat',
 'A0031.mat',
 'A0032.mat',
 'A0033.mat',
 'A0034.mat',
 'A0035.mat',
 'A0036.mat',
 'A0037.mat',
 'A0038.mat',
 'A0039.mat',
 'A0040.mat',
 'A0041.mat',
 'A0042.mat',
 'A0043.mat',
 'A0044.mat',
 'A0045.mat',
 'A0046.mat',
 'A0047.mat',
 'A0048.mat',
 'A0049.mat',
 'A0050.mat',
 'A0051.mat',
 'A0052.mat',
 'A0053.mat',
 'A0054.mat',
 'A0055.mat',
 'A0056.mat',
 'A0057.mat',
 'A0058.mat',
 'A0059.mat',
 'A0060.mat',
 'A0061.mat',
 'A0062.mat',
 'A0063.mat',
 'A0064.mat',
 'A0065.mat',
 'A0066.mat',
 'A0067.mat',
 'A0068.mat',
 'A0069.mat',
 'A0070.mat',
 'A0071.mat',
 'A007

### Code for extracting only single-label subjects

In [7]:
# Find unique number of classes  
def get_unique_classes(input_directory,files):
    """Collect every diagnosis label that appears in the header files of ``files``.

    For each name in ``files`` the '.mat' suffix is swapped for '.hea', the header is
    read from ``input_directory``, and each line starting with '#Dx' is split on ': '
    and then on ','.

    Returns:
        Sorted list of the distinct, whitespace-stripped labels.
    """
    found = set()
    for mat_name in files:
        header_path = os.path.join(input_directory, mat_name.replace('.mat','.hea'))
        with open(header_path,'r') as header:
            dx_rows = [row for row in header if row.startswith('#Dx')]
        for row in dx_rows:
            found.update(label.strip() for label in row.split(': ')[1].split(','))
    return sorted(found)

unique_classes = get_unique_classes(input_directory, input_files)
# Map every class name to its position in the sorted class list; that position is
# the slot the class occupies in a one-hot label vector.
class2index = {label: position for position, label in enumerate(unique_classes)}
#class2index

def one_hot_encoding(one_hot_vector,y, class2index):
    """Mark class ``y`` in ``one_hot_vector`` and return the vector.

    The slot ``class2index[y]`` is set to 1 in place; the same object that was
    passed in is returned. A label missing from ``class2index`` raises KeyError.
    """
    one_hot_vector[class2index[y]] = 1
    return one_hot_vector

In [8]:
# Display the class-name -> index mapping.
class2index

{'AF': 0,
 'I-AVB': 1,
 'LBBB': 2,
 'Normal': 3,
 'PAC': 4,
 'PVC': 5,
 'RBBB': 6,
 'STD': 7,
 'STE': 8}

In [9]:
# Search for multi-label subjects
def searching_overlap(input_directory,class2index, input_file_names):
    """Find the recordings whose '#Dx' header line lists more than one label.

    Each name in ``input_file_names`` has '.mat' replaced by '.hea' and the header
    is read from ``input_directory``.

    Returns:
        A tuple of three parallel lists:
        the header file names with several labels, their multi-hot label vectors
        (length ``len(class2index)``), and the number of labels on each line.
    """
    multisubjects, multiclasses, number = [], [], []
    for mat_name in input_file_names:
        header_name = mat_name.replace('.mat','.hea')
        with open(os.path.join(input_directory, header_name),'r') as header:
            for row in header:
                if not row.startswith('#Dx'):
                    continue
                labels = row.split(': ')[1].split(',')
                if len(labels) <= 1:
                    continue  # single-label recording, nothing to report
                encoded = [0]*(len(class2index))
                for label in labels:
                    encoded = one_hot_encoding(encoded, label.strip(), class2index)
                multiclasses.append(encoded)
                multisubjects.append(header_name)
                number.append(len(labels))
    return multisubjects, multiclasses, number

In [10]:
# Header names of all recordings, and of those that carry more than one label.
classes_orig = [x.replace('.mat', '.hea') for x in input_file_names] # total subjects
classes_multi, _, _ = searching_overlap(input_directory,class2index, input_file_names)
# A set makes each membership test constant-time instead of scanning the list per recording.
_multi_label_headers = set(classes_multi)
# Keep only single-label recordings and convert back to the '.mat' names.
classes_single = [x.replace('.hea', '.mat') for x in classes_orig if x not in _multi_label_headers]

In [11]:
# Number of single-label recordings.
np.shape(classes_single)

(6401,)

In [12]:
# NOTE(review): this displays the whole list; a slice would keep the output short.
classes_single

['A0001.mat',
 'A0002.mat',
 'A0003.mat',
 'A0004.mat',
 'A0005.mat',
 'A0006.mat',
 'A0007.mat',
 'A0008.mat',
 'A0009.mat',
 'A0010.mat',
 'A0011.mat',
 'A0012.mat',
 'A0013.mat',
 'A0014.mat',
 'A0015.mat',
 'A0016.mat',
 'A0017.mat',
 'A0018.mat',
 'A0019.mat',
 'A0020.mat',
 'A0021.mat',
 'A0022.mat',
 'A0023.mat',
 'A0024.mat',
 'A0025.mat',
 'A0026.mat',
 'A0027.mat',
 'A0028.mat',
 'A0029.mat',
 'A0030.mat',
 'A0031.mat',
 'A0032.mat',
 'A0033.mat',
 'A0034.mat',
 'A0035.mat',
 'A0036.mat',
 'A0037.mat',
 'A0038.mat',
 'A0039.mat',
 'A0040.mat',
 'A0041.mat',
 'A0042.mat',
 'A0044.mat',
 'A0045.mat',
 'A0046.mat',
 'A0047.mat',
 'A0048.mat',
 'A0049.mat',
 'A0050.mat',
 'A0051.mat',
 'A0052.mat',
 'A0053.mat',
 'A0054.mat',
 'A0055.mat',
 'A0056.mat',
 'A0057.mat',
 'A0058.mat',
 'A0059.mat',
 'A0060.mat',
 'A0061.mat',
 'A0062.mat',
 'A0063.mat',
 'A0064.mat',
 'A0065.mat',
 'A0066.mat',
 'A0067.mat',
 'A0068.mat',
 'A0069.mat',
 'A0070.mat',
 'A0071.mat',
 'A0072.mat',
 'A007

In [13]:
# double-checking if classes_single have single-label:
# re-running the multi-label search on classes_single should find nothing,
# so `a` (the list of multi-label header names) is expected to be empty.
a, b, c  = searching_overlap(input_directory,class2index,classes_single)
a

[]

In [14]:
# we can safely use classes_single as input_file_names
# Shuffle a copy: random.shuffle works in place, so shuffling an alias would also
# reorder classes_single behind the reader's back.
input_file_names = list(classes_single)
random.shuffle(input_file_names)
np.shape(input_file_names)

(6401,)

### Shuffle and divide files into train/eval/test

In [15]:
# 80/20 split into (train+val)/test, then 75/25 into train/val, i.e. 60/20/20 overall.
# A fixed random_state makes both splits repeatable regardless of global RNG state.
data, data_test = train_test_split(input_file_names, test_size = 0.2, train_size = 0.8, shuffle=True, random_state=1234)
data_train, data_val = train_test_split(data, test_size = 0.25, train_size = 0.75, shuffle=True, random_state=1234)

In [16]:
# Sizes of the train / validation / test file lists.
print(np.shape(data_train), np.shape(data_val), np.shape(data_test))

(3840,) (1280,) (1281,)


### CNN Model

In [17]:
from keras.applications.densenet import DenseNet169
from keras.layers import Input, GlobalAveragePooling2D, Dense
from keras.models import Model 

# The first input dimension is the window length: training and evaluation both slice
# windows of `minimum_len` items along the first axis of a mel array, so it is taken
# from that parameter instead of being repeated as a literal.
# NOTE(review): the remaining (128, 12) presumably are mel bins and ECG leads — confirm
# against the code that produced the .npy files.
input_tensor = Input(shape=(minimum_len, 128, 12))
# DenseNet169 backbone trained from scratch (no pretrained weights, no classifier head).
base_model = DenseNet169(input_tensor=input_tensor, weights=None, include_top=False)

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
# One output unit per diagnosis class, derived from class2index rather than hard-coded.
pred = Dense(len(class2index), activation=activation_function)(x)

model = Model(inputs=base_model.input, outputs=pred)











In [18]:
# Print the layer-by-layer summary of the model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 128, 128, 12) 0                                            
__________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D (None, 134, 134, 12) 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 64, 64, 64)   37632       zero_padding2d_1[0][0]           
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 64, 64, 64)   256         conv1/conv[0][0]                 
__________________________________________________________________________________________________
conv1/relu

In [19]:
def block_feature(sequence_en, minimum_len): 
    """Return a window of exactly ``minimum_len`` items from ``sequence_en``.

    Args:
        sequence_en: any object supporting ``len()`` and slicing along its first axis.
        minimum_len: required window length.

    Returns:
        ``sequence_en`` itself when it is exactly ``minimum_len`` long, otherwise the
        slice ``sequence_en[start:start+minimum_len]`` with ``start`` drawn uniformly
        via ``random.randint``.

    Raises:
        ValueError: if ``sequence_en`` is shorter than ``minimum_len``. The previous
            version returned an empty list here, which produced a ragged batch
            further down the pipeline.
    """
    total_len = len(sequence_en)
    if total_len < minimum_len:
        raise ValueError(
            'sequence of length %d is shorter than the required window length %d'
            % (total_len, minimum_len))
    if total_len == minimum_len:  # exactly one window: nothing to choose
        return sequence_en
    # randint is inclusive on both ends, so the last valid start is total_len - minimum_len.
    start = random.randint(0, total_len - minimum_len)
    return sequence_en[start:start+minimum_len]

In [20]:
def exploratory_look(input_directory,file, class2index):
    """Print and return the raw '#Dx' entries of one recording's header file.

    Args:
        input_directory: directory containing the '.hea' header files.
        file: recording file name; '.mat' is replaced by '.hea' to find the header.
        class2index: unused; kept so existing calls keep working.

    Returns:
        The list obtained by splitting the '#Dx' line on ': ' and then ','
        (entries are not stripped, so the last one keeps its newline). If several
        '#Dx' lines exist the last one wins; if there is none an empty list is
        returned instead of failing on an unbound variable.
    """
    header_path = os.path.join(input_directory, file.replace('.mat','.hea'))
    tmp = []
    with open(header_path,'r') as header:
        for line in header:
            if line.startswith('#Dx'):
                tmp = line.split(': ')[1].split(',')
                print(tmp, len(tmp))
    return tmp

# Print the diagnosis entries of every selected recording.
# NOTE(review): this emits one line per recording; consider looping over a slice.
for file in input_file_names:
    tmp = exploratory_look(input_directory, file, class2index)

['Normal\n'] 1
['STD\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['PAC\n'] 1
['PAC\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['STE\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['PAC\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['PAC\n'] 1
['STD\n'] 1
['Normal\n'] 1
['AF\n'] 1
['STE\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['LBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['AF\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['AF\n'] 1
['PAC\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['LBBB\n'] 1
['PVC\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['LBBB\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['STE\n'] 1
['Normal\n'] 1
['LBBB\n'] 1
['STD\n'] 1
['

['AF\n'] 1
['STD\n'] 1
['PAC\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['AF\n'] 1
['Normal\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['STD\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['PVC\n'] 1
['AF\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['STD\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['LBBB\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['LBBB\n'] 1
['PVC\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['AF

['STD\n'] 1
['LBBB\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['AF\n'] 1
['PAC\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['Normal\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['PVC\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['STE\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['STD\n'] 1
['AF\n'] 1
['PAC\n'] 1
['AF\n'] 1
['PAC\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['STE\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['STD\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['STD\n'] 1
['STD\n'] 1
['PAC\n'] 1
['AF\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['STD\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['STE\n'] 1
['PAC\n'] 1
['PAC\n'] 1
[

['Normal\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['LBBB\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['AF\n'] 1
['STE\n'] 1
['STD\n'] 1
['PVC\n'] 1
['STD\n'] 1
['AF\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['Normal\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['PAC\n'] 1
['STD\n'] 1
['STD\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['STE\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['STD\n'] 1
['PAC\n'] 1
['STE\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['Normal\n'] 1
['STD\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['AF\n'] 1
['STD\

['RBBB\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['LBBB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['AF\n'] 1
['STD\n'] 1
['AF\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['PAC\n'] 1
['PVC\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['PVC\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['Normal\n'] 1
['STD\n'] 1
['AF\n'] 1
['Normal\n'] 1
['STD\n'] 1
['PAC\n'] 1
['AF\n'] 1
['AF\n'] 1
['STE\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['STE\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['STD\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['I-AVB

['PVC\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['STD\n'] 1
['Normal\n'] 1
['AF\n'] 1
['STD\n'] 1
['PAC\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['STD\n'] 1
['AF\n'] 1
['PAC\n'] 1
['STD\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['STD\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['AF\n'] 1
['STD\n'] 1
['STD\n'] 1
['PVC\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['AF\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['AF\n'] 1
['PAC\n'] 1
['LBBB\n'] 1
['LBBB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['STD\n'] 1
['AF\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['PAC\n'] 1
['PVC\n'] 1
['LBBB\n'] 1
['AF\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['

['AF\n'] 1
['AF\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['I-AVB\n'] 1
['LBBB\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['Normal\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['PVC\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['PAC\n'] 1
['LBBB\n'] 1
['AF\n'] 1
['PVC\n'] 1
['LBBB\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['AF\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['I-AVB\n'] 1
['I-AVB\n'] 1
['LBBB\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['PAC\n'] 1
['STD\n'] 1
['PVC\n'] 1
['I-AVB\n'] 1
['Normal\n'] 1
['STD\n'] 1
['N

['AF\n'] 1
['AF\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['AF\n'] 1
['STD\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['STE\n'] 1
['AF\n'] 1
['AF\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['STD\n'] 1
['AF\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['STE\n'] 1
['PVC\n'] 1
['RBBB\n'] 1
['STD\n'] 1
['Normal\n'] 1
['LBBB\n'] 1
['RBBB\n'] 1
['PVC\n'] 1
['AF\n'] 1
['STD\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['STD\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['PAC\n'] 1
['RBBB\n'] 1
['RBBB\n'] 1
['Normal\n'] 1
['Normal\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['STD\n'] 1
['Normal\n'] 1
['PAC\n'] 1
['I-AVB\n'] 1
['PAC\n'] 1
['Normal\n'] 1
['PVC\n'] 1
['Normal\n'] 1
['AF\n'] 1
['AF\n'] 1
['Normal\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['STE\n'] 1
['RBBB\n'] 1
['AF\n'] 1
['I-AVB\n'] 1
['STD\n'] 1
['PAC\n

In [21]:
# Get classes of sorted file names
def get_labels(input_directory,file, class2index):
    """Build the label vector of one recording from its header file.

    Args:
        input_directory: directory containing the '.hea' header files.
        file: recording file name; '.mat' is replaced by '.hea' to find the header.
        class2index: mapping from class name to its slot in the label vector.

    Returns:
        A list of ``len(class2index)`` zeros/ones with a 1 for every label on the
        '#Dx' line (the last such line wins if there are several).

    Raises:
        ValueError: if the header has no '#Dx' line (previously this surfaced as an
            unbound-variable error).
        KeyError: if a label is not present in ``class2index``.
    """
    header_path = os.path.join(input_directory, file.replace('.mat','.hea'))
    one_hot_vector = None
    with open(header_path,'r') as header:
        for line in header:
            if line.startswith('#Dx'):
                one_hot_vector = [0]*(len(class2index))
                for c in line.split(': ')[1].split(','):
                    one_hot_vector = one_hot_encoding(one_hot_vector, c.strip(), class2index)
    if one_hot_vector is None:
        raise ValueError("no '#Dx' line found in header file %s" % header_path)
    return one_hot_vector

In [22]:
def randextract_mels(curr_step, batch_size, data_train, mel_directory, class2index, minimum_len): # step = 0, 1, 2, 3....
    """Load one training batch of mel windows and their label vectors.

    The batch covers ``data_train[batch_size*curr_step : batch_size*(curr_step+1)]``.
    For each file the matching '.npy' array is loaded from ``mel_directory``, cut to a
    window by ``block_feature``, and its labels are read with ``get_labels`` from the
    module-level ``input_directory``.

    Returns:
        (mel_files, classes): two lists of equal length, in batch order.
    """
    first = batch_size*curr_step
    last = batch_size*(curr_step+1)
    mel_files, classes = [], []
    for name in data_train[first:last]:
        npy_path = '/'.join((mel_directory, name.replace('.mat', '.npy')))
        mel_files.append(block_feature(np.load(npy_path), minimum_len))
        classes.append(get_labels(input_directory, name, class2index))
    return mel_files, classes

In [23]:
def randextract_mels_val(curr_range_start, curr_range_end, data_val, mel_directory, class2index, minimum_len): # step = 0, 1, 2, 3....
    """Load mel windows and label vectors for an explicit slice of ``data_val``.

    The slice bounds are converted with ``int()`` before use. For each file the
    matching '.npy' array is loaded from ``mel_directory``, cut to a window by
    ``block_feature``, and its labels are read with ``get_labels`` from the
    module-level ``input_directory``.

    Returns:
        (mel_files, classes): two lists of equal length, in slice order.
    """
    lo, hi = int(curr_range_start), int(curr_range_end)
    mel_files, classes = [], []
    for name in data_val[lo:hi]:
        npy_path = '/'.join((mel_directory, name.replace('.mat', '.npy')))
        mel_files.append(block_feature(np.load(npy_path), minimum_len))
        classes.append(get_labels(input_directory, name, class2index))
    return mel_files, classes

In [24]:
# Compile with the loss chosen in the parameter cell, RMSprop at a 1e-4 learning rate,
# and accuracy as the only extra metric (so train_on_batch returns [loss, acc]).
model.compile(loss=loss_function,
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])




In [25]:
def train(data_train, mel_directory, batch_size, class2index, minimum_len): 
    """Run one pass over ``data_train`` on the module-level ``model``.

    The data is consumed in consecutive batches of ``batch_size`` files (the last
    batch may be smaller); each batch is loaded with ``randextract_mels`` and fed to
    ``model.train_on_batch``.

    Returns:
        (loss, acc): the unweighted mean of the per-batch loss and accuracy, or
        (nan, nan) when ``data_train`` is empty.
    """
    loss = []
    acc = []

    total_steps = int(np.ceil(len(data_train)/batch_size))
    for curr_step in range(total_steps):
        batch_mels, batch_labels = randextract_mels(curr_step, batch_size, data_train, mel_directory, class2index, minimum_len)
        batch_mels = np.asarray(batch_mels)
        # The label list is already (batch, n_classes). It must not be squeezed:
        # squeezing would collapse a one-sample batch to a 1-D vector.
        batch_labels = np.asarray(batch_labels)
        train_loss_tmp = model.train_on_batch(batch_mels, batch_labels)
        loss.append(train_loss_tmp[0])
        acc.append(train_loss_tmp[1])

    if not loss:  # nothing to train on
        return np.nan, np.nan
    return np.mean(np.array(loss)), np.mean(np.array(acc))

In [26]:
def validation(data, epochs, mel_directory, minimum_len,ct, per_val): 
    """Exact-match accuracy of the module-level ``model`` on one slice of ``data``.

    Steps per file:
      1. load the mel array from ``mel_directory``;
      2. split it into consecutive blocks of ``minimum_len`` (the tail is dropped);
      3. run ``model.predict`` on every block;
      4. average the block outputs per class and binarise at 0.5;
      5. compare with the true label vector from ``get_labels``.

    Args:
        data: list of recording file names.
        epochs: unused; kept so existing calls keep working.
        mel_directory: directory holding the '.npy' mel arrays.
        minimum_len: block length.
        ct: 1-based index of the slice to evaluate.
        per_val: number of files per slice; the slice is
            ``data[(ct-1)*per_val : ct*per_val]``.

    Returns:
        Fraction of files whose binarised prediction equals the true label vector
        exactly, or nan when the slice is empty.

    Raises:
        ValueError: if a file is shorter than ``minimum_len`` (no block to predict on).
    """
    curr_range_start = (ct-1)*per_val
    curr_range_end = ct*per_val
    curr_file_indices = data[curr_range_start:curr_range_end]
    if len(curr_file_indices) == 0:
        return np.nan

    accuracy = np.zeros(len(curr_file_indices))
    for i, file in enumerate(curr_file_indices):
        tmp_file = np.load(mel_directory + '/' + file.replace('.mat', '.npy'))
        steps = tmp_file.shape[0] // minimum_len
        if steps == 0:
            raise ValueError('%s has %d frames, fewer than the block length %d'
                             % (file, tmp_file.shape[0], minimum_len))

        block_outputs = []
        for block in range(steps):  # predict block by block
            start = block*minimum_len
            end = (block+1)*minimum_len
            curr_block = np.expand_dims(tmp_file[start:end], 0)
            block_outputs.append(model.predict(curr_block))  # output of the final activation

        # Stack to (steps, n_classes) before averaging. Concatenating (rather than
        # squeezing) keeps the class axis even when the file yields a single block.
        average = np.concatenate(block_outputs, axis=0).mean(axis=0)

        # Sanity-check the raw scores before they are binarised.
        if np.any((average < 0) | (average > 1)):
            print('Error: output of sigmoid should be between 0 and 1')

        # NOTE(review): with a softmax head no class may reach 0.5, giving an all-zero
        # prediction — confirm the 0.5 threshold is the intended decision rule.
        predicted = (average >= 0.5).astype(average.dtype)

        true_label = get_labels(input_directory, file, class2index)
        if np.array_equal(predicted, true_label):  # correct only on an exact match
            accuracy[i] = 1
    avg_acc = np.mean(accuracy)
    return avg_acc

In [27]:
def test(data, minimum_len, model): # valid에서 이 함수 부를 떄는 ct 인자로 넣어주기
    # 1. 파일 읽어오기
    # 2. 128 길이의 block들로 쪼개기 (끄트머리 버리기-> np.floor)
    # 3. 각 block -> model.predict -> logit 가져오기 
    # 4. Block들간 평균내서 1 or 0 부여
    # 5. 실제 label과 비교 

    curr_range_start = 0
    curr_range_end = len(data)
    curr_file_indices = data[curr_range_start:curr_range_end]
    scores = []
    predicted_labels=[]
    accuracy=np.zeros(len(curr_file_indices))
    for i, file in enumerate(curr_file_indices):
        tmp_file = np.load(mel_directory + '/' + file.replace('.mat', '.npy'))
        steps = int(np.floor(tmp_file.shape[0]/minimum_len))
        logits=[]
        for block in range(steps): # 128개씩 쪼갠 블럭 단위로 predict
            start = block*minimum_len
            end = (block+1)*minimum_len
            curr_block = tmp_file[start:end]
            curr_block = np.expand_dims(curr_block,0)

            logit = model.predict(curr_block) # sigmoid 거쳐서 나온 것
            logits.append(logit)

        average = np.mean(np.squeeze(logits), axis=0)
        scores.append(average)
        average[average>=0.5] = 1
        average[average<0.5] = 0
        
        if average[average<0]:
            print('Error: output of sigmoid should be between 0 and 1')
        predicted_labels.append(average)    
        true_label = get_labels(input_directory, file, class2index)
        if np.array_equal(average, true_label): # predicted와 실제 label이 아예 같아야만 정답으로 인정
            accuracy[i] = 1

    avg_acc = np.mean(accuracy)
    return avg_acc, scores, predicted_labels

In [None]:
ct = 0
# NOTE: val_loss_sum stays empty — validation() only returns an accuracy.
val_loss_sum = []
val_acc_sum=[]
train_loss_sum=[]
train_acc_sum=[]

# Each epoch is validated on its own slice of data_val, so the validation set is
# divided into `epochs` slices of per_val files each.
per_val = int(np.floor(len(data_val)/epochs))
for num_epoch in range(epochs):
    ct+=1  # 1-based epoch number; also selects the validation slice
    train_loss, train_acc = train(data_train, mel_directory, batch_size, class2index, minimum_len)
    print('\nEpoch',num_epoch+1,'train_loss:',f'{train_loss:.3f}','train_acc:',f'{train_acc:.3f}',"\t", dt.datetime.now())
    # Name the checkpoint with the same 1-based epoch number that is printed above,
    # so the checkpoint of the final epoch is "ecg_mel_E<epochs>L...".
    model_output = "ecg_mel_E%02dL%.2f" % (ct, train_loss)
    save_name = os.path.join(results_directory, model_output)
    val_acc = validation(data_val, epochs, mel_directory, minimum_len, ct, per_val)
    print('\nValidation', ct,'valid_acc:',f'{val_acc:.3f}',"\t", dt.datetime.now()) 
    val_acc_sum.append(val_acc)
    train_loss_sum.append(train_loss)
    train_acc_sum.append(train_acc)

    model.save(save_name)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1 train_loss: 1.807 train_acc: 0.359 	 2020-04-13 07:59:55.167520





Validation 1 valid_acc: 0.000 	 2020-04-13 07:59:57.599853

Epoch 2 train_loss: 1.440 train_acc: 0.505 	 2020-04-13 08:01:42.152629

Validation 2 valid_acc: 0.167 	 2020-04-13 08:01:42.589537

Epoch 3 train_loss: 1.299 train_acc: 0.553 	 2020-04-13 08:02:02.327164

Validation 3 valid_acc: 0.000 	 2020-04-13 08:02:03.019154

Epoch 4 train_loss: 1.201 train_acc: 0.587 	 2020-04-13 08:02:22.894918

Validation 4 valid_acc: 0.000 	 2020-04-13 08:02:23.643164

Epoch 5 train_loss: 1.110 train_acc: 0.619 	 2020-04-13 08:02:43.586664

Validation 5 valid_acc: 0.500 	 2020-04-13 08:02:44.129433

Epoch 6 train_loss: 1.046 train_acc: 0.648 	 2020-04-13 08:03:04.028161

Validation 6 valid_acc: 0.167 	 2020-04-13 08:03:04.452675

Epoch 7 train_loss: 0.974 train_acc: 0.665 	 2020-04-13 08:03:24.378079

Validation 7 valid_acc: 0.500 	 2020-04-13 08:03:24.786508

Epoch 8 train_loss: 0.911 train_acc: 0.689 	 2020-04-13 08:03:44.673988

Validation 8 valid_acc: 0.333 	 2020-04-13 08:03:45.100106

Epoch 9 

In [None]:
# Use the number of recorded epochs rather than the configured `epochs`, so the
# curves can also be drawn after an interrupted run.
x_range = range(len(train_acc_sum))
plt.plot(x_range, train_acc_sum, 'bo', label='Training Accuracy')
plt.plot(range(len(val_acc_sum)), val_acc_sum, 'ro', label='Validation Accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.figure()
plt.plot(range(len(train_loss_sum)), train_loss_sum, 'bo', label='Training Loss')
# A validation loss is drawn only when one value per epoch was recorded; plotting an
# empty or shorter list against the epoch axis would raise a shape error.
if len(val_loss_sum) == len(train_loss_sum):
    plt.plot(range(len(val_loss_sum)), val_loss_sum, 'ro', label='Validation Loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

### Now, test with test data

In [None]:
# Scoring:

# For each 12-lead ECG recording, your algorithm must identify a set of one or more classes as well as a probability or confidence score for each class. 

# Example: 
# #Record ID
#  AF, I-AVB, LBBB, Normal, RBBB, PAC,  PVC,  STD, STE
#   1,     1,    0,      0,    0,   0,   0,     0,   0
# 0.9,   0.6,  0.2,   0.05,  0.2, 0.35, 0.35, 0.1, 0.1

Since some recordings may have multiple labels, we normalize their contributions to these scoring functions so that each recording, not each class, makes an equal contribution. For example, if a recording has six classes, and your classifier identifies three labels correctly, identifies one label incorrectly, and misses two labels, then we increment the true positives by 3/6, false positives by 1/6, and false negatives by 2/6 for both scoring functions.

In [None]:
# def test(data_test, mel_directory, class2index, minimum_len, model): 
    
#     metrics = []
#     batch_mels, batch_labels = randextract_mels_val(0, len(data_test)-1, data_val, mel_directory, class2index, minimum_len)
#     # although it is named randextract_mels_val, the same function can be used for the test set
#     batch_mels = np.asarray(batch_mels)
#     batch_labels = np.asarray(np.squeeze(batch_labels))
#     test_loss_tmp = model.test_on_batch(batch_mels, batch_labels)
#     loss = test_loss_tmp[0]
#     acc = test_loss_tmp[1]
# #     metrics.append(test_loss_tmp)

# #     metrics = np.mean(np.array(metrics))
#     return loss, acc

In [None]:
# {'AF': 0,
#  'I-AVB': 1,
#  'LBBB': 2,
#  'Normal': 3,
#  'PAC': 4,
#  'PVC': 5,
#  'RBBB': 6,
#  'STD': 7,
#  'STE': 8}
# Class names ordered by their index in the label vector. class2index maps
# name -> index, so the names are obtained by sorting the keys on their index
# (indexing the dict with integers would raise KeyError).
classes = sorted(class2index, key=class2index.get)
# def save_challenge_predictions(output_directory,filename,scores,labels,classes):
def save_challenge_predictions(data, scores, predicted_labels, classes, output_directory=None):
    """Write one prediction CSV per recording.

    Each file is named after the recording ('.mat' replaced by '.csv') and holds
    four lines: '#<recording>', the comma-separated class names, the predicted
    0/1 labels, and the per-class scores.

    Args:
        data: recording file names.
        scores: per-recording sequences of class scores, aligned with ``data``.
        predicted_labels: per-recording sequences of 0/1 labels, aligned with ``data``.
        classes: class names (strings) in label-vector order.
        output_directory: where to write the files; defaults to the module-level
            ``results_directory``. The directory is created if missing.
    """
    if output_directory is None:
        output_directory = results_directory
    os.makedirs(output_directory, exist_ok=True)

    class_string = ','.join(classes)  # identical for every recording
    for i, filename in enumerate(data):
        curr_score = scores[i]
        curr_label = predicted_labels[i]
        new_file = filename.replace('.mat','.csv')
        output_file = os.path.join(output_directory,new_file)
        recording = new_file.replace('.csv', '')
        # Include the filename as the recording number
        recording_string = '#{}'.format(recording)
        # Labels are written as integers (1/0) even when they arrive as floats.
        label_string = ','.join(str(int(value)) for value in curr_label)
        score_string = ','.join(str(value) for value in curr_score)

        with open(output_file, 'w') as f:
            f.write(recording_string + '\n' + class_string + '\n' + label_string + '\n' + score_string + '\n')

In [None]:
# load model
rootdir = '/home/taejoon/PhysioNetChallenge'
# Checkpoints live in <rootdir>/results (where the training loop saves them);
# the prediction CSVs go to the 'csvfiles' sub-directory.
model_directory = os.path.join(rootdir, 'results')
results_directory = os.path.join(rootdir, 'results', 'csvfiles')
os.makedirs(results_directory, exist_ok=True)

# load_model needs a concrete path (it does not expand '*'), and the checkpoint name
# embeds the training loss, so the file is located by its prefix. The checkpoint with
# the highest epoch number is used.
checkpoint_prefix = 'ecg_mel_E'
checkpoints = [name for name in os.listdir(model_directory)
               if name.startswith(checkpoint_prefix)
               and os.path.isfile(os.path.join(model_directory, name))]
if not checkpoints:
    raise FileNotFoundError('no %s* checkpoint found in %s' % (checkpoint_prefix, model_directory))
# Names look like "ecg_mel_E<epoch>L<loss>": the epoch is the text between the prefix and 'L'.
latest_checkpoint = max(checkpoints, key=lambda name: int(name[len(checkpoint_prefix):].split('L')[0]))
# The model was built and saved with standalone Keras, so it is loaded with the same package.
model = models.load_model(os.path.join(model_directory, latest_checkpoint))
avg_acc, scores, predicted_labels = test(data_test, minimum_len, model)

#     print('\nTest result: loss:',f'{test_metrics[0]:.3f}','accuracy:', f'{test_metrics[1]:.3f}',"\t", dt.datetime.now())



