In [None]:
import librosa
import librosa.display
import os
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import datetime as dt
from keras import layers
from keras import models
from keras import optimizers
from sklearn.model_selection import train_test_split

In [None]:
# Seed every random number generator this notebook relies on so that runs are
# repeatable: `random` picks the crop offset in block_feature, NumPy's global
# RNG is what sklearn's train_test_split falls back to when no random_state is
# given, and TensorFlow has its own graph-level seed.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
if hasattr(tf, 'set_random_seed'):
    # TensorFlow 1.x API (what this notebook was written against).
    tf.set_random_seed(SEED)
else:
    # TensorFlow 2.x moved the function to tf.random.set_seed.
    tf.random.set_seed(SEED)

In [None]:
# Training hyper-parameters shared by the cells below.
epochs = 200        # number of passes of the outer training loop
batch_size = 32     # files loaded per training step
minimum_len = 128   # length each clip is cut to; also the model's input height/width

### Getting file names

In [None]:
# Root folder of the challenge data. It defaults to the original location but
# can be redirected by setting the PHYSIONET_ROOT environment variable, so the
# notebook is not tied to a single machine.
rootdir = os.environ.get('PHYSIONET_ROOT', '/home/taejoon/PhysioNetChallenge')
input_directory = os.path.join(rootdir, 'Training_WFDB')   # .mat / .hea records
mel_name = 'Mel_data_20200402_128'
mel_directory = os.path.join(rootdir, mel_name)            # .npy mel features

# makedirs(exist_ok=True) replaces the check-then-mkdir pair: it has no race
# between the check and the creation, and it also creates missing parents.
for directory in (input_directory, mel_directory):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Collect the record files: regular, non-hidden files with a `.mat` extension.
# The extension test includes the dot so that names which merely end in the
# letters "mat" are not picked up (later cells rewrite '.mat' to '.hea'/'.npy').
input_files = [
    f for f in os.listdir(input_directory)
    if os.path.isfile(os.path.join(input_directory, f))
    and not f.startswith('.')
    and f.lower().endswith('.mat')
]

In [None]:
# Work on a sorted copy so the file order does not depend on os.listdir.
input_file_names = list(input_files)
input_file_names.sort()
input_file_names

### Shuffle and divide files into train/eval/test

In [None]:
# 60/20/20 train/validation/test split on file names.
# A fixed random_state makes the split identical on every run; without it the
# membership of each subset changes whenever the notebook is re-executed, so
# validation/test numbers are not comparable between runs.
data, data_test = train_test_split(input_file_names, test_size = 0.2, train_size = 0.8,
                                   shuffle=True, random_state=1234)
# 0.25 of the remaining 80% gives 20% of the whole set for validation.
data_train, data_val = train_test_split(data, test_size = 0.25, train_size = 0.75,
                                        shuffle=True, random_state=1234)

In [None]:
# Sanity check: sizes of the train / validation / test file lists.
print(*(np.shape(subset) for subset in (data_train, data_val, data_test)))

### Preprocess labels (one-hot encoding)

In [None]:
# Collect the set of distinct diagnosis labels
def get_unique_classes(input_directory,files):
    """Return the sorted list of distinct diagnosis labels found in the headers.

    For every name in `files` the matching header (same name with '.mat'
    replaced by '.hea') is read from `input_directory`. Each line starting with
    '#Dx' contributes the comma-separated, whitespace-stripped entries that
    follow the first ': '.
    """
    labels = set()
    for mat_name in files:
        header_path = os.path.join(input_directory, mat_name.replace('.mat', '.hea'))
        with open(header_path, 'r') as header:
            dx_rows = [row for row in header if row.startswith('#Dx')]
        for row in dx_rows:
            labels.update(code.strip() for code in row.split(': ')[1].split(','))

    return sorted(labels)

unique_classes = get_unique_classes(input_directory, input_files)
# Map each label to its position in the sorted label list; that position is the
# column set to 1 in the one-hot target vector.
class2index = {label: position for position, label in enumerate(unique_classes)}

def one_hot_encoding(y, class2index):
    """Return a list of len(class2index) zeros with a 1 at index class2index[y].

    Raises KeyError when `y` is not a key of `class2index`.
    """
    encoded = [0 for _ in class2index]
    encoded[class2index[y]] = 1
    return encoded


In [None]:
# Display the label -> column-index mapping used for the one-hot targets.
class2index

In [None]:
# # Checking which x is minimum
# minimum = 300
# for file in input_file_names:
#     tmp_file = np.load(mel_directory + '/' + file.replace('.mat', '.npy'))
#     print(np.shape(tmp_file))
#     if len(tmp_file) < minimum:
#         minimum = tmp_file.shape[0]
# #print(minimum)

In [None]:
# classes= np.asarray(classes)    

In [None]:
# mel_files = np.asarray(mel_files)

In [None]:

# dataset = dataset.batch(batch_size)
# dataset

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# x, x_test, y, y_test  = train_test_split(mel_files, classes, test_size=0.2, train_size = 0.8)
# x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.25, train_size = 0.75)

In [None]:
# print(np.shape(x_train), np.shape(x_val), np.shape(x_test), np.shape(y_train), np.shape(y_val), np.shape(y_test))

### CNN Model

In [None]:
# Three blocks of (Conv, Conv, MaxPool) followed by a dense classifier.
# Input: one clip of shape (minimum_len, minimum_len, 12).
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(minimum_len, minimum_len, 12)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
# The output width must equal the length of the one-hot targets, which is
# len(class2index); deriving it here (instead of hard-coding 9) keeps the model
# and the labels in agreement if the set of diagnoses in the data changes.
model.add(layers.Dense(len(class2index), activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

In [None]:
def block_feature(sequence_en, minimum_len): 
    """Return a clip of exactly `minimum_len` items along the first axis.

    * Longer input: a window of `minimum_len` consecutive items is taken at a
      random offset (uses the `random` module, so seed it for repeatability).
    * Input of exactly `minimum_len`: returned unchanged (same object).
    * Shorter input: converted to a NumPy array and zero-padded at the end of
      the first axis up to `minimum_len`. Previously this case fell through a
      vacuous assert and returned an empty list, which produced ragged batches.
    """
    length = len(sequence_en)
    if length > minimum_len:  # longer than the target: random crop
        start = random.randint(0, length - minimum_len)
        return sequence_en[start:start + minimum_len]
    if length == minimum_len:  # already the target length
        return sequence_en

    # Shorter than the target: pad only the first axis, leave the others alone.
    padded = np.asarray(sequence_en)
    pad_width = [(0, minimum_len - length)] + [(0, 0)] * (padded.ndim - 1)
    return np.pad(padded, pad_width, mode='constant')

In [None]:
# Read the one-hot label(s) for a single record
def get_labels(input_directory,file, class2index):
    """Return a list with one one-hot vector per '#Dx' line of the record header.

    The header path is `file` with '.mat' replaced by '.hea', inside
    `input_directory`. Every comma-separated diagnosis on a '#Dx' line is
    encoded with one_hot_encoding (so an unknown label raises KeyError), but
    only the encoding of the LAST diagnosis on the line is kept.
    """
    header_path = os.path.join(input_directory, file.replace('.mat', '.hea'))
    classes = []
    with open(header_path, 'r') as header:
        for row in header:
            if not row.startswith('#Dx'):
                continue
            encoded = [one_hot_encoding(code.strip(), class2index)
                       for code in row.split(': ')[1].split(',')]
            # NOTE(review): records with several diagnoses are reduced to the
            # last one listed; confirm this is intended before relying on it.
            classes.append(encoded[-1])

    return classes

In [None]:
def randextract_mels(curr_step, batch_size, data_train, mel_directory, class2index, minimum_len): # step = 0, 1, 2, 3....
    """Load the clips and labels for training step `curr_step`.

    The batch is data_train[curr_step*batch_size : (curr_step+1)*batch_size];
    the last batch may be shorter. For each file name the '.npy' feature file
    is loaded from `mel_directory`, cut to `minimum_len` with block_feature,
    and paired with the result of get_labels.

    Returns (mel_files, classes): two lists with one entry per file.
    """
    start = batch_size * curr_step
    # Slice ends are exclusive, so no "- 1" here: the previous
    # `batch_size*(curr_step+1)-1` silently dropped the last file of every batch.
    end = start + batch_size

    mel_files = []
    classes = []
    for file in data_train[start:end]:
        mel_path = os.path.join(mel_directory, file.replace('.mat', '.npy'))
        mel_files.append(block_feature(np.load(mel_path), minimum_len))

        # Headers are read from the module-level `input_directory`
        # (it is not a parameter of this function).
        classes.append(get_labels(input_directory, file, class2index))
    return mel_files, classes

In [None]:
def randextract_mels_val(curr_range_start, curr_range_end, data_val, mel_directory, class2index, minimum_len): # step = 0, 1, 2, 3....
    """Load the clips and labels for data_val[curr_range_start:curr_range_end].

    The bounds follow normal slice rules (end is exclusive) and may be Python
    ints, NumPy integers or integral floats; they are converted with int().
    The previous `.astype(int)` only worked for NumPy scalars and raised
    AttributeError for plain Python ints.

    For each file name the '.npy' feature file is loaded from `mel_directory`,
    cut to `minimum_len` with block_feature, and paired with the result of
    get_labels.

    Returns (mel_files, classes): two lists with one entry per file.
    """
    start = int(curr_range_start)
    end = int(curr_range_end)

    mel_files = []
    classes = []
    for file in data_val[start:end]:
        mel_path = os.path.join(mel_directory, file.replace('.mat', '.npy'))
        mel_files.append(block_feature(np.load(mel_path), minimum_len))

        # Headers are read from the module-level `input_directory`
        # (it is not a parameter of this function).
        classes.append(get_labels(input_directory, file, class2index))
    return mel_files, classes

In [None]:
# Softmax output + one-hot targets -> categorical cross-entropy; accuracy is
# tracked under the name 'acc'.
rmsprop = optimizers.RMSprop(lr=1e-4)
model.compile(optimizer=rmsprop,
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
def train(data_train, mel_directory, batch_size, class2index, minimum_len): 
    """Run one pass over `data_train` and return the mean (loss, acc).

    The file list is consumed in ceil(len(data_train)/batch_size) steps. Each
    step loads its clips and labels with randextract_mels and calls
    train_on_batch on the module-level `model`, which returns [loss, acc] for
    the metrics configured at compile time. The returned values are the plain
    means of the per-batch losses and accuracies.
    """
    loss = []
    acc = []

    total_steps = int(np.ceil(len(data_train)/batch_size))
    for curr_step in range(total_steps):
        batch_mels, batch_labels = randextract_mels(curr_step, batch_size, data_train, mel_directory, class2index, minimum_len)
        if len(batch_mels) == 0:
            # Nothing to train on for this step.
            continue
        batch_mels = np.asarray(batch_mels)
        # Each label arrives as [[one-hot]], i.e. the stacked labels have a
        # singleton axis 1. Squeeze only that axis: a bare np.squeeze would also
        # drop the batch axis when the (last) batch holds a single sample.
        batch_labels = np.squeeze(np.asarray(batch_labels), axis=1)
        train_loss_tmp = model.train_on_batch(batch_mels, batch_labels)
        loss.append(train_loss_tmp[0])
        acc.append(train_loss_tmp[1])

    loss = np.mean(np.array(loss))
    acc = np.mean(np.array(acc))
    return loss, acc

In [None]:
def validation(ct, data_val, mel_directory, class2index, minimum_len, epochs): 
    """Evaluate the module-level `model` on the `ct`-th slice of `data_val`.

    The training loop validates once every 20 epochs, i.e. epochs // 20 times
    in total, so `data_val` is divided into that many equal slices and call
    number `ct` (1-based) evaluates slice `ct` in a single test_on_batch call.

    Returns (loss, acc) as two one-element lists, the shape existing callers
    receive.

    Raises ValueError when the selected slice is empty (fewer validation files
    than validation rounds, or `ct` out of range).
    """
    num_validations = max(int(epochs) // 20, 1)
    per_val = len(data_val) // num_validations

    # Slice ends are exclusive, so the end is ct*per_val (the former "- 1"
    # dropped the last file of every slice). NumPy integers are passed because
    # randextract_mels_val may call `.astype(int)` on its bounds.
    curr_range_start = np.int64((ct - 1) * per_val)
    curr_range_end = np.int64(ct * per_val)

    batch_mels, batch_labels = randextract_mels_val(curr_range_start, curr_range_end, data_val, mel_directory, class2index, minimum_len)
    if len(batch_mels) == 0:
        raise ValueError(
            'validation slice %d is empty (%d validation files, %d rounds)'
            % (ct, len(data_val), num_validations))

    batch_mels = np.asarray(batch_mels)
    # Labels arrive as [[one-hot]] per file; squeeze only the singleton axis 1
    # so a one-sample slice keeps its batch axis.
    batch_labels = np.squeeze(np.asarray(batch_labels), axis=1)
    val_loss_tmp = model.test_on_batch(batch_mels, batch_labels)

    loss = [val_loss_tmp[0]]
    acc = [val_loss_tmp[1]]
    return loss, acc

In [None]:
# Main loop: every 20th epoch is a validation round (no training happens on
# that epoch); all other epochs run one training pass. The model is saved after
# every epoch.
ct = 0  # 1-based index of the validation round, selects the validation slice
for num_epoch in range(epochs):
    if (num_epoch+1)%20 == 0: # Validation for every 20 epochs
        ct += 1
        val_loss, val_acc = validation(ct,data_val, mel_directory, class2index, minimum_len, epochs)
        # validation() hands back lists; reduce them to scalars so they can be
        # formatted with ':.3f'.
        val_loss = float(np.mean(val_loss))
        val_acc = float(np.mean(val_acc))
        # Integer division: a Python float has no .astype(), and the printed
        # names must match the variables assigned above (valid_loss/valid_acc
        # were undefined).
        curr = (num_epoch+1)//20
        print('\nValidation', curr,'valid_loss:',f'{val_loss:.3f}', 'valid_acc:',f'{val_acc:.3f}',"\t", dt.datetime.now())
    else: 
        train_loss, train_acc = train(data_train, mel_directory, batch_size, class2index, minimum_len)
        print('\nEpoch',num_epoch+1,'train_loss:',f'{train_loss:.3f}','train_acc:',f'{train_acc:.3f}',"\t", dt.datetime.now())
    
    model.save('MEL.h5')    

### Now, test with test data

In [None]:
def test(data_test, mel_directory, class2index, minimum_len): 
    """Evaluate the module-level `model` on all of `data_test` in one batch.

    Returns a single scalar: the mean over the values returned by
    test_on_batch (loss and accuracy for the metrics configured at compile
    time). The scalar return is kept so existing callers that format it with
    ':.3f' continue to work.

    NOTE(review): averaging loss and accuracy is not a meaningful metric;
    consider returning them separately and updating the caller.
    """
    metrics = []
    # Although named randextract_mels_val, the same helper works for test data.
    # - the file list passed is data_test (it used to be data_val, so the test
    #   set was never actually evaluated);
    # - the end bound is len(data_test) because slice ends are exclusive;
    # - NumPy integers are passed because the helper may call `.astype(int)`.
    batch_mels, batch_labels = randextract_mels_val(np.int64(0), np.int64(len(data_test)), data_test, mel_directory, class2index, minimum_len)
    batch_mels = np.asarray(batch_mels)
    # Labels arrive as [[one-hot]] per file; squeeze only the singleton axis 1
    # so a one-sample test set keeps its batch axis.
    batch_labels = np.squeeze(np.asarray(batch_labels), axis=1)
    test_loss_tmp = model.test_on_batch(batch_mels, batch_labels)
    metrics.append(test_loss_tmp)

    metrics = np.mean(np.array(metrics))
    return metrics

In [None]:
# Evaluate on the held-out test files and report the scalar returned by test().
test_metrics = test(data_test, mel_directory, class2index, minimum_len)
formatted_metrics = format(test_metrics, '.3f')
print('\nTest result: test_metrics:', formatted_metrics, "\t", dt.datetime.now())

In [None]:
# history = model.fit(
#       mel_files, classes)

In [None]:
# from keras import optimizers
# model.compile(loss='categorical_crossentropy',
#               optimizer=optimizers.RMSprop(lr=1e-4),
#               metrics=['acc'])
# nepochs=1000
# model.fit(x_train, y_train, batch_size=batch_size, epochs=nepochs, validation_data=(x_val, y_val), verbose=2)
# model.save('ECG1.h5')