# Prerequisites


In [None]:
!unzip pre_saved_assests/mitdb.zip 

Install WFDB to read files

Run these two commands in terminal:

source venv/bin/activate

pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import wfdb
import glob 
from scipy import signal
from PIL import Image
import cv2
import random
from sklearn.model_selection import (train_test_split, cross_val_score, KFold)
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib
import numpy as np
import time
from tqdm import tqdm
from matplotlib import gridspec
import keras.backend as K
from PIL import Image
import gc
import cv2
from sklearn.metrics import confusion_matrix
import os.path
from os import path

# Splitting


Global variables used for splitting functionality.

In [None]:
# Empty DataFrame placeholder; nothing in the visible notebook reads it.
df = pd.DataFrame()

# Beat-annotation symbols that build_XY flags as abnormal (Y = 1).
abnormal = ['L','R','V','/','A','f','F','j','a','E','J','e','S']
# Beat-annotation symbols regarded as normal; not referenced by the visible code.
normal = ['.','N']
# Rhythm-annotation string for atrial fibrillation; not referenced by the visible code.
af = "(AFIB"

# Rhythm annotations of interest; not referenced by the visible code.
allowed = ['(AFIB','(N', '']

# Record names: the default `samples` of make_dataset and the pool that
# split_by_patient divides into training and validation records.
pts = ['100', '101', '102', '103', '104', '105', '106', '107',
       '108', '109', '111', '112', '113', '114', '115', '116',
       '117', '118', '119', '121', '122', '123', '124', '200',
       '201', '202', '203', '205', '207', '208', '209', '210',
       '212', '213', '214', '215', '217', '219', '220', '221',
       '222', '223', '228', '230', '231', '232', '233', '234']

In [None]:
def test_symbols(data_path):

    for file in glob.iglob(f'{data_path}/*.hea'):
        thing = file[0:9]
        if thing == 'mitdb/201':
            record = wfdb.rdrecord(thing)
            annotation = wfdb.rdann(thing,'atr')
            sym = annotation.symbol
            print(len(annotation.aux_note))

def load_ecg(file):
    '''Loads one ECG record together with its 'atr' annotations.

        Parameters
        ----------
        file : str
            Record path handed to wfdb.rdrecord / wfdb.rdann
            (make_dataset passes data_path + record name, no extension)

        Returns
        ------
        p_signal
            ``p_signal`` attribute of the wfdb record (the physical signal)
        sym
            ``symbol`` attribute of the annotation (beat annotations)
        samp
            ``sample`` attribute of the annotation (annotation locations)
        aux_note
            ``aux_note`` attribute of the annotation (rhythm annotations)

        References
        ----------
        https://towardsdatascience.com/detecting-heart-arrhythmias-with-deep-learning-in-keras-with-dense-cnn-and-lstm-add337d9e41f
    '''
    ecg_record = wfdb.rdrecord(file)
    annotation = wfdb.rdann(file, 'atr')

    return (
        ecg_record.p_signal,
        annotation.symbol,
        annotation.sample,
        annotation.aux_note,
    )

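# make_dataset builds a dataset from a database directory, a window length and the sampling frequency.
# It returns two aligned collections:
# X_all  = the segmented ECG signals
# labels = integer rhythm labels produced by convert_to_label
# (the per-beat binary labels and beat symbols computed by build_XY are not returned)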


def make_dataset(data_path,num_sec, fs, samples=pts, exclude_other=True):
    '''Makes a segmented dataset from a set of ECG records

        Parameters
        ----------
        data_path : str
            Path to the database directory (trailing slash optional)

        num_sec : int
            number of seconds to sample on either side of an annotation

        fs : int
            sampling frequency. MUST be equal to sampling frequency of database

        samples : list
            Record names to be included in the dataset

        exclude_other : bool
            Deprecated; only forwarded to build_XY

        Returns
        ------
        X_all : np.ndarray
            Segmented ECGs, one row of 2*num_sec*fs points per kept annotation
        labels : list
            Integer rhythm labels as produced by convert_to_label

        Notes
        -----
        convert_to_label emits nothing for leading entries that come before
        the first rhythm annotation, so ``labels`` can be shorter than
        ``X_all`` if a record's first aux_note is empty.

        References
        ----------
        https://towardsdatascience.com/detecting-heart-arrhythmias-with-deep-learning-in-keras-with-dense-cnn-and-lstm-add337d9e41f
    '''
    num_cols = 2*num_sec * fs
    segments = []      # one (rows, num_cols) array per record
    rhythm_all = []

    for patient in samples:
        # os.path.join works whether or not data_path ends with a separator.
        record_path = os.path.join(data_path, patient)
        p_signal, sym, samp, aux_note = load_ecg(record_path)

        # Only the first channel of the recording is used.
        p_signal = p_signal[:,0]
        df_ann = pd.DataFrame({'atr_sym':sym, 'atr_sample':samp, 'aux_note':aux_note})

        X, _, _, rhythm = build_XY(p_signal, df_ann, num_cols, num_sec, fs, exclude_other)

        # Seed the first kept segment with the record's first rhythm note so
        # convert_to_label starts every record from that record's own rhythm
        # instead of carrying over the previous record's state.
        if len(rhythm) > 0 and len(aux_note) > 0:
            rhythm[0] = str(aux_note[0])

        rhythm_all.extend(rhythm)
        segments.append(X)

    # Concatenate once instead of growing the array inside the loop.
    if segments:
        X_all = np.concatenate(segments, axis=0)
    else:
        X_all = np.zeros((0, num_cols))

    return X_all, convert_to_label(rhythm_all)

# build_XY (defined further below) builds the X, Y matrices for each beat.
# It also returns the original beat symbols and rhythm notes for each row.
# The annotations are passed in as a pandas DataFrame.


def split_by_patient(train_val_split=None):
    '''Splits the records in ``pts`` by patient, i.e. no record can end up in
    both the training and the validation set.

        Parameters
        ----------
        train_val_split : float or None
            Passed to sklearn's train_test_split as ``test_size``. When None,
            a fixed, seeded split of 36 training records is used and the
            remaining records become the validation set.

        Returns
        ------
        pts_train : list
            Records to train on
        pts_valid : list
            Records to validate on

        Notes
        -----
        The default branch calls ``random.seed(42)``, which reseeds the global
        ``random`` generator. The other branch uses ``random_state=None``, so
        its result differs from run to run.

        References
        ----------
        https://towardsdatascience.com/detecting-heart-arrhythmias-with-deep-learning-in-keras-with-dense-cnn-and-lstm-add337d9e41f
    '''
    if train_val_split is None:
        random.seed( 42 )
        pts_train = random.sample(pts, 36)
        # Set lookup keeps the filter linear; order of pts is preserved.
        chosen = set(pts_train)
        pts_valid = [pt for pt in pts if pt not in chosen]
    else:
        seed = None   # truly random; use a fixed integer for a repeatable split
        pts_train, pts_valid = train_test_split(pts, random_state=seed, test_size=train_val_split)

    return pts_train, pts_valid
 
def split_randomly(X_all, Y_all):
    '''Random 75/25 split of ECGs and labels with a fixed seed (42).

    Parameters
    ----------
    X_all : np.ndarray
        Set of ECGs
    Y_all : np.ndarray
        Set of ECG labels

    Returns
    ------
    X_train : np.ndarray
        ECGs to train on
    X_valid : np.ndarray
        ECGs to validate on
    y_train : np.ndarray
        Labels associated with the training set
    y_valid : np.ndarray
        Labels associated with the validation set

    References
    ----------
    https://towardsdatascience.com/detecting-heart-arrhythmias-with-deep-learning-in-keras-with-dense-cnn-and-lstm-add337d9e41f
    '''
    pieces = train_test_split(X_all, Y_all, random_state=42, test_size=0.25)
    # train_test_split yields [X_train, X_valid, y_train, y_valid] for two inputs.
    return tuple(pieces)


def build_XY(p_signal, df_ann, num_cols, num_sec, fs, exclude_other):
    '''Cuts one fixed-width window per annotation out of a single ECG signal.

          Parameters
          ----------
          p_signal : np.ndarray
              A one-dimensional ECG signal
          df_ann : pd.DataFrame
              Frame with the columns 'atr_sym' (beat annotation),
              'atr_sample' (annotation location) and 'aux_note'
              (rhythm annotation)
          num_cols : int
              Number of points a window must have to be kept
          num_sec : Integral type
              Seconds taken on either side of the annotation location
          fs : int
              Sampling frequency of the signal
          exclude_other : bool
              Deprecated (unused)

          Returns
          ------
          X : np.ndarray
              Kept windows, one per row
          Y : np.ndarray
              Column of zeroes and ones; one when the beat symbol is listed
              in ``abnormal``
          sym : list
              Beat annotation of every kept window
          rhythm : list
              Rhythm annotation of every kept window

          References
          ----------
          https://towardsdatascience.com/detecting-heart-arrhythmias-with-deep-learning-in-keras-with-dense-cnn-and-lstm-add337d9e41f
    '''
    half_window = num_sec * fs
    signal_len = len(p_signal)
    capacity = len(df_ann)

    # Allocate for the worst case (every annotation kept); trimmed on return.
    X = np.zeros((capacity, num_cols))
    Y = np.zeros((capacity, 1))
    sym, rhythm = [], []

    kept = 0
    annotations = zip(df_ann["atr_sample"].values,
                      df_ann["atr_sym"].values,
                      df_ann["aux_note"].values)
    for location, beat_symbol, note in annotations:
        start = max(0, location - half_window)
        stop = min(signal_len, location + half_window)
        window = p_signal[start:stop]

        # Windows clipped by the start or end of the recording are dropped.
        if len(window) != num_cols:
            continue

        X[kept, :] = window
        Y[kept, :] = int(beat_symbol in abnormal)
        rhythm.append(note)
        sym.append(beat_symbol)
        kept += 1

    return X[:kept, :], Y[:kept, :], sym, rhythm

# def getAbnormalIndex(atr_sym,atr_sample):
#     ab_index = [b for a,b in zip(atr_sym,atr_sample) if a in abnormal][:10]
#     return ab_index

def convert_to_label(rhythm):
    '''Helper method used to determine a sample's rhythm label based on
    the previous samples. See: https://www.physionet.org/physiotools/wpg/wpg_30.htm for how the
    rhythm annotations work in the MIT-BIH arrhythmia dataset

          A rhythm annotation stays in force until the next non-empty one, so
          empty entries inherit the most recent label.

          Parameters
          ----------
          rhythm : iterable of str
              Rhythm annotations based on the MIT-BIH symbology, one per
              sample; '' means "no change"

          Returns
          ------
          R_all : list
              Integer labels: 0 = normal ('(N'), 1 = atrial fibrillation
              ('(AFIB'), 2 = any other rhythm.

          Notes
          -----
          Entries that precede the first non-empty annotation produce no
          label, so the result can be shorter than the input.
    '''
    NORMAL, AFIB, OTHER = 0, 1, 2

    current = None   # no rhythm seen yet
    R_all = []
    for note in rhythm:
        # Annotation strings may carry trailing NULs/whitespace.
        cleaned = note.strip(' \t\r\n\x00')

        if '(AFIB' in cleaned:
            current = AFIB
        elif cleaned == '(N':
            # Exact match: a substring test would also accept '(NOD'
            # (nodal rhythm) and label it as normal.
            current = NORMAL
        elif cleaned != '':
            current = OTHER

        if current is not None:
            R_all.append(current)

    return R_all
            

# Preprocessing


### Filters

In [None]:
from scipy.signal import medfilt
def median_filt(data):
  """Runs scipy's medfilt (default kernel size) over every ECG in ``data``.

  Parameters
  ----------
      data : np.ndarray
          An array whose rows are ECG signals
  Returns
  ------
      new_data : np.ndarray
          The median-filtered signals, in the same order as the input.
  """
  return np.asarray([medfilt(ecg) for ecg in data])

In [None]:
from scipy.signal import iirnotch,filtfilt
def notch_filt(data, samp_freq=360, notch_freq=60.0, quality_factor=30.0):
  """Applies a zero-phase IIR notch filter to every ECG in ``data``.

  Parameters
  ----------
      data : np.ndarray
          An array whose rows are ECG signals
      samp_freq : float
          Sampling frequency of the signals in Hz (defaults to 360)
      notch_freq : float
          Frequency to remove from the signals in Hz (defaults to 60.0)
      quality_factor : float
          Q of the notch; a larger value gives a narrower notch
          (defaults to 30.0)
  Returns
  ------
      new_data : np.ndarray
          Same input signals with the notch filter applied.
  """
  # The coefficients do not depend on the signal, so design the filter once.
  b_notch, a_notch = iirnotch(notch_freq, quality_factor, fs=samp_freq)
  # filtfilt runs the filter forwards and backwards.
  return np.asarray([filtfilt(b_notch, a_notch, ecg) for ecg in data])

### Make Images

The cell below accomplishes two things. Firstly, it *should* force images to be 128 by 128. Secondly, it *should* help with matplotlib's issues with memory leakage

In [None]:
# Figure size in inches = desired pixel size * 0.0139 (128 * 0.0139 ≈ 1.78).
# NOTE(review): 0.0139 ≈ 1/72, so this gives 128 px only when figures are
# saved at 72 dpi — confirm the dpi in effect when savefig runs.
plt.rcParams["figure.figsize"] = [1.78, 1.78]
# Let matplotlib adjust the layout of every figure automatically.
plt.rcParams["figure.autolayout"] = True
# Select the non-interactive Agg backend (figures are only written to files).
matplotlib.use('Agg')

In [None]:
def ecg_to_greyscale_image(array, labels,split,save = True,incl_other = False):
    """Function used to make a set of grayscale 2D ECG images from a set of
    ECGs.

    Images are written to ``./images_<split>/<class>/<class>_<n>.png`` where
    <class> is 'af' (label 1), 'normal' (label 0) or 'other' (any other
    label, only when ``incl_other`` is True) and <n> counts per class.

    Parameters
    ----------
        array : np.ndarray
            Array of ECG signals
        labels : np.ndarray
            Array of labels associated with ECGs
        split : str
            The name of the split (train/test/validation etc.)
        save : bool
            Flag used to tell the function whether or not it should save the
            images it produces (defaults to True)
        incl_other : bool
            When True, ECGs whose label is neither 0 nor 1 are saved to the
            'other' class; when False (default) they are skipped.

    """
    counters = {'af': 0, 'normal': 0, 'other': 0}
    ready_dirs = set()   # output folders already created in this call
    print(list(set(labels)))

    plt.ioff()
    for count in tqdm(range(len(array))):
        label = labels[count]
        if label == 1:
            class_name = 'af'
        elif label == 0:
            class_name = 'normal'
        elif incl_other:
            class_name = 'other'
        else:
            class_name = None

        # An excluded class must not be written at all: falling through here
        # would reuse the previous filename and overwrite that image.
        if save and class_name is None:
            continue

        fig = plt.figure()
        plt.axis("off")
        plt.plot(array[count],'k')

        if save:
            out_dir = './images_' + split + '/' + class_name
            if out_dir not in ready_dirs:
                os.makedirs(out_dir, exist_ok=True)
                ready_dirs.add(out_dir)
            filename = out_dir + '/' + class_name + '_' + str(counters[class_name]) + '.png'
            counters[class_name] += 1
            fig.savefig(filename)

        # Release the figure explicitly; thousands are created in this loop.
        fig.clear()
        plt.close(fig)

# Usecases

## Make datasets from scratch

In [None]:
# Create the class folders that ecg_to_greyscale_image writes into.
# os.makedirs(..., exist_ok=True) creates missing parents, fills in a missing
# sub-folder when the parent already exists, and is safe to re-run.
os.makedirs('images_train/af', exist_ok=True)
os.makedirs('images_train/normal', exist_ok=True)
os.makedirs('images_test/af', exist_ok=True)
os.makedirs('images_test/normal', exist_ok=True)

In [None]:
# Named data_path (not path) so the `from os import path` import used by the
# directory-creation cells is not overwritten with a string.
data_path = 'mitdb/'
df = pd.DataFrame()
num_sec = 1     # seconds taken on either side of each annotation
fs = 360        # sampling frequency passed to make_dataset
split = 0.1     # passed to split_by_patient as the validation/test share

# Get the patient samples to be used in test and training sets
train,test = split_by_patient(split)

# Creates the dataset
X_train, Y_train = make_dataset(data_path, num_sec, fs,train,True)
X_test, Y_test = make_dataset(data_path, num_sec, fs,test,True)


In [None]:
# Denoise both splits: median filter first, then the notch filter.
X_train = notch_filt(median_filt(X_train))
X_test = notch_filt(median_filt(X_test))

In [None]:
# Persist the filtered signals and their rhythm labels in the working directory.
np.save('X_train.npy',X_train)
np.save('Y_train.npy',Y_train)
np.save('X_test.npy',X_test)
np.save('Y_test.npy',Y_test)

In [None]:
# Render every ECG to a PNG under ./images_train/ and ./images_test/.
ecg_to_greyscale_image(X_train,Y_train,'train')
ecg_to_greyscale_image(X_test,Y_test,'test')

## Make datasets from presaved splits:
i.e. If you choose not to make your own splits you can use the presaved ones

In [None]:
# Unpack the pre-saved split archive into the working directory.
# NOTE(review): presumably yields X_train.npy, rhythm_train.npy, X_test.npy and
# rhythm_test.npy (the files loaded in the next cell) — confirm archive contents.
!unzip pre_saved_assests/saved_data_splits.npz

In [None]:
# Load the pre-saved signals (X_*) and labels (rhythm_*) from the working directory.
X_train = np.load('X_train.npy')
Y_train = np.load('rhythm_train.npy')
X_test = np.load('X_test.npy')
Y_test = np.load('rhythm_test.npy')

In [None]:
# Denoise both splits: median filter first, then the notch filter.
X_train = notch_filt(median_filt(X_train))
X_test = notch_filt(median_filt(X_test))

In [None]:
# Create the class folders that ecg_to_greyscale_image writes into. The paths
# are relative because the image writer saves to ./images_<split>/..., so the
# folders must live in the working directory rather than a fixed /content.
# os.makedirs(..., exist_ok=True) is safe to re-run and fills in missing sub-folders.
os.makedirs('images_train/af', exist_ok=True)
os.makedirs('images_train/normal', exist_ok=True)
os.makedirs('images_test/af', exist_ok=True)
os.makedirs('images_test/normal', exist_ok=True)

In [None]:
# Render every ECG to a PNG under ./images_train/ and ./images_test/.
ecg_to_greyscale_image(X_train,Y_train,'train')
ecg_to_greyscale_image(X_test,Y_test,'test')

## Moving dataset into presaved assets

In [None]:
# Bundle both image folders (recursively) into images.zip.
!zip -r images.zip images_train images_test

In [None]:
# Copy the archive into the pre-saved assets folder (-a archive mode, -v verbose).
%cp -av images.zip pre_saved_assests

# Next: 
With your new datasets, you can now move to the generative component: https://colab.research.google.com/drive/1RTMFgeI8X0Kchjicr6fd0OHqGZzmaskL?usp=sharing
