In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wfdb
import os
import neurokit2 as nk #TODO look into this algorithm
from tqdm import tqdm


In [2]:
# path_to_dataset = "2_data\physionet_datasets\mitbih\mit-bih-arrhythmia-database-1.0.0"
# sample_of_interest = "221" # 221, 210 for AF | 100, 101, 103, 105, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 121, 122, 123 for Normal 
# path_to_sample_1 = os.path.join(path_to_dataset, sample_of_interest)

path_to_dataset = "2_data\physionet_datasets\mitbih\\af_dataset"

samples = ['04048', '04746', '07162', '07859', '06995', '08405', 
            '04126', '05261', '05121', '08215', '08434', '04908', 
            '06426', '07879', '08455', '04015', '06453', '05091', 
            '08378', '08219', '04043', '04936', '07910']

# for sample in samples:
    
#     path_to_sample = os.path.join(dataset_path, sample)

#     print("Reading record...")
#     record = wfdb.rdrecord(path_to_sample)
#     print("Reading annotations...")
#     ann = wfdb.rdann(path_to_sample, 'atr')
    
#     print(ann.sample, ann.symbol)
# print("plotting...")
# wfdb.plot_wfdb(record=record, annotation=ann, plot_sym=True,
#                time_units='seconds', title=f'MIT-BIH Record {sample_of_interest}',
#                figsize=(10,4), ecg_grids='all')


In [3]:
# entire_ecg_signal = record.p_signal.T[0]
# plt.plot(entire_ecg_signal)
# plt.xlim(0,250*5)
# plt.ylim(-0.5,2)

In [4]:
def split_ecgs_after_given_number_of_R_peaks(ecg_signal, no_R_peaks_per_split, sample_name):
    
    
    # plt.plot(record.p_signal.T[1])
    _, rpeaks = nk.ecg_peaks(ecg_signal, sampling_rate=300)
    R_peaks = rpeaks['ECG_R_Peaks']
    # plot = nk.events_plot(rpeaks['ECG_R_Peaks'], ecg_signal)
    
    indexed_peaks = dict(enumerate(R_peaks))
    
    no_splits = max(indexed_peaks.keys()) // no_R_peaks_per_split + 1
    split_indices_of_ecg = [indexed_peaks[i*no_R_peaks_per_split] for i in range(no_splits)]
    length_of_each_ecg_split_section = (np.pad(np.array(split_indices_of_ecg),1)[1:] - np.pad(np.array(split_indices_of_ecg),1)[:-1])[1:-1]
    
    assert np.std(length_of_each_ecg_split_section) / np.mean(length_of_each_ecg_split_section) <= 0.3, f"check for anomolous section lengths from sample {sample_name}: \n {length_of_each_ecg_split_section} "
    
    ecg_split_into_sections = [ecg_signal[split_indices_of_ecg[i]:split_indices_of_ecg[i+1]] for i in range(len(split_indices_of_ecg)-1)]
        
    return ecg_split_into_sections, split_indices_of_ecg

# ecg_split_into_sections, split_indices_of_ecg = split_ecgs_after_given_number_of_R_peaks(entire_ecg_signal, 32, "")
# split_indices_of_ecg

In [5]:
def is_annotation_of_change_of_heartrate_present_in_interval(interval: tuple, ann):
    for i_, index_in_ecg in enumerate(ann.sample):
        if index_in_ecg >= interval[0] and index_in_ecg <= interval[1]:
            if ann.symbol[i_] == "+":
                return True
        
    return False

In [7]:
def split_ecgs_from_mitbih_dataset_and_save_as_npy(dataset_path, filetype, no_R_peaks_per_split=32, completed_samples=[]):
    
    sample_names = set([file.split(filetype,2)[0] for file in os.listdir(dataset_path) if file.endswith(filetype)])
    sample_names = [sample_name for sample_name in sample_names if sample_name not in completed_samples]
    print(f"files to be split: {sample_names}")
    
    try:
        
        for sample_name in tqdm(sample_names):
            
            path_to_sample = os.path.join(dataset_path, sample_name)
            record = wfdb.rdrecord(path_to_sample)
            ann = wfdb.rdann(path_to_sample, 'atr')
            entire_ecg_signal = record.p_signal.T[0]
            
            try:
                split_ecg, split_indices_of_ecg = split_ecgs_after_given_number_of_R_peaks(entire_ecg_signal, no_R_peaks_per_split, sample_name)
                
            except AssertionError:
                continue
                
                
            for i,ecg_section in enumerate(split_ecg):
                
                interval = (split_indices_of_ecg[i], split_indices_of_ecg[i+1]) 
                af_is_present = is_annotation_of_change_of_heartrate_present_in_interval(interval, ann)
                if af_is_present: filename_ending = "AF"
                else: filename_ending = "N"
                new_filename_of_split_ecg = "{}_{}_out_of_{}_{}".format(os.path.join("experimental_split_ecgs", sample_name), str(i).zfill(3), str(len(split_ecg)).zfill(3), filename_ending)
                new_path_of_split_ecg = os.path.join(path_to_dataset, new_filename_of_split_ecg)
                np.save(new_path_of_split_ecg, ecg_section)
                completed_samples.append(sample_name)
                
    except KeyboardInterrupt:
        
        return completed_samples
    
path_to_dataset = "2_data\physionet_datasets\mitbih\\af_dataset"
filetype = ".dat"
# completed_samples = []
completed_samples = split_ecgs_from_mitbih_dataset_and_save_as_npy(path_to_dataset, filetype, 32, completed_samples)

files to be split: ['05091', '05261', '04043', '05121', '04936', '06426', '04746', '06995', '07162', '08378', '04048', '07910', '06453', '08455', '07879', '08434', '04015', '04126', '07859', '08219', '08405', '04908', '08215']


100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [07:39<00:00, 19.98s/it]


In [None]:
# for i,ecg_section in enumerate(tqdm(split_ecg)):
    
#     new_filename_of_split_ecg = "AF_{}_{}_out_of_{}".format(os.path.join("split_ecgs", sample_of_interest), str(i).zfill(3), str(len(split_ecg)).zfill(3))
#     new_path_of_split_ecg = os.path.join(path_to_dataset, new_filename_of_split_ecg)
    
#     np.save(new_path_of_split_ecg, ecg_section)

In [None]:
# arr = np.load(f"2_data\\physionet_datasets\\mitbih\\mit-bih-arrhythmia-database-1.0.0\\split_ecgs\\{sample_of_interest}_000_out_of_0{len(split_ecg)}.npy")

In [None]:
# arr.shape