# Extraction of clean episode signals

The noise annotations in cudb are quite ambiguous, so extracting clean episode signals before
creating the segments seems like a good idea; it will simplify the subsequent data engineering.


In [3]:
import pandas as pd
import os

# Directories for the episode annotations, the signal samples, and the extracted
# episode signals. Raw strings keep the Windows-style backslashes literal
# instead of relying on invalid escape sequences such as "\p" and "\s".
ann_data_dir = r'data\physionet_dbs\afdb\annotations_systemized_125hz'
samp_data_dir = r'data\physionet_dbs\afdb\samples_125hz'
write_data_dir = r'data\physionet_dbs\afdb\uniclass_episode_signals_testing_125hz'

# Create the output directory if it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(write_data_dir, exist_ok=True)

In [4]:
# Records that are excluded from the extraction.
# NOTE(review): the reason for excluding these records is not stated in this notebook.
skipped_files = {
    '04015.csv', '04043.csv', '04048.csv', '04126.csv', '04746.csv',
    '04908.csv', '04936.csv', '05091.csv', '05121.csv', '05261.csv',
}

# For every annotation file, cut the sample signal into one CSV per episode span.
for file in os.listdir(ann_data_dir):

    if file in skipped_files:
        continue

    print(file)

    ann_file = pd.read_csv(os.path.join(ann_data_dir, file))
    samp_file = pd.read_csv(os.path.join(samp_data_dir, file))
    samp_file.columns = ['sample_idx', 'ecg']

    # beat annotations are irrelevant, so they are dismissed; the index is reset
    # so that ann_idx below is also the positional index used with .iloc
    ann_file = ann_file[ann_file['episode'].notna()].reset_index(drop=True)
    last_ann_idx = ann_file.shape[0] - 1

    # the most recent rhythm episode; it is reused as the label when a span
    # starts at a NOISE annotation
    ongoing_episode = None

    # iterating episodes
    for ann_idx, ann_row in ann_file.iterrows():

        is_last_ann = ann_idx == last_ann_idx

        if ann_row['episode'] == 'NOISE':
            # A NOISE annotation directly followed by a rhythm annotation is
            # skipped: no signal is written for that span.
            # NOTE(review): the original comment said spans between two
            # consecutive NOISE annotations are dismissed, but the code keeps
            # those and dismisses this case instead -- confirm the intent.
            if not is_last_ann and ann_file.iloc[ann_idx + 1]['episode'] != 'NOISE':
                continue

            # No rhythm episode has been seen yet, so there is no label for the
            # span; previously this crashed on None.replace(...) below.
            if ongoing_episode is None:
                continue

            # Last NOISE annotation (the rest of the signal is assumed to be
            # fine and noiseless) or NOISE followed by NOISE: the span starts
            # right after the annotation and keeps the ongoing rhythm label.
            episode = ongoing_episode
            episode_start_sample = ann_row['sample_idx'] + 1
        else:
            # in the normal case when a rhythm episode begins
            episode = ann_row['episode']
            ongoing_episode = episode
            episode_start_sample = ann_row['sample_idx']

        # if last episode, go to end of file, otherwise end is the sample just
        # before the next annotation
        if is_last_ann:
            episode_end_sample = samp_file.iloc[-1]['sample_idx']
        else:
            episode_end_sample = ann_file.iloc[ann_idx + 1]['sample_idx'] - 1

        # both bounds are inclusive
        clean_episode_signal = samp_file[
            (samp_file['sample_idx'] >= episode_start_sample) &
            (samp_file['sample_idx'] <= episode_end_sample)
        ]

        # output name: <record>_<episode label without "(">_<annotation index>.csv
        out_name = (
            f'{file.replace(".csv","")}_'
            f'{episode.replace("(","")}_'
            f'{ann_idx}.csv'
        )
        clean_episode_signal.to_csv(
            os.path.join(write_data_dir, out_name),
            index=False
        )

06426.csv
06453.csv
06995.csv
07162.csv
07859.csv
07879.csv
07910.csv
08215.csv
08219.csv
08378.csv
08405.csv
08434.csv
08455.csv


In [3]:
# IPython magic: clear the interactive namespace, removing the names defined by the cells above.
%reset