# Why is systematization important?

The annotation files are not consistent between databases,
and even within the cudb database itself.

For example, in some annotation files the start of a VF episode is annotated by "+, (VF",
while in others it is marked only by '\[' for the start and '\]' for the end.
Also, in some cases the end of the VF episode is not annotated at all.

In the other databases, all episodes start with "+, (episode",
so the cudb files are rewritten to follow the same convention.
This keeps the annotations consistent if more databases are used going forward.



In [11]:
import os
import pandas as pd

# NOTE(review): presumably the sampling rate of the CUDB recordings in Hz;
# it is not used in the visible cells - confirm against the database docs.
CUDB_HZ = 250

# Episode labels (as they appear in the annotation files) that are treated
# as explicit episode starts by the systemization step.
CUDB_EPISODES = ['(AF', '(VF', '(VT', '(N']

# Raw strings keep the backslashes literal; the previous non-raw literals
# relied on the invalid escape sequences `\p` and `\c`, which newer Python
# versions warn about. The resulting string values are unchanged.
data_dir = r'data\physionet_dbs\cudb\annotations'
write_data_dir = r'data\physionet_dbs\cudb\annotations_systemized'

# Create the output directory (and any missing parents) if it is not there yet.
os.makedirs(write_data_dir, exist_ok=True)

In [12]:

# Systemize every annotation file found in `data_dir` and write the result as
# a CSV with the columns sample_idx, beat, episode into `write_data_dir`.
for file in os.listdir(data_dir):
    print(str(file).replace('.txt',''))

    # Fields are separated by runs of two or more whitespace characters.
    # The raw string keeps `\s` a regex escape instead of an invalid string escape.
    dataset = pd.read_table(
        os.path.join(data_dir, file),
        header=None,
        index_col=False,
        engine='python',
        sep=r'[\s]{2,}',
    )
    print(dataset.shape)

    # columns and dtypes
    dataset.columns = ['timestamp', 'sample_idx', 'beat', 'misc_1', 'misc_2', 'episode']
    dataset.drop(labels=['timestamp', 'misc_1', 'misc_2'], axis=1, inplace=True)
    dataset['beat'] = dataset['beat'].astype(str)
    dataset['episode'] = dataset['episode'].astype(str)

    # set sample_idx aside so that its 0s are not blanked by the replacements below
    sample_idx_column = dataset.pop('sample_idx')

    # handling problematic parsing:
    #   - a bare 0 in the remaining columns is a placeholder, not an annotation
    #   - an episode label may arrive glued to that placeholder as '0\t(XX'
    dataset.replace({0: '', '0': ''}, inplace=True)
    dataset.replace(
        {f'0\t{episode_iter}': episode_iter for episode_iter in CUDB_EPISODES},
        inplace=True,
    )
    dataset.replace({'': None}, inplace=True)

    # recover sample_idx column
    dataset.insert(0, 'sample_idx', sample_idx_column)
    dataset['sample_idx'] = dataset['sample_idx'].astype('int32')

    # --------------------------------------------------------------------------------
    # CORRECTION 1: Removing VFL redundant brackets '[' and ']'
    #
    # They cannot simply be deleted: when an episode start is not annotated
    # explicitly, the bracket is the only marker and must be converted instead.

    # -----------------------------------------
    # Case 1 for [: if there is an explicitly annotated VF episode, remove the [

    dataset.reset_index(drop=True, inplace=True)

    is_open_bracket = dataset['beat'] == '['
    is_vf_episode = dataset['episode'] == '(VF'
    # (VF directly precedes or follows the row; the shifts are padded with
    # False so the first/last row (or a single-row file) has no missing neighbour
    vf_is_adjacent = (
        is_vf_episode.shift(1, fill_value=False)
        | is_vf_episode.shift(-1, fill_value=False)
    )
    dataset = dataset[~(is_open_bracket & vf_is_adjacent)].reset_index(drop=True)

    # -----------------------------------------
    # Case 2 for [: if there isn't an explicitly annotated VF episode, replace the [ with +, VF

    remaining_open = dataset['beat'] == '['
    dataset.loc[remaining_open, 'beat'] = '+'
    dataset.loc[remaining_open, 'episode'] = '(VF'

    # -----------------------------------------
    # Case 1 for ]: if there is an episode following the VF episode, remove the ]

    is_close_bracket = dataset['beat'] == ']'
    # the index is 0..n-1 here, so it can be compared against row positions
    is_first_row = dataset.index == 0
    is_last_row = dataset.index == len(dataset) - 1
    next_starts_episode = dataset['episode'].shift(-1).isin(CUDB_EPISODES)
    # a ] in the very first row is never dropped here (it is converted in Case 2)
    redundant_close = is_close_bracket & ~is_first_row & (next_starts_episode | is_last_row)
    dataset = dataset[~redundant_close].reset_index(drop=True)

    # -----------------------------------------
    # Case 2 for ]: if there isn't an episode following the VF episode, assume (N

    remaining_close = dataset['beat'] == ']'
    dataset.loc[remaining_close, 'beat'] = '+'
    dataset.loc[remaining_close, 'episode'] = '(N'

    # --------------------------------------------------------------------------------
    # CORRECTION 2: Signal quality annotation change from '~, ' to '~, NOISE'

    dataset.loc[dataset['beat'] == '~', 'episode'] = 'NOISE'

    # --------------------------------------------------------------------------------
    # CORRECTION 3: Ensure explicit episode annotation at beginning of file
    #
    # If the first row carries no episode label (or is a signal-quality row),
    # a '+, (N' row at sample 0 is added. pd.isna covers both None and NaN.

    needs_start_row = (
        dataset.empty
        or pd.isna(dataset.iloc[0]['episode'])
        or dataset.iloc[0]['beat'] == '~'
    )
    if needs_start_row:
        start_row = pd.DataFrame(data=[[0, '+', '(N']], columns=['sample_idx', 'beat', 'episode'])
        # DataFrame.append no longer exists in pandas 2.x; the row is put in
        # front so that the stable sort below keeps it ahead of any other
        # annotation located at sample 0
        dataset = pd.concat([start_row, dataset], ignore_index=True)

    # stable sort: rows sharing a sample_idx keep their relative order
    dataset.sort_values(by=['sample_idx'], kind='stable', inplace=True)
    dataset.reset_index(drop=True, inplace=True)

    output_name = f'{str(file.replace(".ann",""))}.csv'
    dataset.to_csv(os.path.join(write_data_dir, output_name), index=False)

    # break

cu01.ann
(206, 6)
cu02.ann
(970, 6)
cu03.ann
(941, 6)
cu04.ann
(248, 6)
cu05.ann
(697, 6)
cu06.ann
(453, 6)
cu07.ann
(377, 6)
cu08.ann
(1209, 6)
cu09.ann
(925, 6)
cu10.ann
(557, 6)
cu11.ann
(508, 6)
cu12.ann
(410, 6)
cu13.ann
(870, 6)
cu14.ann
(534, 6)
cu15.ann
(284, 6)
cu16.ann
(836, 6)
cu17.ann
(533, 6)
cu18.ann
(688, 6)
cu19.ann
(679, 6)
cu20.ann
(208, 6)
cu21.ann
(647, 6)
cu22.ann
(440, 6)
cu23.ann
(443, 6)
cu24.ann
(511, 6)
cu25.ann
(559, 6)
cu26.ann
(773, 6)
cu27.ann
(1065, 6)
cu28.ann
(385, 6)
cu29.ann
(543, 6)
cu30.ann
(118, 6)
cu31.ann
(262, 6)
cu32.ann
(780, 6)
cu33.ann
(549, 6)
cu34.ann
(255, 6)
cu35.ann
(329, 6)
