# Why is systematization important?

The annotation files are not consistent between databases,
and even within the cudb database itself.

The systematization was originally motivated by a plan to combine multiple databases, but I opted for a different approach.

The afdb samples and annotations are downsampled from 250 Hz to 125 Hz to reduce RAM requirements and training time.

For example, in some annotation files the start of a VF episode is annotated by "+, (VF",
and in others only by '\[' for the start and '\]' for the end.
Also, in some cases the end of the VF episode is not annotated at all.

In the other databases, all episodes start with "+, (episode",
so the cudb files will be parsed so that they are consistent
if more databases are used going forward.



In [4]:
import os
import pandas as pd

# Sampling rate (Hz) of the original afdb recordings.
AFDB_HZ = 250
# Episode labels looked for when repairing mis-parsed annotation rows.
AFDB_EPISODES = ['(N', '(AFIB', '(AFL', '(J']

# Raw strings keep the Windows-style backslashes literal; the previous
# non-raw literals relied on the invalid escape sequence '\p', which newer
# Python versions warn about.
data_dir = r'data\physionet_dbs\afdb\annotations'
write_data_dir = r'data\physionet_dbs\afdb\annotations_systemized_125hz'
# Create the output directory if needed; do nothing when it already exists.
os.makedirs(write_data_dir, exist_ok=True)

In [5]:

# Convert every annotation file in data_dir into a cleaned CSV in
# write_data_dir, with sample indices halved (250 Hz -> 125 Hz).
# Sorting makes the processing order deterministic across platforms.
for file in sorted(os.listdir(data_dir)):
    # Record name without the annotation extension, e.g. '04015.ann' -> '04015'.
    record_name = str(file).replace('.ann', '')
    print(record_name)

    # Columns are separated by runs of two or more whitespace characters;
    # a regex separator requires the python parsing engine.
    dataset = pd.read_table(
        os.path.join(data_dir, file),
        header=None,
        index_col=False,
        engine='python',
        sep=r'[\s]{2,}',
    )
    print(dataset.shape)

    # columns and dtypes
    dataset.columns = ['timestamp', 'sample_idx', 'beat', 'misc_1', 'misc_2', 'episode']
    dataset.drop(labels=['timestamp', 'misc_1', 'misc_2'], axis=1, inplace=True)
    dataset['beat'] = dataset['beat'].astype(str)
    dataset['episode'] = dataset['episode'].astype(str)

    # Set sample_idx aside so that genuine 0 values in it are not blanked
    # by the replacements below.
    sample_idx_column = dataset['sample_idx']
    dataset.drop(labels=['sample_idx'], inplace=True, axis=1)

    # handling problematic parsing: blank out stray zeros, strip the
    # '0\t' prefix that ends up glued to an episode label, then turn the
    # blanks into missing values.
    dataset.replace({0: '', '0': ''}, inplace=True)
    dataset.replace(
        {f'0\t{episode_iter}': f'{episode_iter}' for episode_iter in AFDB_EPISODES},
        inplace=True,
    )
    dataset.replace({'': None}, inplace=True)

    # recover sample_idx column
    dataset.insert(0, 'sample_idx', sample_idx_column)
    dataset['sample_idx'] = dataset['sample_idx'].astype('int32')

    # Halve the sample indices in one vectorised step. Series.round uses
    # round-half-to-even, the same rule as the built-in round().
    dataset['sample_idx'] = (dataset['sample_idx'] / 2).round().astype('int32')

    dataset.sort_values(by=['sample_idx'], inplace=True)
    dataset.reset_index(drop=True, inplace=True)

    dataset.to_csv(os.path.join(write_data_dir, f'{record_name}.csv'), index=False)


04015.ann
(15, 6)
04043.ann
(166, 6)
04048.ann
(15, 6)
04126.ann
(15, 6)
04746.ann
(11, 6)
04908.ann
(17, 6)
04936.ann
(73, 6)
05091.ann
(17, 6)
05121.ann
(42, 6)
05261.ann
(23, 6)
06426.ann
(55, 6)
06453.ann
(13, 6)
06995.ann
(11, 6)
07162.ann
(1, 6)
07859.ann
(1, 6)
07879.ann
(5, 6)
07910.ann
(10, 6)
08215.ann
(4, 6)
08219.ann
(79, 6)
08378.ann
(17, 6)
08405.ann
(4, 6)
08434.ann
(7, 6)
08455.ann
(4, 6)


In [6]:
%reset