# OnHW Dataset Preparation

In [None]:
import json
import os
import pickle
from copy import copy
from datetime import datetime
from glob import glob

import numpy as np
from tqdm import tqdm

# Alternative configuration (writer-independent split); swap with the active
# one below to build that variant instead.
# dir_raw = '../../data/raw/Words500_indep_R'
# dir_out = '../../data/onhw_wi_word_rh'
# writer_indep = True

# Active configuration (writer-dependent split).
dir_raw = '../../data/raw/Words500_dep_R'
dir_out = '../../data/onhw_wd_word_rh'
writer_indep = False

# Label alphabet. Ground-truth entries are decoded later by indexing into this
# string, so the character order must not change.
cats = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'

# Template shared by the annotation files of every split. It is meant to be
# copied, never filled in directly.
annos_temp = {
    'info': {
        'idxs_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'num_channel': 13,
        'rate_sample_target': 100,
        'sensors': ['AF', 'AR', 'G', 'M', 'F'],
        'time_build': datetime.now().strftime('%Y%m%d%H%M%S'),
        # NOTE(review): hard-coded; presumably matches the number of
        # sub-directories found in dir_raw -- confirm.
        'num_fold': 5,
        'writer_independent': writer_indep,
    },
    'categories': sorted(cats),
    'annotations': {},
}

# The extra trailing space is appended only AFTER 'categories' was built, so it
# is a valid decoding index without being listed as a category. Decoded labels
# are stripped, which removes it again when it occurs at either end.
cats += ' '

# One entry per sub-directory of dir_raw; the position in this list becomes
# the fold index. glob() returns paths in arbitrary order, so sort to keep the
# directory -> fold index mapping reproducible across runs and machines.
dirs_fd = sorted(obj for obj in glob(f'{dir_raw}/*') if os.path.isdir(obj))

## Format val dataset

In [None]:
# Local import keeps this cell self-contained (the notebook header only
# imports the shallow ``copy``).
from copy import deepcopy

# A shallow copy would share the nested 'annotations' dict with the template
# (and therefore with every other split built from it). Deep-copy instead and
# start from an empty mapping in case the template was already touched.
annos_val = deepcopy(annos_temp)
annos_val['annotations'] = {}

# Sequences longer than this are dropped together with the empty ones.
len_seq_max = 1024

# process val data: every entry of dirs_fd is handled as fold ``i``
for i, dir_fd in enumerate(dirs_fd):
    dir_split = os.path.join('data', str(i), 'val')
    os.makedirs(os.path.join(dir_out, dir_split), exist_ok=True)

    # load val data
    # NOTE(review): pickle.load can execute arbitrary code -- only run this on
    # trusted dataset files.
    with open(os.path.join(dir_fd, 'all_x_dat_val_imu.pkl'), 'rb') as f:
        seqs_val = pickle.load(f)

    with open(os.path.join(dir_fd, 'all_val_gt.pkl'), 'rb') as f:
        gts_val = pickle.load(f)

    with open(os.path.join(dir_fd, 'val_ids.pkl'), 'rb') as f:
        ids_val = pickle.load(f)

    assert (
        len(seqs_val) == len(gts_val) == len(ids_val)
    ), 'Numbers of seqs, gts and ids of val set are not the same.'

    # keep only non-empty sequences of length <= len_seq_max
    data_fd = [
        (seq, gt, id_writer)
        for seq, gt, id_writer in zip(seqs_val, gts_val, ids_val)
        if 0 < len(seq) <= len_seq_max
    ]

    # save val data: one ';'-separated CSV per kept sequence, numbered
    # consecutively (after filtering) within the fold
    annos_fd = []
    for cnt, (seq, gt, id_writer) in enumerate(tqdm(data_fd)):
        anno = {
            'filename': os.path.join(dir_split, str(cnt).zfill(8) + '.csv'),
            'id': cnt,
            # map every ground-truth index to its character; strip() removes
            # leading/trailing spaces produced by the extra ' ' entry of cats
            'label': ''.join(cats[val] for val in gt).strip(),
            # NOTE(review): json.dump fails on numpy scalars -- confirm the
            # ids stored in the pickle are plain Python values.
            'id_writer': id_writer,
            'rate_sample_orig': 100,
        }
        np.savetxt(os.path.join(dir_out, anno['filename']), seq, delimiter=';')
        annos_fd.append(anno)

    annos_val['annotations'][i] = annos_fd

with open(os.path.join(dir_out, 'val.json'), 'w') as f:
    json.dump(annos_val, f)

## Format train dataset

In [None]:
# Local import keeps this cell self-contained (the notebook header only
# imports the shallow ``copy``).
from copy import deepcopy

# A shallow copy would share the nested 'annotations' dict with the template
# (and therefore with every other split built from it). Deep-copy instead and
# start from an empty mapping in case the template was already touched.
annos_train = deepcopy(annos_temp)
annos_train['annotations'] = {}

# Sequences longer than this are dropped together with the empty ones.
len_seq_max = 1024

# process train data: every entry of dirs_fd is handled as fold ``i``
for i, dir_fd in enumerate(dirs_fd):
    dir_split = os.path.join('data', str(i), 'train')
    os.makedirs(os.path.join(dir_out, dir_split), exist_ok=True)

    # load train data
    # NOTE(review): pickle.load can execute arbitrary code -- only run this on
    # trusted dataset files.
    with open(os.path.join(dir_fd, 'all_x_dat_train_imu.pkl'), 'rb') as f:
        seqs_train = pickle.load(f)

    with open(os.path.join(dir_fd, 'all_train_gt.pkl'), 'rb') as f:
        gts_train = pickle.load(f)

    with open(os.path.join(dir_fd, 'train_ids.pkl'), 'rb') as f:
        ids_train = pickle.load(f)

    assert (
        len(seqs_train) == len(gts_train) == len(ids_train)
    ), 'Numbers of seqs, gts and ids of train set are not the same.'

    # keep only non-empty sequences of length <= len_seq_max
    data_fd = [
        (seq, gt, id_writer)
        for seq, gt, id_writer in zip(seqs_train, gts_train, ids_train)
        if 0 < len(seq) <= len_seq_max
    ]

    # save train data: one ';'-separated CSV per kept sequence, numbered
    # consecutively (after filtering) within the fold
    annos_fd = []
    for cnt, (seq, gt, id_writer) in enumerate(tqdm(data_fd)):
        anno = {
            'filename': os.path.join(dir_split, str(cnt).zfill(8) + '.csv'),
            'id': cnt,
            # map every ground-truth index to its character; strip() removes
            # leading/trailing spaces produced by the extra ' ' entry of cats
            'label': ''.join(cats[val] for val in gt).strip(),
            # NOTE(review): json.dump fails on numpy scalars -- confirm the
            # ids stored in the pickle are plain Python values.
            'id_writer': id_writer,
            'rate_sample_orig': 100,
        }
        np.savetxt(os.path.join(dir_out, anno['filename']), seq, delimiter=';')
        annos_fd.append(anno)

    annos_train['annotations'][i] = annos_fd

with open(os.path.join(dir_out, 'train.json'), 'w') as f:
    json.dump(annos_train, f)