In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import gc
import h5py

In [None]:
# Readable names for the second element of each `files` entry below.
WITH_LABELS = True
WITHOUT_LABELS = False

# (feather file stem, save-labels flag) pairs to convert. The stem is expanded
# to out_veremi/<stem>.feather by the conversion loop; comment entries in or
# out to choose which splits are regenerated.
files = [
    # ('test', WITH_LABELS),
    ('train', WITH_LABELS),
    # ('train_full_genuine', WITH_LABELS),
    # ('train_95_genuine', WITH_LABELS),
    # ('train_90_genuine', WITH_LABELS),
]

# Feature columns copied into every window, in this exact order:
# <side>_<quantity>_<axis> for every combination (side varies slowest, axis
# fastest), followed by the single 'delta_time' column.
fields = [
    f'{side}_{quantity}_{axis}'
    for side in ('snd', 'rcv')
    for quantity in ('pos', 'spd', 'acl', 'hed')
    for axis in ('x', 'y')
] + ['delta_time']

# Number of rows in a full window, and the offset in rows between the starts
# of consecutive windows.
max_window_size = 20
stride = 10

def _window_sequences(array, window_size, step):
    """Split one time-ordered group into a list of window tensors.

    Full windows of ``window_size`` rows are taken every ``step`` rows. If
    ``len(array)`` is not a multiple of ``window_size``, the last
    ``len(array) % window_size`` rows are appended as one extra, shorter
    window.

    NOTE(review): the trailing window is sized from ``window_size`` and
    ignores ``step``. With window 20 / step 10 every row is still covered,
    but the tail can repeat rows that are already in the last full window;
    with a step that does not divide the window, trailing rows could be
    skipped. Behaviour is kept unchanged here -- confirm the intent.

    Parameters:
        array: 2-D numpy array, one row per message, already sorted in time.
        window_size: number of rows in a full window.
        step: offset in rows between the starts of consecutive windows.

    Returns:
        list of torch tensors, full windows first, then the optional tail.
    """
    n_rows = len(array)
    # Same starts as i * step for i in range((n_rows - window_size) // step + 1);
    # empty when the group is shorter than one full window.
    windows = [
        torch.tensor(array[start:start + window_size])
        for start in range(0, n_rows - window_size + 1, step)
    ]
    tail_rows = n_rows % window_size
    if tail_rows > 0:
        windows.append(torch.tensor(array[-tail_rows:]))
    return windows


def _replace_dataset(hf, name, data):
    """Store ``data`` as dataset ``name`` in the open HDF5 file ``hf``,
    deleting a previous dataset of the same name first."""
    if name in hf:
        del hf[name]
    hf.create_dataset(name, data=data)


h5_path = "out_veremi/veremi.h5"

for file, save_labels in files:
    df = pd.read_feather(f'out_veremi/{file}.feather')

    # group by sender and receiver
    # TODO: group only by sender
    grouped = df.groupby(['dataset_id', 'sender', 'receiver', 'attack_type'])
    # Visit the groups from largest to smallest.
    sorted_order = grouped.size().sort_values(ascending=False).index

    sequences = []
    labels = []
    for k in tqdm(sorted_order):
        t = grouped.get_group(k).sort_values('bsm_rcv_time')
        group_windows = _window_sequences(t[fields].to_numpy(), max_window_size, stride)
        sequences.extend(group_windows)
        if save_labels:
            # k[3] is the attack_type component of the group key; every
            # window of the group gets that label.
            labels.extend([k[3]] * len(group_windows))
        del t

    # The groupby object keeps a reference to the frame, so it has to be
    # dropped as well or `del df` frees nothing before the padding step.
    del sorted_order, grouped, df
    gc.collect()

    if not sequences:
        # Fail before the HDF5 file is touched: there is nothing to pad or
        # store, and previously written datasets should stay consistent.
        raise ValueError(f"no sequences produced from out_veremi/{file}.feather")

    # Longest sequences first. The sort is stable, so equal-length sequences
    # keep their generation order, and the same permutation is applied to the
    # labels so both datasets stay aligned.
    order = sorted(range(len(sequences)), key=lambda i: sequences[i].shape[0], reverse=True)

    labels_array = None
    if save_labels:
        # Each row is (position before sorting, attack_type) -- the layout
        # this notebook has always written.
        labels_array = np.array([(i, labels[i]) for i in order])
    del labels

    sequences = [sequences[i] for i in order]
    del order

    # pad_sequence with its defaults stacks along dim 1, giving
    # (longest sequence, number of sequences, len(fields)).
    padded = nn.utils.rnn.pad_sequence(sequences).numpy()
    del sequences

    # Write labels and data in one session, only after padding succeeded, so
    # the two datasets cannot get out of step with each other.
    with h5py.File(h5_path, "a") as hf:
        if labels_array is not None:
            _replace_dataset(hf, f"{file}_labels", labels_array)
        _replace_dataset(hf, file, padded)

    del padded, labels_array

  0%|          | 0/3311449 [00:00<?, ?it/s]