In [1]:
import json
import math
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import OrderedDict

In [2]:
BSM_SIZE = 101347646 # Already counted
N_FEATURES = 22
root_dir = '/home/jovyan/work/VeReMi/'
filenames = glob.glob(root_dir + '**/traceJSON*.json', recursive=True)

In [3]:
# # Count the number of BSM messages in the dataset
# line_num = 0
# for filepath in tqdm(filenames):
#     f = open(filepath)
#     for line in f:
#         if '"type":3' in line:
#             line_num += 1
    
# line_num

In [4]:
attacker_sender_map = {}

datasets = OrderedDict()

for filepath in filenames:
    path_parts = filepath.split('/')
    filename = path_parts[-1]
    dataset_name = path_parts[-3]
    _, vehicle, _, attacker = filename.split('-')[:4]
    attacker_type = int(attacker[1:])
    vehicle_id = int(vehicle)
    attacker_sender_map[f'{dataset_name}:{vehicle_id}'] = attacker_type
    datasets[dataset_name] = 1

for i, k in enumerate(datasets.keys()):
    datasets[k] = i

datasets

OrderedDict([('ConstSpeed_0709', 0),
             ('EventualStop_1416', 1),
             ('DataReplaySybil_0709', 2),
             ('RandomSpeed_1416', 3),
             ('DoSRandomSybil_1416', 4),
             ('DoSRandom_0709', 5),
             ('DataReplay_0709', 6),
             ('DataReplay_1416', 7),
             ('RandomPos_1416', 8),
             ('ConstSpeed_1416', 9),
             ('RandomPos_0709', 10),
             ('GridSybil_0709', 11),
             ('ConstPos_1416', 12),
             ('RandomPosOffset_1416', 13),
             ('DoS_0709', 14),
             ('ConstPosOffset_1416', 15),
             ('Disruptive_1416', 16),
             ('DoS_1416', 17),
             ('DelayedMessages_0709', 18),
             ('ConstSpeedOffset_0709', 19),
             ('DoSDisruptiveSybil_0709', 20),
             ('ConstPos_0709', 21),
             ('MixAll_0024', 22),
             ('DoSRandom_1416', 23),
             ('RandomSpeed_0709', 24),
             ('RandomSpeedOffset_1416', 25),
 

In [None]:
veremi = np.memmap('veremi.memmap', dtype='float32', mode='w+', shape=(BSM_SIZE, N_FEATURES))
# filenames = filenames[:1000]

veremi_pos = 0
for filepath in (pbar := tqdm(filenames)):
    f = open(filepath)
    path_parts = filepath.split('/')
    filename = path_parts[-1]
    dataset_name = path_parts[-3]
    receiver = int(filename.split('-')[1])
    dataset_id = datasets[dataset_name]
    pbar.set_description("Vehicle ID {}".format(receiver))
    gps = None
    for line in f:
        data = json.loads(line)
        if data['type'] == 2:
            gps = data
        elif data['type'] == 3:
            attack_type = attacker_sender_map.get(f'{dataset_name}:{data["sender"]}', None)
            if attack_type is not None:
                out = (
                    dataset_id,
                    data['messageID'],
                    data['rcvTime'],
                    receiver,
                    data['sender'],
                    attack_type,
                    data['pos'][0],
                    data['pos'][1],
                    data['spd'][0],
                    data['spd'][1],
                    data['acl'][0],
                    data['acl'][1],
                    data['hed'][0],
                    data['hed'][1],
                    gps['pos'][0],
                    gps['pos'][1],
                    gps['spd'][0],
                    gps['spd'][1],
                    gps['acl'][0],
                    gps['acl'][1],
                    gps['hed'][0],
                    gps['hed'][1],
                )
                veremi[veremi_pos] = out
                veremi_pos += 1

  0%|          | 0/134082 [00:00<?, ?it/s]

In [None]:
veremi = np.memmap('veremi.memmap', dtype='float32', mode='r', shape=(BSM_SIZE, N_FEATURES))

df = pd.DataFrame(veremi, columns=[
    'dataset_id',
    'message_id',
    'bsm_rcv_time',
    'receiver',
    'sender',
    'attack_type',
    'snd_pos_x',
    'snd_pos_y',
    'snd_spd_x',
    'snd_spd_y',
    'snd_acl_x',
    'snd_acl_y',
    'snd_hed_x',
    'snd_hed_y',
    'rcv_pos_x',
    'rcv_pos_y',
    'rcv_spd_x',
    'rcv_spd_y',
    'rcv_acl_x',
    'rcv_acl_y',
    'rcv_hed_x',
    'rcv_hed_y',
])
# df.drop(df[df.attack_type.between(1, 9)].index, inplace=True) # drop faults (1 - 9)
df.drop(df[df.receiver == df.sender].index, inplace=True) # drop bad registers

df.sort_values(['dataset_id', 'sender', 'receiver', 'bsm_rcv_time'], inplace=True)
df.reset_index(inplace=True, drop=True)

# Calc delta_time based on the diff between the messages
def calc_delta(x):
    return pd.Series(np.insert(np.diff(x.bsm_rcv_time), 0, 0))

grouped = df[['dataset_id',
              'sender',
              'receiver',
              'bsm_rcv_time']].groupby(['dataset_id',
                                        'sender',
                                        'receiver'])

indexes = grouped.size().index

for k in tqdm(indexes):
    t = grouped.get_group(k)
    t = t.sort_values('bsm_rcv_time')
    delta_time = calc_delta(t)
    df.loc[t.index, 'delta_time'] = delta_time.values

df.to_feather('out_veremi/veremi.feather')
df.head(50)

In [None]:
df = pd.read_feather('out_veremi/veremi.feather')

random_state = 42

# group by sender and receiver
# TODO: group only by sender
grouped = df.groupby(['dataset_id', 'sender', 'receiver', 'attack_type'])
interactions = grouped.size().rename('seq_len').reset_index()

train = interactions.sample(frac=0.8, random_state=random_state)
test = interactions.drop(train.index)

train_full_genuine = train[train.attack_type == 0]

sample_size_5_pctg = round(len(train_full_genuine) / 0.95 - len(train_full_genuine))
train_5_pctg_atk = train[train.attack_type != 0].sample(sample_size_5_pctg, random_state=random_state)
train_95_genuine = pd.concat([train_full_genuine, train_5_pctg_atk]).sample(frac=1, random_state=random_state) # concat and shuffle

sample_size_10_pctg = round(len(train_full_genuine) / 0.9 - len(train_full_genuine))
train_10_pctg_atk = train[train.attack_type != 0].sample(sample_size_10_pctg, random_state=random_state)
train_90_genuine = pd.concat([train_full_genuine, train_10_pctg_atk]).sample(frac=1, random_state=random_state) # concat and shuffle

del train_10_pctg_atk
del train_5_pctg_atk
del interactions
del grouped

print(f'Train: {len(train)}, Train 0%: {len(train_full_genuine)}, Train 5%: {len(train_95_genuine)}, Train 10%: {len(train_90_genuine)}, Test: {len(test)}')

In [None]:
cols = ['dataset_id', 'sender', 'receiver', 'attack_type']

test_df = df.join(test.set_index(cols), how='inner', on=cols)
test_df.reset_index(drop=True).to_feather('out_veremi/test.feather')
del test_df
del test

train_df = df.join(train.set_index(cols), how='inner', on=cols)
train_df.reset_index(drop=True).to_feather('out_veremi/train.feather')
del train_df
del train

train_full_genuine_df = df.join(train_full_genuine.set_index(cols), how='inner', on=cols)
train_full_genuine_df.reset_index(drop=True).to_feather('out_veremi/train_full_genuine.feather')
del train_full_genuine_df
del train_full_genuine

train_95_genuine_df = df.join(train_95_genuine.set_index(cols), how='inner', on=cols)
train_95_genuine_df.reset_index(drop=True).to_feather('out_veremi/train_95_genuine.feather')
del train_95_genuine_df
del train_95_genuine

train_90_genuine_df = df.join(train_90_genuine.set_index(cols), how='inner', on=cols)
train_90_genuine_df.reset_index(drop=True).to_feather('out_veremi/train_90_genuine.feather')
del train_90_genuine_df
del train_90_genuine