In [1]:
import json
import math
import glob
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import OrderedDict

In [2]:
# Shape of the float32 memmap that stores the parsed BSM rows (rows x columns).
BSM_SIZE = 45748391  # 1242779
N_FEATURES = 22

# Folder that is searched recursively for trace files.
root_dir = '/home/jovyan/work/VeReMi/'  # DoSRandom_1416/'

# Any trace whose path contains one of these names is left out.
remove_datasets = [
    'MixAll',
    'EventualStop',
    'ConstPos',
    'ConstPosOffset',
    'RandomPos',
    'RandomPosOffset',
    'ConstSpeed',
    'ConstSpeedOffset',
    'RandomSpeed',
    'RandomSpeedOffset',
    'DelayedMessages',
]

# Collect every traceJSON*.json below root_dir, then keep only the paths that
# mention none of the excluded dataset names (plain substring match on the path).
filenames = [
    trace_path
    for trace_path in glob.glob(root_dir + '**/traceJSON*.json', recursive=True)
    if not any(excluded in trace_path for excluded in remove_datasets)
]

In [3]:
# Disabled cell, kept for reference: it walks every selected trace file and
# counts the lines that contain '"type":3'.
# NOTE(review): presumably this count is where BSM_SIZE comes from - confirm
# before changing either value.
# # Count the number of BSM messages in the dataset
# line_num = 0
# for filepath in tqdm(filenames):
#     f = open(filepath)
#     for line in f:
#         if '"type":3' in line:
#             line_num += 1
    
# line_num

In [4]:
# '<dataset name>:<vehicle id>' -> attacker type parsed from the trace file name.
attacker_sender_map = {}

# dataset name -> numeric id, numbered in the order the datasets are first seen.
datasets = OrderedDict()

for filepath in filenames:
    # The dataset name is the directory two levels above the trace file.
    segments = filepath.split('/')
    trace_name = segments[-1]
    dataset_name = segments[-3]

    # The file name is dash-separated: the second field is the vehicle id and
    # the fourth is the attacker tag (one prefix character, then a number).
    _, vehicle, _, attacker = trace_name.split('-')[:4]
    attacker_type = int(attacker[1:])
    sender_key = f'{dataset_name}:{int(vehicle)}'
    attacker_sender_map[sender_key] = attacker_type

    # len(datasets) is evaluated before insertion, so a new name receives the
    # next free id and an already-known name keeps the id it has.
    datasets.setdefault(dataset_name, len(datasets))

datasets

OrderedDict([('DataReplaySybil_0709', 0),
             ('DoSRandomSybil_1416', 1),
             ('DoSRandom_0709', 2),
             ('DataReplay_0709', 3),
             ('DataReplay_1416', 4),
             ('GridSybil_0709', 5),
             ('DoS_0709', 6),
             ('Disruptive_1416', 7),
             ('DoS_1416', 8),
             ('DoSDisruptiveSybil_0709', 9),
             ('DoSRandom_1416', 10),
             ('DoSRandomSybil_0709', 11),
             ('Disruptive_0709', 12),
             ('GridSybil_1416', 13),
             ('DoSDisruptive_0709', 14),
             ('DoSDisruptive_1416', 15),
             ('DataReplaySybil_1416', 16),
             ('DoSDisruptiveSybil_1416', 17)])

In [23]:
# Disabled cell, kept for reference: it (re)creates 'veremi_atk.memmap' from the
# trace files. For each file it remembers the most recent type-2 record as `gps`
# and, for every type-3 record whose sender is present in attacker_sender_map,
# writes one 22-value row: dataset id, message fields, the sender's kinematics
# from the message and the receiver's kinematics from `gps`.
# NOTE(review): rows are stored as float32, which cannot represent every integer
# above 2**24 exactly - confirm that messageID/sender values stay below that.
# NOTE(review): `gps` is None until a type-2 record has been read; a type-3 line
# appearing first in a file would fail on gps['pos'].
# NOTE(review): each file is opened with open() and never closed explicitly.
# veremi = np.memmap('veremi_atk.memmap', dtype='float32', mode='w+', shape=(BSM_SIZE, N_FEATURES))
# # filenames = filenames[:1000]

# veremi_pos = 0
# for filepath in (pbar := tqdm(filenames)):
#     f = open(filepath)
#     path_parts = filepath.split('/')
#     filename = path_parts[-1]
#     dataset_name = path_parts[-3]
#     receiver = int(filename.split('-')[1])
#     dataset_id = datasets[dataset_name]
#     pbar.set_description("Vehicle ID {}".format(receiver))
#     gps = None
#     for line in f:
#         data = json.loads(line)
#         if data['type'] == 2:
#             gps = data
#         elif data['type'] == 3:
#             attack_type = attacker_sender_map.get(f'{dataset_name}:{data["sender"]}', None)
#             if attack_type is not None:
#                 out = (
#                     dataset_id,
#                     data['messageID'],
#                     data['rcvTime'],
#                     receiver,
#                     data['sender'],
#                     attack_type,
#                     data['pos'][0],
#                     data['pos'][1],
#                     data['spd'][0],
#                     data['spd'][1],
#                     data['acl'][0],
#                     data['acl'][1],
#                     data['hed'][0],
#                     data['hed'][1],
#                     gps['pos'][0],
#                     gps['pos'][1],
#                     gps['spd'][0],
#                     gps['spd'][1],
#                     gps['acl'][0],
#                     gps['acl'][1],
#                     gps['hed'][0],
#                     gps['hed'][1],
#                 )
#                 veremi[veremi_pos] = out
#                 veremi_pos += 1

  0%|          | 0/51831 [00:00<?, ?it/s]

In [5]:
# Names of the 22 columns of a memmap row, in storage order.
BSM_COLUMNS = [
    'dataset_id', 'message_id', 'bsm_rcv_time', 'receiver', 'sender', 'attack_type',
    # kinematics reported by the sender
    'snd_pos_x', 'snd_pos_y', 'snd_spd_x', 'snd_spd_y',
    'snd_acl_x', 'snd_acl_y', 'snd_hed_x', 'snd_hed_y',
    # kinematics of the receiver
    'rcv_pos_x', 'rcv_pos_y', 'rcv_spd_x', 'rcv_spd_y',
    'rcv_acl_x', 'rcv_acl_y', 'rcv_hed_x', 'rcv_hed_y',
]

# Open the pre-built memmap read-only and expose it as a DataFrame.
veremi = np.memmap('veremi_atk.memmap', dtype='float32', mode='r', shape=(BSM_SIZE, N_FEATURES))
df = pd.DataFrame(veremi, columns=BSM_COLUMNS)

# df.drop(df[df.attack_type.between(1, 9)].index, inplace=True) # drop faults (1 - 9)

# Drop bad registers: rows whose receiver equals the sender.
df = df[df.receiver != df.sender]

# Correction, see ITA (disabled)
# df.loc[(df.attack_type == 12) & ((df.snd_spd_x != 0) | (df.snd_spd_y != 0)), 'attack_type'] = 0

# Test: keep one row per message_id, order the messages and renumber the rows.
# message_id becomes the group index and is discarded by reset_index(drop=True).
# NOTE(review): the de-duplication keys on message_id alone, so rows from
# different datasets that share an id collapse into one - confirm this is intended.
df = (
    df.groupby('message_id')
      .first()
      .sort_values(['dataset_id', 'sender', 'bsm_rcv_time', 'receiver'])
      .reset_index(drop=True)
)

# Time elapsed between consecutive messages of one group.
def calc_delta(x):
    """Return the gaps between consecutive ``bsm_rcv_time`` values of ``x``.

    The result is a new Series (fresh default index) with one more element than
    there are gaps: a leading 0 stands in for the first message, which has no
    predecessor.
    """
    gaps = np.diff(x.bsm_rcv_time)
    padded = np.insert(gaps, 0, 0)
    return pd.Series(padded)

# Split every (dataset_id, sender) message stream into bursts: `time_id` starts
# at 0 and increases by one each time two consecutive messages of the same
# stream are more than MAX_GAP apart in bsm_rcv_time.
#
# This is computed with grouped, vectorised operations rather than one
# get_group() / .loc assignment per stream, which required a Python-level
# iteration for every one of the tens of thousands of streams.
STREAM_KEYS = ['dataset_id', 'sender']
MAX_GAP = 1  # expressed in the units of bsm_rcv_time

# Order by receive time so diff() compares consecutive messages. The stable sort
# keeps the current order of equal timestamps, and the original index is kept so
# the result aligns with df when it is assigned back.
ordered = df[STREAM_KEYS + ['bsm_rcv_time']].sort_values('bsm_rcv_time', kind='stable')

# diff() is NaN for the first message of a stream and NaN > MAX_GAP is False,
# so every stream starts in burst 0.
starts_new_burst = (
    ordered.groupby(STREAM_KEYS, sort=False)['bsm_rcv_time'].diff() > MAX_GAP
)

# Running count of burst starts inside each stream; cast to int first so the
# cumulative sum never depends on how boolean input is accumulated.
burst_id = (
    starts_new_burst.astype('int64')
    .groupby([ordered[key] for key in STREAM_KEYS], sort=False)
    .cumsum()
)

# Kept as float64, the dtype this column had when it was filled group by group.
df['time_id'] = burst_id.astype('float64')
del ordered, starts_new_burst, burst_id

# delta_time (see calc_delta) is intentionally not computed.

# to_feather does not create missing directories, so make sure the target exists.
os.makedirs('out_veremi', exist_ok=True)
df.to_feather('out_veremi/veremi-atk.feather')
df.head(50)

  0%|          | 0/46034 [00:00<?, ?it/s]

Unnamed: 0,dataset_id,bsm_rcv_time,receiver,sender,attack_type,snd_pos_x,snd_pos_y,snd_spd_x,snd_spd_y,snd_acl_x,...,snd_hed_y,rcv_pos_x,rcv_pos_y,rcv_spd_x,rcv_spd_y,rcv_acl_x,rcv_acl_y,rcv_hed_x,rcv_hed_y,time_id
0,0.0,25212.603516,45.0,9.0,0.0,257.602203,141.911057,-2.410089,14.316008,-0.066441,...,0.989303,265.827545,46.203983,-0.225153,2.17882,-0.211896,2.050527,-0.10279,0.994703,0.0
1,0.0,25213.603516,45.0,9.0,0.0,255.154266,156.202408,-2.566737,14.20993,0.153161,...,0.987495,268.693298,49.799755,-0.443232,4.275897,-0.236263,2.279253,-0.101413,0.994844,0.0
2,0.0,25214.603516,45.0,9.0,0.0,252.798218,170.49411,-2.342911,14.25425,0.036026,...,0.991032,268.047882,55.328945,-0.671702,6.479974,-0.173019,1.669123,-0.10136,0.99485,0.0
3,0.0,25215.603516,45.0,9.0,0.0,250.514755,184.825287,-2.3528,14.315076,0.011439,...,0.991249,270.147308,63.315498,-1.120083,8.541922,-0.314273,2.396685,-0.128275,0.991739,0.0
4,0.0,25216.603516,45.0,9.0,0.0,248.169205,199.165024,-2.079366,14.36093,-0.079467,...,0.994338,268.599731,72.921837,-1.636185,10.526543,-0.295651,1.902098,-0.151856,0.988403,0.0
5,0.0,25217.603516,45.0,9.0,0.0,246.022827,213.485306,-2.108613,14.352053,-0.075982,...,0.994114,266.938568,84.632545,-1.753861,12.601857,-0.266542,1.915159,-0.134291,0.990942,0.0
6,0.0,25218.603516,45.0,9.0,0.0,243.987335,227.85231,-2.116579,14.373932,-0.066674,...,0.994262,264.539948,98.215652,-2.562923,14.168457,0.028669,-0.158475,-0.174469,0.984663,0.0
7,0.0,25219.603516,45.0,9.0,0.0,241.758118,242.185226,-2.15586,14.337243,-0.001513,...,0.993931,262.383179,112.406685,-2.324593,14.166652,-0.037018,0.225608,-0.158382,0.987378,0.0
8,0.0,25220.603516,45.0,9.0,0.0,239.467636,256.520111,-2.156315,14.340267,0.031683,...,0.993931,260.31073,126.590225,-2.263361,14.161423,0.094545,-0.59154,-0.151306,0.988487,0.0
9,0.0,25221.603516,45.0,9.0,0.0,237.29158,270.856812,-2.154088,14.325277,0.025443,...,0.993931,257.96225,140.796707,-2.363964,14.146927,0.074231,-0.444214,-0.156658,0.987653,0.0


In [6]:
df = pd.read_feather('out_veremi/veremi-atk.feather')

random_state = 42

# One "interaction" is the set of messages sharing dataset, sender, attack type
# and time_id; seq_len is the number of messages it contains.
#grouped = df.groupby(['dataset_id', 'sender', 'receiver', 'attack_type'])
grouped = df.groupby(['dataset_id', 'sender', 'attack_type', 'time_id'])
interactions = grouped.size().rename('seq_len').reset_index()

# 80/20 split at interaction level: test is whatever train did not sample.
train = interactions.sample(frac=0.8, random_state=random_state)
test = interactions.drop(train.index)

# attack_type 0 is treated as genuine traffic; everything else is the attack pool.
is_genuine = train.attack_type == 0
train_full_genuine = train[is_genuine]
attack_pool = train[~is_genuine]


def _genuine_with_attacks(genuine_fraction):
    """Return all genuine training interactions plus sampled attacks, shuffled.

    The number of attacks is chosen so that genuine interactions make up
    ``genuine_fraction`` of the returned frame.
    """
    n_genuine = len(train_full_genuine)
    n_attacks = round(n_genuine / genuine_fraction - n_genuine)
    sampled_attacks = attack_pool.sample(n_attacks, random_state=random_state)
    combined = pd.concat([train_full_genuine, sampled_attacks])
    return combined.sample(frac=1, random_state=random_state)  # shuffle


train_95_genuine = _genuine_with_attacks(0.95)
train_90_genuine = _genuine_with_attacks(0.9)

# Release the intermediates that are no longer needed.
del attack_pool, is_genuine
del interactions
del grouped

print(f'Train: {len(train)}, Train 0%: {len(train_full_genuine)}, Train 5%: {len(train_95_genuine)}, Train 10%: {len(train_90_genuine)}, Test: {len(test)}')

Train: 255662, Train 0%: 212880, Train 5%: 224084, Train 10%: 236533, Test: 63916


In [7]:
# Key columns that identify an interaction in both df and the split frames.
# cols = ['dataset_id', 'sender', 'receiver', 'attack_type']
cols = ['dataset_id', 'sender', 'attack_type', 'time_id']


def _export_split(split, path):
    """Write to ``path`` every message of ``df`` whose interaction is in ``split``.

    The inner join on ``cols`` keeps only matching messages and carries over the
    remaining columns of ``split``; the joined frame is local, so it is released
    as soon as the file has been written.
    """
    messages = df.join(split.set_index(cols), how='inner', on=cols)
    messages.reset_index(drop=True).to_feather(path)


# Each split frame is deleted right after its file has been written.
_export_split(test, 'out_veremi/test-atk.feather')
del test

_export_split(train, 'out_veremi/train-atk.feather')
del train

_export_split(train_full_genuine, 'out_veremi/train_full_genuine-atk.feather')
del train_full_genuine

_export_split(train_95_genuine, 'out_veremi/train_95_genuine-atk.feather')
del train_95_genuine

_export_split(train_90_genuine, 'out_veremi/train_90_genuine-atk.feather')
del train_90_genuine