In [2]:
import json
import math
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import OrderedDict

In [3]:
# Number of BSM records in the dataset (already counted once, so it is hard-coded).
BSM_SIZE = 101_347_646
# Number of values stored per BSM row.
N_FEATURES = 22
root_dir = '/home/jovyan/work/VeReMi/'
# Every trace file below root_dir, at any depth. The list is intentionally left in
# the order glob returns it.
filenames = glob.glob(f'{root_dir}**/traceJSON*.json', recursive=True)

In [4]:
# Disabled one-off pass over every trace file that counts the lines containing
# '"type":3'. Presumably this is how BSM_SIZE was obtained -- verify before
# changing either one.
# NOTE(review): the file opened below is never closed explicitly; use
# `with open(filepath) as f:` if this cell is ever re-enabled.
# # Count the number of BSM messages in the dataset
# line_num = 0
# for filepath in tqdm(filenames):
#     f = open(filepath)
#     for line in f:
#         if '"type":3' in line:
#             line_num += 1
    
# line_num

In [5]:
# "<dataset_name>:<vehicle_id>" -> attacker type parsed from the trace file name.
attacker_sender_map = {}

# Dataset names in first-seen order. The placeholder values are replaced by
# integer ids once every file has been visited.
datasets = OrderedDict()

for filepath in filenames:
    # The dataset name is the directory two levels above the file itself.
    segments = filepath.split('/')
    dataset_name, filename = segments[-3], segments[-1]
    # '-'-separated name: the 2nd field is the vehicle id, the 4th is the
    # attacker tag (one leading character followed by the numeric type).
    _, vehicle, _, attacker = filename.split('-')[:4]
    attacker_type = int(attacker[1:])
    vehicle_id = int(vehicle)
    attacker_sender_map[f'{dataset_name}:{vehicle_id}'] = attacker_type
    datasets.setdefault(dataset_name, 1)

# Number the datasets in insertion order.
# NOTE(review): ids therefore depend on the order of `filenames`; anything that
# was stored using these ids is only valid for that same order -- confirm before
# sorting or filtering the file list.
for dataset_id, name in enumerate(list(datasets)):
    datasets[name] = dataset_id

datasets

OrderedDict([('ConstSpeed_0709', 0),
             ('EventualStop_1416', 1),
             ('DataReplaySybil_0709', 2),
             ('RandomSpeed_1416', 3),
             ('DoSRandomSybil_1416', 4),
             ('DoSRandom_0709', 5),
             ('DataReplay_0709', 6),
             ('DataReplay_1416', 7),
             ('RandomPos_1416', 8),
             ('ConstSpeed_1416', 9),
             ('RandomPos_0709', 10),
             ('GridSybil_0709', 11),
             ('ConstPos_1416', 12),
             ('RandomPosOffset_1416', 13),
             ('DoS_0709', 14),
             ('ConstPosOffset_1416', 15),
             ('Disruptive_1416', 16),
             ('DoS_1416', 17),
             ('DelayedMessages_0709', 18),
             ('ConstSpeedOffset_0709', 19),
             ('DoSDisruptiveSybil_0709', 20),
             ('ConstPos_0709', 21),
             ('MixAll_0024', 22),
             ('DoSRandom_1416', 23),
             ('RandomSpeed_0709', 24),
             ('RandomSpeedOffset_1416', 25),
 

In [6]:
# Disabled one-off conversion of the JSON traces into 'veremi.memmap'.
# For every type-3 record whose sender has an entry in attacker_sender_map it
# stores one float32 row of N_FEATURES values: dataset id, message id, receive
# time, receiver, sender, attack type, the record's own pos/spd/acl/hed (first
# two components each), and the same fields taken from the most recent type-2
# record read from the same file.
# NOTE(review): `gps` is still None if a type-3 record precedes the first type-2
# record in a file, which would raise a TypeError on gps['pos'].
# NOTE(review): the file opened below is never closed explicitly.
# NOTE(review): every value is stored as float32, so large integer ids and
# timestamps can lose precision -- confirm this is acceptable.
# veremi = np.memmap('veremi.memmap', dtype='float32', mode='w+', shape=(BSM_SIZE, N_FEATURES))
# # filenames = filenames[:1000]

# veremi_pos = 0
# for filepath in (pbar := tqdm(filenames)):
#     f = open(filepath)
#     path_parts = filepath.split('/')
#     filename = path_parts[-1]
#     dataset_name = path_parts[-3]
#     receiver = int(filename.split('-')[1])
#     dataset_id = datasets[dataset_name]
#     pbar.set_description("Vehicle ID {}".format(receiver))
#     gps = None
#     for line in f:
#         data = json.loads(line)
#         if data['type'] == 2:
#             gps = data
#         elif data['type'] == 3:
#             attack_type = attacker_sender_map.get(f'{dataset_name}:{data["sender"]}', None)
#             if attack_type is not None:
#                 out = (
#                     dataset_id,
#                     data['messageID'],
#                     data['rcvTime'],
#                     receiver,
#                     data['sender'],
#                     attack_type,
#                     data['pos'][0],
#                     data['pos'][1],
#                     data['spd'][0],
#                     data['spd'][1],
#                     data['acl'][0],
#                     data['acl'][1],
#                     data['hed'][0],
#                     data['hed'][1],
#                     gps['pos'][0],
#                     gps['pos'][1],
#                     gps['spd'][0],
#                     gps['spd'][1],
#                     gps['acl'][0],
#                     gps['acl'][1],
#                     gps['hed'][0],
#                     gps['hed'][1],
#                 )
#                 veremi[veremi_pos] = out
#                 veremi_pos += 1

  0%|          | 0/134082 [00:00<?, ?it/s]

In [7]:
# Column layout of one memmap row: ids, time and label first, then the
# sender-side (snd_*) and receiver-side (rcv_*) pos/spd/acl/hed x/y pairs.
bsm_columns = ['dataset_id', 'message_id', 'bsm_rcv_time', 'receiver', 'sender', 'attack_type']
bsm_columns += [
    f'{side}_{quantity}_{axis}'
    for side in ('snd', 'rcv')
    for quantity in ('pos', 'spd', 'acl', 'hed')
    for axis in ('x', 'y')
]

# Read-only view of the pre-built float32 matrix.
veremi = np.memmap('veremi.memmap', dtype='float32', mode='r', shape=(BSM_SIZE, N_FEATURES))
df = pd.DataFrame(veremi, columns=bsm_columns)

# Optional, currently disabled: also drop faults (attack types 1 - 9).
# df.drop(df[df.attack_type.between(1, 9)].index, inplace=True)

# Drop bad registers: rows whose receiver equals their sender.
self_addressed = df.receiver == df.sender
df.drop(index=df.index[self_addressed], inplace=True)

# Correction (see ITA): attack_type 12 rows that report a non-zero speed on
# either axis are relabelled as attack_type 0.
has_speed = (df.snd_spd_x != 0) | (df.snd_spd_y != 0)
df.loc[(df.attack_type == 12) & has_speed, 'attack_type'] = 0

# Order by dataset, sender, receive time and receiver, then renumber rows 0..n-1.
df = (
    df.sort_values(['dataset_id', 'sender', 'bsm_rcv_time', 'receiver'])
      .reset_index(drop=True)
)

def calc_delta(x):
    """Return the gaps between consecutive ``bsm_rcv_time`` values of ``x``.

    Args:
        x: Object exposing a ``bsm_rcv_time`` attribute, e.g. a DataFrame
            holding the messages of one group in the order to be differenced.

    Returns:
        pd.Series with a fresh 0..n-1 index: 0 for the first entry, then each
        value minus its predecessor.
    """
    gaps = np.diff(x.bsm_rcv_time)
    with_leading_zero = np.insert(gaps, 0, 0)
    return pd.Series(with_leading_zero)

# delta_time: time elapsed since the previous message of the same
# (dataset, sender, receiver) stream; the first message of a stream gets 0.
#
# A grouped diff handles every stream in one vectorised pass. Fetching the groups
# one by one with get_group() and writing each result back through df.loc is
# prohibitively slow with millions of streams.
stream_keys = ['dataset_id', 'sender', 'receiver']

# Order by receive time first so the result never depends on the row order of df.
# Only the four columns involved are copied, not the whole frame.
rcv_times = df[stream_keys + ['bsm_rcv_time']].sort_values('bsm_rcv_time', kind='mergesort')
gaps = rcv_times.groupby(stream_keys, sort=False)['bsm_rcv_time'].diff()

# diff() leaves NaN on the first row of each stream, which becomes 0 here
# (assumes bsm_rcv_time itself has no missing values -- TODO confirm).
# The assignment aligns on the index, so df must have unique row labels.
# float64 keeps the column dtype produced by filling a new column through df.loc.
df['delta_time'] = gaps.fillna(0).astype('float64')

del rcv_times, gaps

df.to_feather('out_veremi/veremi.feather')
df.head(50)

  0%|          | 0/4139311 [00:00<?, ?it/s]

Unnamed: 0,dataset_id,message_id,bsm_rcv_time,receiver,sender,attack_type,snd_pos_x,snd_pos_y,snd_spd_x,snd_spd_y,...,snd_hed_y,rcv_pos_x,rcv_pos_y,rcv_spd_x,rcv_spd_y,rcv_acl_x,rcv_acl_y,rcv_hed_x,rcv_hed_y,delta_time
0,0.0,30373.0,25212.603516,45.0,9.0,0.0,257.780304,141.935547,-2.409947,14.31615,...,0.996311,265.584961,46.20739,-0.225147,2.178826,-0.211875,2.050548,-0.099654,0.995022,0.0
1,0.0,30570.0,25213.603516,45.0,9.0,0.0,255.39151,156.225891,-2.566602,14.210065,...,0.995935,268.402649,49.804451,-0.443194,4.275935,-0.236231,2.279285,-0.099511,0.995036,1.0
2,0.0,30717.0,25214.603516,45.0,9.0,0.0,252.924088,170.524399,-2.342777,14.254384,...,0.997234,268.013977,55.340389,-0.67166,6.480017,-0.172973,1.669169,-0.099213,0.995066,1.0
3,0.0,30864.0,25215.603516,45.0,9.0,0.0,250.56929,184.848038,-2.352759,14.315117,...,0.997965,270.411774,63.335522,-1.120073,8.541932,-0.31423,2.396728,-0.126134,0.992013,1.0
4,0.0,31011.0,25216.603516,45.0,9.0,0.0,248.450958,199.198212,-2.079189,14.361108,...,0.999001,268.668121,72.928406,-1.636057,10.526671,-0.2956,1.902149,-0.146044,0.989278,1.0
5,0.0,35481.0,25217.603516,45.0,9.0,0.0,246.43045,213.512863,-2.108438,14.352227,...,0.998906,267.10318,84.64209,-1.753642,12.602077,-0.266491,1.915211,-0.130282,0.991477,1.0
6,0.0,35652.0,25218.603516,45.0,9.0,0.0,244.237335,227.877441,-2.116601,14.373911,...,0.999249,264.486786,98.224663,-2.562417,14.168962,0.028799,-0.158345,-0.170302,0.985392,1.0
7,0.0,35823.0,25219.603516,45.0,9.0,0.0,242.006561,242.208847,-2.155858,14.337245,...,0.999127,262.485565,112.421425,-2.324022,14.167223,-0.036879,0.225748,-0.150742,0.988573,1.0
8,0.0,35994.0,25220.603516,45.0,9.0,0.0,239.786972,256.541992,-2.156313,14.340269,...,0.999471,260.303955,126.605698,-2.262772,14.162011,0.094678,-0.591407,-0.13961,0.990207,1.0
9,0.0,36165.0,25221.603516,45.0,9.0,0.0,237.808563,270.891815,-2.154063,14.325304,...,0.999471,258.187378,140.805725,-2.363448,14.147443,0.07437,-0.444074,-0.145987,0.989286,1.0


In [8]:
df = pd.read_feather('out_veremi/veremi.feather')

random_state = 42


def _attack_quota(n_genuine, genuine_share):
    """Number of attack sequences to add so that genuine ones make up `genuine_share` of the mix."""
    return round(n_genuine / genuine_share - n_genuine)


def _mix_and_shuffle(genuine, attack_pool, n_attacks):
    """Append `n_attacks` rows sampled from `attack_pool` to `genuine` and shuffle the result."""
    picked = attack_pool.sample(n_attacks, random_state=random_state)
    return pd.concat([genuine, picked]).sample(frac=1, random_state=random_state)


# One row per (dataset, sender, attack type) sequence with its message count.
# Alternative key that also separates receivers:
# ['dataset_id', 'sender', 'receiver', 'attack_type']
grouped = df.groupby(['dataset_id', 'sender', 'attack_type'])
interactions = grouped.size().rename('seq_len').reset_index()

# 80/20 split at sequence level: test is whatever was not sampled for train.
train = interactions.sample(frac=0.8, random_state=random_state)
test = interactions.drop(train.index)

train_full_genuine = train[train.attack_type == 0]
train_attacks = train[train.attack_type != 0]

# Genuine sequences plus enough attack sequences to reach a 95/5 mix.
sample_size_5_pctg = _attack_quota(len(train_full_genuine), 0.95)
train_95_genuine = _mix_and_shuffle(train_full_genuine, train_attacks, sample_size_5_pctg)

# Same construction for a 90/10 mix.
sample_size_10_pctg = _attack_quota(len(train_full_genuine), 0.9)
train_90_genuine = _mix_and_shuffle(train_full_genuine, train_attacks, sample_size_10_pctg)

# Release the intermediates that are not needed afterwards.
del train_attacks
del interactions
del grouped

print(f'Train: {len(train)}, Train 0%: {len(train_full_genuine)}, Train 5%: {len(train_95_genuine)}, Train 10%: {len(train_90_genuine)}, Test: {len(test)}')

Train: 108392, Train 0%: 76228, Train 5%: 80240, Train 10%: 84698, Test: 27098


In [9]:
# Key that identifies one sequence. Alternative that also separates receivers:
# ['dataset_id', 'sender', 'receiver', 'attack_type']
cols = ['dataset_id', 'sender', 'attack_type']


def _export_split(split, path):
    """Write to `path` the rows of `df` whose `cols` key appears in `split`.

    The inner join also attaches the remaining columns of `split` to the
    selected rows; the saved frame gets a fresh 0..n-1 index.
    """
    selected = df.join(split.set_index(cols), how='inner', on=cols)
    selected.reset_index(drop=True).to_feather(path)


_export_split(test, 'out_veremi/test.feather')
del test

# Disabled export of the full training split:
# _export_split(train, 'out_veremi/train.feather')
# del train

_export_split(train_full_genuine, 'out_veremi/train_full_genuine.feather')
del train_full_genuine

# Disabled export of the 95% genuine mix:
# _export_split(train_95_genuine, 'out_veremi/train_95_genuine.feather')
# del train_95_genuine

# Disabled export of the 90% genuine mix:
# _export_split(train_90_genuine, 'out_veremi/train_90_genuine.feather')
# del train_90_genuine