In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.append(os.path.abspath('../../src/'))
sys.path.append(os.path.abspath('../../'))

In [2]:
import numpy as np
from preprocessing.train_test_split import train_test_split
import itertools
from pathlib import Path
import random

from src.preprocessing.train_test_split import Writer
from src.preprocessing.train_test_split import write_to_file

In [3]:
data_dir = '../../../Data/LISA/'
car_models = ['Kia', 'BMW', 'Tesla']
classes = ['Normal', 'Fuzzy', 'Replay']

## Generate train/val/test file

In [4]:
for cm, c in itertools.product(car_models, classes):
    file_name = Path(data_dir) / f'{cm}/{c}.npz'
    data = np.load(file_name)
    X, y = data['X'], data['y']
    print(f'Car: {cm} - Class {c} size = {len(X)}')
    indices_lists = train_test_split(len(X), test_fraction=0.2, val_fraction=0.1) 
    prefix = ['train', 'val', 'test']
    for prefix, indices in zip(prefix, indices_lists):
        print(f'{prefix} size: ', len(indices))
        X_subset = X[indices] 
        y_subset = y[indices]
        save_file = Path(data_dir) / f'{cm}/{prefix}_{c}.npz'
        # np.savez_compressed(save_file, X=X_subset, y=y_subset)

Car: Kia - Class Normal size = 80658
train size:  56462
val size:  8065
test size:  16131
Car: Kia - Class Fuzzy size = 35868
train size:  25109
val size:  3586
test size:  7173
Car: Kia - Class Replay size = 1282
train size:  898
val size:  128
test size:  256
Car: BMW - Class Normal size = 43194
train size:  30237
val size:  4319
test size:  8638
Car: BMW - Class Fuzzy size = 80938
train size:  56658
val size:  8093
test size:  16187
Car: BMW - Class Replay size = 38493
train size:  26946
val size:  3849
test size:  7698
Car: Tesla - Class Normal size = 157987
train size:  110592
val size:  15798
test size:  31597
Car: Tesla - Class Fuzzy size = 34960
train size:  24472
val size:  3496
test size:  6992
Car: Tesla - Class Replay size = 5998
train size:  4200
val size:  599
test size:  1199


# Generate train/test/folder

In [4]:
def resampling_data(car_model, in_dir, file_type, N_samples, attack_normal_ratio):
    """ 
    car_model: BMW, Tesla, Kia
    in_dir: directory for input data
    file_type: train, test, val
    N_samples: the size of total sampling data
    """
    in_dir = in_dir + '/{}/'  # to adapt with car_model
    in_path = Path(in_dir.format(car_model))
    classes = ['Normal', 'Fuzzy', 'Replay']
    def read_file(f):
        data = np.load(in_path / f)
        return data['X'], data['y']

    def sampling_data(d, indices):
        return d[indices]

    file_name = f'{file_type}_{{}}.npz'
    files = [file_name.format(c) for c in classes]
    data = list(map(lambda x: read_file(x), files))
    
    # Calculate the size of each class based on the attack/normal ratio
    class_distribution = np.array([1 - attack_normal_ratio, attack_normal_ratio / 2, attack_normal_ratio / 2])
    class_size = (N_samples * class_distribution).astype('int')
    # Sampling the indices according to the generated size
    indices = [np.arange(len(d[0])) for d in data]
    sampling_indices = [np.random.choice(idx, 
                                        size=size if size <= len(idx) else len(idx), 
                                        replace=False) 
                                        for idx, size in zip(indices, class_size)]
    # Take the data from sampling_indices
    X_subset = list(map(lambda p: sampling_data(p[0][0], p[1]), zip(data, sampling_indices)))
    y_subset = list(map(lambda p: sampling_data(p[0][1], p[1]), zip(data, sampling_indices)))
    return class_distribution, class_size, X_subset, y_subset

In [5]:
def create_folder_from_npz_file(car_model, in_dir, out_dir, dir_type, size, attack_normal_ratio):
    # Resample data
    class_dist, class_size, Xs, ys = resampling_data(car_model=car_model, in_dir=in_dir, 
                                    file_type=dir_type, N_samples=size, attack_normal_ratio=attack_normal_ratio)
    X = np.concatenate(Xs)
    y = np.concatenate(ys)
    print('Resample data info')
    print('Distribution: ', class_dist)
    print('Size: ', class_size)
    print('Total size: ', len(X))
    out_dir = out_dir + '/{}/'  # to adapt with car_model
    out_path = Path(out_dir.format(car_model))
    writer = Writer(outdir=out_path, type_name=f'{dir_type}_{size}')
    return write_to_file(writer, X, y)


In [11]:
in_dir = '../../../Data/LISA/'
out_dir = '../../../Data/LISA/'
car_model = ['Kia', 'BMW', 'Tesla']
for cm in car_model:
    print('Model: ', cm)
    # Generate the attack/normal ratio
    attack_normal_ratio = random.uniform(0.2, 0.3)
    create_folder_from_npz_file(car_model=cm, in_dir=in_dir, out_dir=out_dir, 
                                dir_type='train', size=2000, attack_normal_ratio=attack_normal_ratio)
    # create_folder_from_npz_file(car_model=cm, in_dir=in_dir, out_dir=out_dir, 
    #                             dir_type='test', size=10000, attack_normal_ratio=attack_normal_ratio)
    # create_folder_from_npz_file(car_model=cm, in_dir=in_dir, out_dir=out_dir, 
    #                             dir_type='val', size=10000, attack_normal_ratio=attack_normal_ratio)

Model:  Kia
Resample data info
Distribution:  [0.77713202 0.11143399 0.11143399]
Size:  [1554  222  222]
Total size:  1998
Start writing to:  ../../../Data/LISA/Kia/train_2000


1998it [00:01, 1948.30it/s]


Model:  BMW
Resample data info
Distribution:  [0.75365845 0.12317078 0.12317078]
Size:  [1507  246  246]
Total size:  1999
Start writing to:  ../../../Data/LISA/BMW/train_2000


1999it [00:00, 2015.21it/s]


Model:  Tesla
Resample data info
Distribution:  [0.74132924 0.12933538 0.12933538]
Size:  [1482  258  258]
Total size:  1998
Start writing to:  ../../../Data/LISA/Tesla/train_2000


1998it [00:01, 1580.74it/s]
