In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

In [2]:
import pandas as pd
from numpy.lib.stride_tricks import as_strided
from pandarallel import pandarallel
import numpy as np
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Preprocessing (Run only 1 time)

In [3]:
def fill_flag(sample):
    if not isinstance(sample['Flag'], str):
        col = 'Data' + str(sample['DLC'])
        sample['Flag'], sample[col] = sample[col], sample['Flag']
    return sample

In [4]:
def sliding_window(data, win=29, s=1):
    itemsize  = data.itemsize
    N = len(data)
    sliding_data = as_strided(data, shape=((N - win) // s + 1, win), strides=(itemsize*s, itemsize))
    return sliding_data

In [5]:
attacks = ['DoS', 'Fuzzy', 'gear', 'RPM']
attributes = ['Timestamp', 'canID', 'DLC', 
                           'Data0', 'Data1', 'Data2', 
                           'Data3', 'Data4', 'Data5', 
                           'Data6', 'Data7', 'Flag']
def preprocessing(attack_id, window_size, strided):
    filename = f'../../Data/Car-Hacking/{attacks[attack_id]}_dataset.csv'
    df = pd.read_csv(filename, header=None, names=attributes)
    df = df.parallel_apply(fill_flag, axis=1)
    num_data_bytes = 8
    for x in range(num_data_bytes):
        df['Data'+str(x)] = df['Data'+str(x)].map(lambda x: int(x, 16), na_action='ignore')
    df = df.fillna(0)
    data_cols = ['Data{}'.format(x) for x in range(num_data_bytes)]
    df[data_cols] = df[data_cols].astype(int) 
    df['Data'] = df[data_cols].values.tolist()
    df['Flag'] = (df['Flag'] == 'T')
    sliding_label = sliding_window(df['Flag'].to_numpy(), win=window_size, s=strided)
    sliding_data = sliding_window(df['Data'].to_numpy(), win=window_size, s=strided)
    labels = np.any(sliding_label, axis=1).astype('int8')
    pp_df = pd.DataFrame({
    'data_seq': pd.Series(sliding_data.tolist()),
    'label': pd.Series(sliding_label.tolist())
    }, index=range(len(sliding_label)))
    pp_df['data_histogram'] = pp_df['data_seq'].parallel_apply(lambda x: np.histogram(np.array(x), bins=256)[0])
    pp_df['label'] = pp_df['label'].parallel_apply(lambda x: (attack_id + 1) if any(x) else 0)
    return pp_df

In [6]:
df_list = []
for i, a in enumerate(attacks): 
    print(f'LABEL: {a}')
    df = preprocessing(i, window_size=30, strided=10)
    df_list.append(df)

LABEL: DoS
LABEL: Fuzzy
LABEL: gear
LABEL: RPM


In [7]:
df = pd.concat(df_list)

In [8]:
df

Unnamed: 0,data_seq,label,data_histogram
0,"[[5, 33, 104, 9, 33, 33, 0, 111], [254, 91, 0,...",0,"[108, 0, 2, 2, 1, 4, 0, 1, 7, 2, 0, 3, 1, 0, 0..."
1,"[[229, 127, 0, 0, 72, 127, 11, 172], [0, 0, 0,...",0,"[110, 1, 2, 1, 1, 3, 0, 1, 8, 1, 1, 2, 3, 0, 1..."
2,"[[64, 187, 127, 20, 17, 32, 0, 20], [0, 0, 0, ...",0,"[114, 1, 1, 2, 0, 3, 0, 0, 4, 1, 1, 1, 4, 3, 2..."
3,"[[11, 128, 0, 255, 69, 128, 12, 133], [14, 128...",0,"[96, 1, 3, 1, 0, 3, 0, 0, 6, 3, 1, 1, 3, 3, 2,..."
4,"[[5, 33, 104, 9, 33, 33, 0, 111], [64, 187, 12...",0,"[109, 0, 2, 2, 0, 4, 0, 0, 4, 3, 0, 0, 1, 3, 3..."
...,...,...,...
462163,"[[0, 32, 0, 0, 0, 0, 0, 0], [0, 64, 96, 255, 1...",0,"[115, 3, 1, 0, 0, 3, 3, 3, 3, 4, 1, 0, 0, 0, 0..."
462164,"[[0, 128, 0, 0, 48, 127, 6, 68], [0, 0, 0, 0, ...",0,"[107, 2, 2, 0, 0, 3, 1, 3, 2, 4, 2, 0, 0, 0, 0..."
462165,"[[0, 64, 96, 255, 126, 133, 9, 0], [255, 0, 0,...",0,"[108, 3, 1, 0, 0, 4, 0, 3, 5, 4, 2, 0, 0, 0, 1..."
462166,"[[5, 34, 28, 10, 34, 30, 0, 111], [254, 89, 0,...",0,"[110, 2, 2, 0, 2, 3, 0, 0, 6, 5, 2, 0, 0, 0, 1..."


In [9]:
new_df = pd.DataFrame([pd.Series(x) for x in df['data_histogram']])
new_df.columns = ['{}'.format(x+1) for x in new_df.columns]

In [10]:
new_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,247,248,249,250,251,252,253,254,255,256
0,108,0,2,2,1,4,0,1,7,2,...,0,0,0,0,0,0,0,0,2,8
1,110,1,2,1,1,3,0,1,8,1,...,0,0,0,0,0,0,0,0,2,9
2,114,1,1,2,0,3,0,0,4,1,...,0,0,0,0,0,0,0,0,1,9
3,96,1,3,1,0,3,0,0,6,3,...,0,0,0,0,0,0,0,0,2,10
4,109,0,2,2,0,4,0,0,4,3,...,0,0,0,0,0,0,0,0,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1656934,115,3,1,0,0,3,3,3,3,4,...,0,0,0,0,0,0,0,0,1,12
1656935,107,2,2,0,0,3,1,3,2,4,...,0,0,0,0,0,0,0,0,2,9
1656936,108,3,1,0,0,4,0,3,5,4,...,0,1,0,0,0,0,0,0,1,12
1656937,110,2,2,0,2,3,0,0,6,5,...,0,1,0,0,0,0,0,0,2,9


In [11]:
X = new_df.to_numpy()
y = df['label']

In [21]:
np.savez_compressed('../../Data/Car-Hacking/full_histogram.npz', X=X, y=y)

## Modeling

In [3]:
data = np.load('../../Data/Car-Hacking/full_histogram.npz')
X,  y = data['X'], data['y']

In [4]:
import faiss
class FaissKNeighbors:
    def __init__(self, k=5):
        self.index = None
        self.y = None
        self.k = k

    def fit(self, X, y):
        self.index = faiss.IndexFlatL2(X.shape[1])
        self.index.add(X.astype(np.float32))
        self.y = y

    def predict(self, X):
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        votes = self.y[indices]
        predictions = np.array([np.argmax(np.bincount(x)) for x in votes])
        return predictions

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from utils import cal_metric

In [6]:
sss = StratifiedShuffleSplit(n_splits=5, random_state=0)
ys = []
y_preds = []
total_results = {
    'fnr': np.zeros(5),
    'rec': np.zeros(5),
    'pre': np.zeros(5),
    'f1': np.zeros(5)
}
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    print('CV: ', i + 1)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    ys.append(y_test)
    # Prediction
    fast_knn = FaissKNeighbors(k=5)
    fast_knn.fit(X_train, y_train)
    y_pred = fast_knn.predict(X_test)
    y_preds.append(y_pred)
    ############
    cm, results = cal_metric(y_test, y_pred)
    print(results)
    for k, v in total_results.items():
        v += results[k]

CV:  1
{'fnr': array([0.00879808, 0.58000368, 1.13926994, 0.22710468, 0.16425121]), 'rec': array([0.99991202, 0.99419996, 0.9886073 , 0.99772895, 0.99835749]), 'pre': array([0.99720199, 0.99935221, 0.99984324, 1.        , 1.        ]), 'f1': array([0.99855517, 0.99676943, 0.99419352, 0.99886319, 0.99917807])}
CV:  2
{'fnr': array([0.00488782, 0.52476524, 1.19352089, 0.21126017, 0.16908213]), 'rec': array([0.99995112, 0.99475235, 0.98806479, 0.9978874 , 0.99830918]), 'pre': array([0.99721182, 0.99990746, 0.99968635, 1.        , 1.        ]), 'f1': array([0.99857959, 0.99732324, 0.9938416 , 0.99894258, 0.99915387])}
CV:  3
{'fnr': array([0.00391026, 0.39587553, 1.12376967, 0.22710468, 0.13043478]), 'rec': array([0.9999609 , 0.99604124, 0.9887623 , 0.99772895, 0.99869565]), 'pre': array([0.99748413, 0.99972279, 0.99992162, 1.        , 1.        ]), 'f1': array([0.99872098, 0.99787862, 0.99431065, 0.99886319, 0.9993474 ])}
CV:  4
{'fnr': array([0.00488782, 0.50635242, 1.40277455, 0.2376676

In [7]:
total_results = {k: v/5 for k, v in total_results.items()}
total_results


{'fnr': array([0.00527885, 0.50451114, 1.19507091, 0.22710468, 0.15845411]),
 'rec': array([0.99994721, 0.99495489, 0.98804929, 0.99772895, 0.99841546]),
 'pre': array([0.99722351, 0.99968548, 0.99984313, 1.        , 1.        ]),
 'f1': array([0.9985835 , 0.9973145 , 0.99391094, 0.99886318, 0.9992071 ])}