In [1]:
import pandas as pd


def load_dataset(path):
    """Parse a dataset CSV and add the derived columns used by this notebook.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, handed to ``pd.read_csv``. The file must
        provide at least the columns ``Timestamp``, ``Arbitration_ID`` and
        ``Class``.

    Returns
    -------
    pandas.DataFrame
        The parsed rows plus these derived columns:

        * ``abstime``   - ``Timestamp`` interpreted as seconds and converted
          to a datetime, rounded to microseconds.
        * ``monotime``  - ``Timestamp`` minus the smallest ``Timestamp``.
        * ``aid_int``   - ``Arbitration_ID`` parsed as a base-16 integer.
        * ``y``         - 0 where ``Class`` is ``'Normal'``, 1 where it is
          ``'Attack'`` (any other value maps to NaN).
        * ``Timedelta`` - difference to the previous ``Timestamp`` of the
          same ``Arbitration_ID`` (NaN for the first row of each ID).

    Raises
    ------
    AssertionError
        If the file contains a missing value or ``Timestamp`` is not
        monotonically increasing.
    """
    # parse the dataset
    # Read the ID as text. If every ID in a file happens to consist of digits
    # only, dtype inference would turn the column into integers, dropping
    # leading zeros ('043' -> 43) and making the hex parsing below fail.
    df = pd.read_csv(path, dtype={'Arbitration_ID': str})

    ################ distilled from notebook 0 ################
    # check the integrity
    assert not df.isna().any().any(), 'There is at least one missing value.'
    assert df['Timestamp'].is_monotonic_increasing, 'Timestamp is not sorted.'

    # type-cast
    # `.dt.round` is the datetime accessor that takes a frequency string;
    # plain `Series.round` is documented for a numeric number of decimals.
    df['abstime'] = pd.to_datetime(df['Timestamp'], unit='s').dt.round('us')
    df['monotime'] = df['Timestamp'] - df['Timestamp'].min()
    df['aid_int'] = df['Arbitration_ID'].map(lambda x: int(x, 16))
    df['y'] = df['Class'].map({'Normal': 0, 'Attack': 1})

    ################ distilled from notebook 1 ################
    # calculate the signal-wise timedelta
    df['Timedelta'] = df.groupby('Arbitration_ID')['Timestamp'].diff()

    return df


# Load the two baseline captures (stationary first, then driving) with the same loader.
df_stationary, df_driving = (
    load_dataset(csv_path)
    for csv_path in (
        '0_Preliminary/0_Training/Pre_train_S_0.csv',
        '0_Preliminary/0_Training/Pre_train_D_0.csv',
    )
)

# Measurement of average time intervals

In [2]:
# Per-ID mean and standard deviation of the inter-arrival time ('Timedelta'),
# one pair of columns for each baseline capture, aligned on Arbitration_ID.
df_td = pd.concat([
    df_stationary.groupby('Arbitration_ID')['Timedelta'].agg(
        mean_stationary='mean', std_stationary='std'),
    df_driving.groupby('Arbitration_ID')['Timedelta'].agg(
        mean_driving='mean', std_driving='std'),
], axis=1)

# Absolute gap between the stationary and the driving statistics.
df_td['diff_mean'] = df_td['mean_stationary'].sub(df_td['mean_driving']).abs()
df_td['diff_std'] = df_td['std_stationary'].sub(df_td['std_driving']).abs()

# Allow up to 100 rows in rendered tables before pandas truncates them.
pd.set_option('display.max_rows', 100)
df_td

Unnamed: 0_level_0,mean_stationary,std_stationary,mean_driving,std_driving,diff_mean,diff_std
Arbitration_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
043,0.999217,0.000218,0.999216,0.000387,3.930685e-07,0.0001688715
07F,1.000004,0.000184,1.00001,0.000266,6.091149e-06,8.192734e-05
130,0.010006,9.7e-05,0.010006,0.000101,8.312616e-09,4.150771e-06
140,0.010006,9.7e-05,0.010006,0.000102,1.116033e-08,4.285145e-06
153,0.01,0.00011,0.01,0.000128,7.323767e-09,1.806494e-05
164,0.01,0.00019,0.01,0.000202,2.063438e-08,1.179226e-05
220,0.01,0.000442,0.01,0.000375,7.052806e-08,6.722216e-05
251,0.01,0.00023,0.01,0.000231,2.426022e-08,1.062299e-06
260,0.010001,0.000538,0.010001,0.000806,2.40768e-08,0.0002686824
2B0,0.01,0.00023,0.01,0.000207,1.906393e-08,2.310247e-05


# Determination of the thresholds

Initially, we try mean ± 3 × std of the driving-state time intervals as the lower and upper thresholds.

In [3]:
# Half-width of the acceptance band: three standard deviations of the driving capture.
band_halfwidth = 3 * df_td['std_driving']

# The band is centred on the driving-capture mean of each Arbitration_ID.
df_td['threshold_low'] = df_td['mean_driving'] - band_halfwidth
df_td['threshold_high'] = df_td['mean_driving'] + band_halfwidth

# Keep only the two threshold columns for joining onto other captures.
df_td_threshold = df_td.loc[:, ['threshold_low', 'threshold_high']]
df_td_threshold

Unnamed: 0_level_0,threshold_low,threshold_high
Arbitration_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
043,0.998056,1.000376
07F,0.999212,1.000807
130,0.009703,0.01031
140,0.009702,0.010311
153,0.009616,0.010385
164,0.009394,0.010606
220,0.008876,0.011124
251,0.009308,0.010692
260,0.007582,0.012419
2B0,0.00938,0.01062


In [4]:
# Load the capture to be scored and attach, row by row, the thresholds of its Arbitration_ID.
df_intrusion = (
    load_dataset('0_Preliminary/0_Training/Pre_train_D_1.csv')
    .join(df_td_threshold, on='Arbitration_ID')
)
df_intrusion

Unnamed: 0,Timestamp,Arbitration_ID,DLC,Data,Class,SubClass,abstime,monotime,aid_int,y,Timedelta,threshold_low,threshold_high
0,1.597760e+09,153,8,20 A1 10 FF 00 FF 50 1F,Normal,Normal,2020-08-18 14:08:30.125893,0.000000,339,0,,0.009616,0.010385
1,1.597760e+09,220,8,13 24 7F 60 05 FF BF 10,Normal,Normal,2020-08-18 14:08:30.126151,0.000258,544,0,,0.008876,0.011124
2,1.597760e+09,507,4,08 00 00 01,Normal,Normal,2020-08-18 14:08:30.126310,0.000417,1287,0,,0.099536,0.100472
3,1.597760e+09,356,8,00 00 00 80 16 00 00 00,Normal,Normal,2020-08-18 14:08:30.127247,0.001354,854,0,,0.007816,0.012184
4,1.597760e+09,340,8,FC 03 00 E4 B7 21 FA 3C,Normal,Normal,2020-08-18 14:08:30.127480,0.001587,832,0,,0.009383,0.010620
...,...,...,...,...,...,...,...,...,...,...,...,...,...
806385,1.597760e+09,366,7,3B 28 0B 3B 30 00 01,Normal,Normal,2020-08-18 14:13:34.980384,304.854491,870,0,0.010013,0.007998,0.012002
806386,1.597760e+09,367,8,00 00 00 00 05 00 00 00,Normal,Normal,2020-08-18 14:13:34.980630,304.854737,871,0,0.010013,0.009066,0.010935
806387,1.597760e+09,368,8,00 00 00 00 01 28 0B 42,Normal,Normal,2020-08-18 14:13:34.980872,304.854979,872,0,0.010011,0.009082,0.010918
806388,1.597760e+09,47F,8,04 7F FF FF 00 7B 00 26,Normal,Normal,2020-08-18 14:13:34.981116,304.855223,1151,0,0.019768,0.018728,0.021272


In [5]:
# Start with every row predicted as normal (0).
df_intrusion['y_predicted'] = 0

# A row is inside the band when threshold_low <= Timedelta <= threshold_high.
# Comparisons involving NaN are False, so rows with a missing Timedelta or
# missing thresholds are NOT inside the band and end up flagged.
inside_band = df_intrusion['Timedelta'].between(
    df_intrusion['threshold_low'], df_intrusion['threshold_high'])

# Rows outside the band are the detections; mark them as predicted attacks (1).
df_detected = df_intrusion[~inside_band]
df_intrusion.loc[df_detected.index, 'y_predicted'] = 1

In [6]:
# Bucket every row by its timestamp rounded up to the next 10 ms boundary.
abstime_ceil = df_intrusion['abstime'].dt.ceil('10ms')

# A bucket counts as positive if any row in it is positive (max over 0/1),
# for the true label and the predicted label alike.
per_window = df_intrusion.groupby(abstime_ceil)[['y', 'y_predicted']].max()
y = per_window['y']
y_predicted = per_window['y_predicted']

# Evaluation

Calculate the evaluation metrics (accuracy, precision, and recall) manually.

In [7]:
# Boolean masks over the windows for the true and the predicted label.
truly_normal, truly_attack = (y == 0), (y == 1)
predicted_normal, predicted_attack = (y_predicted == 0), (y_predicted == 1)

# The four confusion-matrix cells, each as a boolean mask.
tn, fp = (truly_normal & predicted_normal), (truly_normal & predicted_attack)
fn, tp = (truly_attack & predicted_normal), (truly_attack & predicted_attack)
correctly_classified = y.eq(y_predicted)

# Counts needed for the ratios below.
n_tp, n_fp, n_fn = tp.sum(), fp.sum(), fn.sum()

print('Accuracy = ', correctly_classified.sum() / len(y))
print('Precision = ', n_tp / (n_fp + n_tp))
print('Recall = ', n_tp / (n_fn + n_tp))

Accuracy =  0.6522124184078459
Precision =  0.48376259798432253
Recall =  1.0


Get the same scores with the `sklearn.metrics` module.

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Print accuracy, precision and recall (in that order) for the windowed labels.
for score_fn in (accuracy_score, precision_score, recall_score):
    print(score_fn(y, y_predicted))

0.6522124184078459
0.48376259798432253
1.0


Two more useful functions: `confusion_matrix` and `classification_report`.

In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# Build both summaries first, then print them in the same order as before.
confusion = confusion_matrix(y, y_predicted)
report_text = classification_report(y, y_predicted, digits=4)

print(confusion)
print(report_text)

[[ 9948 10603]
 [    0  9936]]
              precision    recall  f1-score   support

           0     1.0000    0.4841    0.6523     20551
           1     0.4838    1.0000    0.6521      9936

    accuracy                         0.6522     30487
   macro avg     0.7419    0.7420    0.6522     30487
weighted avg     0.8318    0.6522    0.6523     30487



How good is the performance of the baseline detection model?