In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report


def load_dataset(path):
    """Read a dataset CSV and add the derived columns used by the notebook.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain the
        columns ``Timestamp``, ``Arbitration_ID`` and ``Class``.

    Returns
    -------
    pandas.DataFrame
        The parsed rows plus these derived columns:

        * ``abstime``   - ``Timestamp`` interpreted as seconds and converted
          to a datetime, rounded to microseconds.
        * ``monotime``  - ``Timestamp`` minus the smallest ``Timestamp``.
        * ``aid_int``   - ``Arbitration_ID`` parsed as a base-16 integer.
        * ``y``         - ``Class`` mapped to 0 (``'Normal'``) / 1 (``'Attack'``);
          any other label becomes NaN.
        * ``Timedelta`` - difference to the previous ``Timestamp`` that has the
          same ``Arbitration_ID`` (NaN for the first row of each ID).

    Raises
    ------
    AssertionError
        If any cell is missing or ``Timestamp`` is not monotonically
        increasing.
    """
    # parse the dataset
    # Arbitration_ID is read as text on purpose: if every ID in a file happens
    # to look numeric (e.g. "100" or "1E3"), type inference would produce an
    # int/float column and int(x, 16) below would raise TypeError.
    df = pd.read_csv(path, dtype={'Arbitration_ID': str})

    ################ distilled from notebook 0 ################
    # check the integrity
    assert not df.isna().any().any(), 'There is at least one missing value.'
    assert df['Timestamp'].is_monotonic_increasing, 'Timestamp is not sorted.'

    # type-cast
    df['abstime'] = pd.to_datetime(df['Timestamp'], unit='s').round('us')
    df['monotime'] = df['Timestamp'] - df['Timestamp'].min()
    df['aid_int'] = df['Arbitration_ID'].map(lambda x: int(x, 16))
    df['y'] = df['Class'].map({'Normal': 0, 'Attack': 1})

    ################ distilled from notebook 1 ################
    # calculate the stream-wise timedelta
    df['Timedelta'] = df.groupby('Arbitration_ID')['Timestamp'].diff()

    return df


# Load both submission CSVs through load_dataset(); the paths are relative,
# so they resolve against the notebook's current working directory.
df_submit1 = load_dataset('0_Preliminary/1_Submission/Pre_submit_D.csv')
df_submit2 = load_dataset('0_Preliminary/1_Submission/Pre_submit_S.csv')

Implement the `detect()` function. Then, submit the detection result!

In [3]:
def detect(df):
    """Predict a label for every row and summarise the outcome per 10 ms window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``abstime`` datetime column and a ``y`` label column.
        The caller's frame is not modified; the function works on a copy.

    Returns
    -------
    tuple of pandas.Series
        ``(y, y_predicted)``. Both are indexed by ``abstime`` rounded up to
        the next 10 ms boundary and hold the maximum value seen in that window.
        Until a detection routine is filled in, ``y_predicted`` is all zeros.
    """
    # Keep the working copy under the name ``df`` so the routine inserted
    # below can write its predictions straight into df['y_predicted'].
    df = df.copy()
    df['y_predicted'] = 0

    ########## implement your detection routine here ##########

    ###########################################################

    # Bucket the rows by the 10 ms boundary at or after their timestamp, then
    # take the per-bucket maximum of the true and the predicted labels.
    windows = df.groupby(df['abstime'].dt.ceil('10ms'))
    truth_per_window = windows['y'].max()
    predicted_per_window = windows['y_predicted'].max()
    return truth_per_window, predicted_per_window


def submit(*results):
    """Print the evaluation for each submission set and for all sets combined.

    Parameters
    ----------
    *results : tuple
        One ``(y, y_predicted)`` pair per submission set. The pair is
        unpacked positionally into ``confusion_matrix`` and
        ``classification_report``; element 0 and element 1 are concatenated
        across sets for the overall report.

    Raises
    ------
    AssertionError
        If the concatenated true and predicted labels differ in length.
    """
    def _print_scores(*labels):
        # Confusion matrix first, then the per-class report with 4 decimals.
        print(confusion_matrix(*labels))
        print(classification_report(*labels, zero_division=0, digits=4))

    # Per-set reports, numbered from 1 in the order the sets were passed in.
    for set_number, pair in enumerate(results, start=1):
        print(f'*** For submission set {set_number} ***')
        _print_scores(*pair)

    # Combined report over every set.
    print('*** Overall result ***')
    truth_all = pd.concat([pair[0] for pair in results])
    predicted_all = pd.concat([pair[1] for pair in results])
    assert truth_all.shape[0] == predicted_all.shape[0], 'Error. Num of record mismatch.'
    _print_scores(truth_all, predicted_all)


# Run the detector on each submission set; detect() returns a
# (y, y_predicted) pair for each one.
result1 = detect(df_submit1)
result2 = detect(df_submit2)

# Print the per-set and the combined confusion matrix / classification report.
submit(result1, result2)

*** For submission set 1 ***
[[39977     0]
 [34810     0]]
              precision    recall  f1-score   support

           0     0.5345    1.0000    0.6967     39977
           1     0.0000    0.0000    0.0000     34810

    accuracy                         0.5345     74787
   macro avg     0.2673    0.5000    0.3483     74787
weighted avg     0.2857    0.5345    0.3724     74787

*** For submission set 2 ***
[[33953     0]
 [30912     0]]
              precision    recall  f1-score   support

           0     0.5234    1.0000    0.6872     33953
           1     0.0000    0.0000    0.0000     30912

    accuracy                         0.5234     64865
   macro avg     0.2617    0.5000    0.3436     64865
weighted avg     0.2740    0.5234    0.3597     64865

*** Overall result ***
[[73930     0]
 [65722     0]]
              precision    recall  f1-score   support

           0     0.5294    1.0000    0.6923     73930
           1     0.0000    0.0000    0.0000     65722

    accu