In [None]:
# Import necessary libraries and helper functions
import pandas as pd
import os
from helper_functions import make_can_df, add_time_diff_per_aid_col, add_actual_attack_col, add_kde_val_col, add_gauss_val_col, get_results_binning, unpickle
import json
import tqdm


# Loading and saving normal traffic

In [None]:
def load_data(directory, exclude=(), file_condition=lambda file_name: True):
    """
    Load every matching CAN log in ``directory`` into its own DataFrame.

    Parameters
    ----------
    directory : str
        Folder whose entries are listed with ``os.listdir``.
    exclude : iterable of str, optional
        Substrings; a file whose name contains any of them is skipped.
        The default is an immutable empty tuple rather than a shared list.
    file_condition : callable, optional
        Predicate on the file name; only names for which it returns a truthy
        value are loaded. Every file is accepted by default.

    Returns
    -------
    list
        One entry per loaded file, in ``os.listdir`` order: the output of
        ``make_can_df`` passed through ``add_time_diff_per_aid_col``.
    """
    # Materialise once so a one-shot iterable can be checked against every file
    exclude = tuple(exclude)

    print("Loading data from directory: " + directory)
    df_aggregation = []

    for file_name in os.listdir(directory):
        if not file_condition(file_name):
            continue
        if any(excl in file_name for excl in exclude):
            continue
        print("Loading file: " + file_name)
        df = make_can_df(os.path.join(directory, file_name))
        df_aggregation.append(add_time_diff_per_aid_col(df))

    return df_aggregation



def load_and_save_training_data(directory):
    """
    Return the training table (columns time, aid, time_diffs), cached as CSV.

    If ``training_data.csv`` already exists in ``directory`` it is read back
    and returned. Otherwise every file in ``directory`` whose name contains
    "dyno" is loaded with ``load_data``, the frames are concatenated, the
    three training columns are kept, and the result is written to
    ``training_data.csv`` in the same directory.

    Parameters
    ----------
    directory : str
        Folder with the capture files; a trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``aid`` and ``time_diffs``, in that order, whether
        the data was freshly built or read from the cache.

    Raises
    ------
    ValueError
        If no "dyno" file is found, so there is nothing to concatenate.
    """
    columns = ["time", "aid", "time_diffs"]
    # os.path.join keeps the path correct with or without a trailing slash
    cache_path = os.path.join(directory, 'training_data.csv')

    if os.path.exists(cache_path):
        print("Training data already exists. Loading training data from: " + cache_path)
        # usecols drops any index column stored by an older cache file, so the
        # cached result has the same columns as a freshly built one
        return pd.read_csv(cache_path, usecols=columns)[columns]

    df_aggregation = load_data(directory, file_condition=lambda file_name: "dyno" in file_name)
    if not df_aggregation:
        raise ValueError("No 'dyno' capture files found in: " + directory)

    # Concatenate all training datasets on the dyno
    df_training = pd.concat(df_aggregation, ignore_index=True)
    training_data = df_training[columns]
    print("Saving training data to: " + cache_path)
    # index=False so that reloading the cache does not add an 'Unnamed: 0' column
    training_data.to_csv(cache_path, index=False)
    return training_data

In [None]:
# Path is relative to the notebook's working directory (same convention as the
# '../data/attacks' path used for the attack captures) instead of an absolute
# path inside one user's home directory.
training_data = load_and_save_training_data('../data/ambient/')
display(training_data)


## "Train" model: calculating stats

In [None]:
from sklearn.covariance import EllipticEnvelope
import scipy.stats

def preprocess(df, aid):
    """
    Return the time_diffs of one arbitration ID with detected outliers dropped.

    An EllipticEnvelope is fitted on the ID's time_diffs (reshaped to a single
    feature column) and every sample it labels -1 is discarded. The sample
    count before and after, and the number and percentage removed, are printed.
    """
    samples = df[df.aid == aid].time_diffs.values
    print("before: ", len(samples))

    # label every sample; -1 marks an outlier
    envelope = EllipticEnvelope(contamination=0.0001, support_fraction=0.999)
    labels = envelope.fit_predict(samples.reshape(-1, 1))

    # count what gets dropped before filtering
    keep = labels != -1
    outliers = sum(keep == False)
    print("outliers: ", outliers, 100*outliers/len(samples))

    kept_samples = samples[keep]
    print("after: ", len(kept_samples))

    return kept_samples


def calculate_statistics(time_diffs):
    """
    Summarise the time_diffs of one arbitration ID.

    The returned dictionary holds the mean ('mu'), the standard deviation
    ('std'), a Gaussian KDE fitted to the samples ('kde'), a frozen normal
    distribution with that mean and standard deviation ('gauss'), and two
    empty dictionaries ('y_thresholds_kde' and 'y_thresholds_gauss').
    """
    aid_dict = {}
    aid_dict['mu'] = time_diffs.mean()
    aid_dict['std'] = time_diffs.std()
    aid_dict['kde'] = scipy.stats.gaussian_kde(time_diffs)
    aid_dict['gauss'] = scipy.stats.norm(loc=aid_dict['mu'], scale=aid_dict['std'])
    # threshold tables start out empty
    for threshold_key in ("y_thresholds_kde", "y_thresholds_gauss"):
        aid_dict[threshold_key] = {}
    return aid_dict


def calculate_statistics_for_each_aid(data):
    """
    Build the per-ID statistics table.

    For every distinct value in data['aid'], the ID's time_diffs are cleaned
    with preprocess and summarised with calculate_statistics. The result maps
    each aid to its statistics dictionary.
    """
    stats = {}
    for aid in data['aid'].unique():
        cleaned_time_diffs = preprocess(data, aid)
        stats[aid] = calculate_statistics(cleaned_time_diffs)
    return stats




In [None]:
# Compute the per-aid statistics dictionary from training_data and show it
training_data_stats = calculate_statistics_for_each_aid(training_data)
display(training_data_stats)

## Loading and Annotating Attack Data

In [None]:
def annotate_attack_data(attack_data, injection_intervals):
    """
    Label attack_data using every row of injection_intervals.

    Each row supplies 'aid', 'payload', 'start_time' and 'end_time'. The row's
    (start_time, end_time) pair is handed to add_actual_attack_col as a
    one-element interval list, and the frame it returns is the input for the
    next row. The frame produced by the last row is returned.
    """
    annotated = attack_data
    for _, injection in injection_intervals.iterrows():
        target_aid = injection['aid']
        injected_payload = injection['payload']
        window = (injection['start_time'], injection['end_time'])
        annotated = add_actual_attack_col(annotated, [window], target_aid, injected_payload)
    return annotated

def load_and_annotate_attack_data(directory, metadata_file):
    """
    Load the attack captures in ``directory`` and mark the injected frames.

    The metadata JSON maps a capture's base name (file name without its
    extension) to a record with ``injection_id`` (hex string, or "XXX"),
    ``injection_interval`` and ``injection_data_str``. Each capture is loaded
    and annotated in the same loop iteration, so a frame is always paired with
    its own metadata record.

    Skipped files:
    * names containing 'masquerade', 'accelerator', 'metadata' or
      ``metadata_file``;
    * files whose base name has no entry in the metadata (they cannot be
      annotated).

    Parameters
    ----------
    directory : str
        Folder holding the capture files and the metadata file.
    metadata_file : str
        Name of the JSON metadata file inside ``directory``.

    Returns
    -------
    list
        One annotated frame per capture, in ``os.listdir`` order, as returned
        by ``add_actual_attack_col``.
    """
    # Load the injection intervals from the metadata file
    with open(os.path.join(directory, metadata_file), "r") as read_file:
        attack_dict = json.load(read_file)

    skip_markers = ('masquerade', 'accelerator', 'metadata', metadata_file)

    print("Loading data from directory: " + directory)
    df_aggregation = []

    for file_name in os.listdir(directory):
        if any(marker in file_name for marker in skip_markers):
            continue

        # splitext handles any extension length, unlike slicing off 4 characters
        file_base = os.path.splitext(file_name)[0]
        if file_base not in attack_dict:
            continue

        metadata = attack_dict[file_base]
        # "XXX" is passed through unchanged; any other id is parsed as hexadecimal
        if metadata["injection_id"] != "XXX":
            injection_id = int(metadata["injection_id"], 16)
        else:
            injection_id = "XXX"

        print("Loading file: " + file_name)
        df = make_can_df(os.path.join(directory, file_name))
        df = add_time_diff_per_aid_col(df)

        # Add column to indicate attack (True) or non-attack (False) for each signal
        intervals = [tuple(metadata["injection_interval"])]
        df = add_actual_attack_col(df, intervals, injection_id, metadata["injection_data_str"])
        df_aggregation.append(df)

    return df_aggregation


In [None]:
# Load the attack captures and annotate them using capture_metadata.json
attack_data = load_and_annotate_attack_data('../data/attacks', 'capture_metadata.json')

In [None]:
# NOTE(review): the captures are collected in os.listdir order, so which file
# sits at index 2 depends on the directory listing — confirm it is the file
# named in the message below.
sample_capture = attack_data[2]
is_attack = sample_capture.actual_attack == True
is_benign = sample_capture.actual_attack == False

print("Sample attack data from 'reverse_light_on_attack_1.log':")
display(sample_capture.head())

print("\nHow many have an actual attack:")
print(len(sample_capture[is_attack]))

print("\nHow many do not have an actual attack:")
print(len(sample_capture[is_benign]))


## Supporting functions for detecting attacks on data already collected

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

def get_results_binning(attack_list, D, n=6):
    """
    Run the rolling-window binning check on every capture and score it.

    Returns a dictionary keyed 1..len(attack_list). Each value holds the
    confusion matrix from alert_by_bin ('cm') and the precision ('prec'),
    recall ('recall') and false-positive rate ('false_pos') derived from it.
    """
    results_binning = {}

    for position, attack in enumerate(attack_list, start=1):
        cm = alert_by_bin(attack, D, n)
        # rows index the actual label, columns the predicted label
        true_neg, false_pos = cm[0, 0], cm[0, 1]
        false_neg, true_pos = cm[1, 0], cm[1, 1]

        results_binning[position] = {
            'cm': cm,
            'prec': true_pos / (true_pos + false_pos),
            'recall': true_pos / (true_pos + false_neg),
            'false_pos': false_pos / (false_pos + true_neg),
        }

    return results_binning

def alert_by_bin(df, D, n=6):
    """
    Flag, per arbitration ID, every frame that ends a run of ``n`` frames
    whose time_diffs sum to at most ``4 * mu`` (mu = D[aid]['mu']), and
    accumulate the confusion matrix against ``actual_attack``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``aid``, ``time_diffs`` and ``actual_attack``.
        It is not modified.
    D : dict
        Per-aid statistics; ``D[aid]['mu']`` is read for every aid in ``df``.
    n : int, optional
        Number of consecutive frames in the rolling window (default 6).

    Returns
    -------
    numpy.ndarray
        2x2 confusion matrix summed over all aids, with labels ordered [0, 1].
    """
    cm = np.array([[0,0], [0,0]])

    for aid in df.aid.unique():
        df_test = df[df.aid == aid]
        # Keep the prediction in its own Series: assigning a new column onto the
        # filtered slice triggers pandas' SettingWithCopyWarning.
        predicted_attack = df_test.time_diffs.rolling(n).sum() <= D[aid]['mu']*4

        cm += confusion_matrix(df_test['actual_attack'], predicted_attack, labels = [0,1])

    return cm

In [None]:

    
# def detect_anomalies(models, attack_data, detection_method):
#     if detection_method == 'Binning':
#         return detect_anomalies_binning(models, attack_data)
#     # elif detection_method == 'Gaussian':
#     #     return detect_anomalies_gaussian(models, attack_data)
#     else:
#         raise ValueError(f"Unknown detection method: {detection_method}")


## Live Binning Attack Detection
This is implemented through a binning strategy for anomaly detection, where each bin corresponds
to a time window of length $4\mu$ ($\mu$ is the mean `time_diffs` value of the arbitration ID).
If a bin contains `n` or more messages, it is considered anomalous and marked as a potential attack.

In [None]:
class BinningAttackDetector:
    """
    Frame-by-frame version of the binning check.

    For every arbitration ID the detector keeps the most recent n time_diffs
    and reports an attack when their sum is at most four times that ID's mean
    time_diff (d[aid]['mu']), i.e. n messages fell inside a window of length
    mu*4.
    """
    def __init__(self, d, n=6):
        self.d = d          # per-aid statistics; d[aid]['mu'] is read when checking a window
        self.n = n          # number of consecutive frames that form one window
        self.frames = {}    # aid -> list of the most recent time_diffs

    def process_frame(self, frame):
        """
        Record one frame and classify the window that ends with it.

        frame must provide 'aid' and 'time_diffs'. Returns 'not enough frames'
        until n frames of that aid have been seen, then 'attack' or
        'not attack'.
        """
        aid, time_diff = frame['aid'], frame['time_diffs']

        # fetch (or create) the per-aid memory and remember this frame
        window = self.frames.setdefault(aid, [])
        window.append(time_diff)

        if len(window) < self.n:
            return 'not enough frames'

        # keep only the latest n entries
        if len(window) > self.n:
            del window[0]

        within_bin = sum(window) <= self.d[aid]['mu']*4
        return 'attack' if within_bin else 'not attack'

# Detector built from the training statistics, using the default window size (n=6)
detector = BinningAttackDetector(training_data_stats)

from sklearn.metrics import confusion_matrix

def calculate_cm(df, predictions):
    """
    Confusion matrix of df['actual_attack'] versus predictions, with both
    axes ordered [False, True].
    """
    actual = df['actual_attack']
    return confusion_matrix(actual, predictions, labels=[False, True])


## Calculation of metrics and total accuracy

In [None]:
# modified from get_results_binning in helper_functions.py
def get_results_binning_updated(attack_list, D, n=6):
    """
    Evaluate the frame-by-frame binning detector on every capture.

    A new BinningAttackDetector(D, n) is created for each capture, so the
    per-ID windows of one capture never carry over into the next and the
    ``D`` / ``n`` arguments are the ones actually used.

    Parameters
    ----------
    attack_list : list of pandas.DataFrame
        Captures with the columns ``aid``, ``time_diffs`` and ``actual_attack``.
    D : dict
        Per-aid statistics handed to the detector (``D[aid]['mu']`` is read).
    n : int, optional
        Window size handed to the detector (default 6).

    Returns
    -------
    dict
        Keys 1..len(attack_list) map to {'cm', 'recall', 'prec', 'false_pos'}
        for each capture. Key 'total' holds the same metrics plus 'f1',
        computed on the summed confusion matrix. A metric whose denominator is
        zero is reported as 0.
    """
    def _ratio(numerator, denominator):
        # an undefined rate (zero denominator) is reported as 0
        return numerator / denominator if denominator != 0 else 0

    def _metrics(cm):
        # rows index the actual label, columns the predicted label
        return {
            'cm': cm,
            'recall': _ratio(cm[1, 1], cm[1, 1] + cm[1, 0]),
            'prec': _ratio(cm[1, 1], cm[1, 1] + cm[0, 1]),
            'false_pos': _ratio(cm[0, 1], cm[0, 1] + cm[0, 0]),
        }

    results_binning = {}
    # A real 2x2 array: starting from the list [0], `+=` would extend the list
    # with the matrix rows and the later [1, 1] indexing would raise TypeError.
    total_cm = np.zeros((2, 2), dtype=np.int64)

    for i, attack in enumerate(attack_list, start=1):
        # fresh state per capture
        capture_detector = BinningAttackDetector(D, n)

        # Feed the frames in order; only 'aid' and 'time_diffs' are read by the detector
        predictions = [
            capture_detector.process_frame({'aid': aid, 'time_diffs': time_diff}) == 'attack'
            for aid, time_diff in zip(attack['aid'], attack['time_diffs'])
        ]

        cm = calculate_cm(attack, predictions)
        results_binning[i] = _metrics(cm)
        total_cm += cm

    # Aggregate metrics over all captures
    total = _metrics(total_cm)
    total['f1'] = 2 * _ratio(total['prec'] * total['recall'], total['prec'] + total['recall'])
    results_binning['total'] = total

    return results_binning



In [None]:
# Rolling-window evaluation; this calls the get_results_binning defined in this
# notebook, which shadows the one imported from helper_functions
result_binning = get_results_binning(attack_data, training_data_stats)

In [None]:
# Frame-by-frame evaluation via BinningAttackDetector.process_frame
result_binning_new = get_results_binning_updated(attack_data, training_data_stats)

In [None]:
# Per-capture results of the rolling-window version, then the aggregate of the frame-by-frame version
display(result_binning)
display(result_binning_new['total'])

## Unfinished below

In [None]:

# bin_results = []
# for i in range(len(attack_data)):
#     result_binning_new = get_results_binning_updated([attack_data[i]], training_data_stats)
#     bin_results.append(result_binning_new)

In [None]:
# for result in bin_results:
#     display(result['total'])



In [None]:
# Shows the whole list of annotated captures and the full statistics dictionary
display(attack_data)
display(training_data_stats)

In [None]:
# The stored results are read from the parent of the current working directory.
# NOTE(review): unpickle comes from helper_functions and is presumably
# pickle-based; unpickling can execute arbitrary code, so only load a file you trust.
results_binning_final = unpickle(os.path.join(os.path.dirname(os.getcwd()), "results_binning_final.pkl"))


In [None]:
# Show the results loaded from results_binning_final.pkl
display(results_binning_final)

In [None]:
# NOTE(review): detect_and_save_results is neither defined nor imported in the
# visible part of this notebook, so this cell is expected to raise NameError on
# a fresh kernel — TODO define/import it or remove the cell.
results = detect_and_save_results('results', training_data_stats, attack_data, 'Binning') # Detect anomalies and save the results
# visualize_results(results)