In [74]:
# Import necessary libraries and helper functions
import pandas as pd
import os
from helper_functions import make_can_df, add_time_diff_per_aid_col, add_actual_attack_col, add_kde_val_col, add_gauss_val_col, get_results_binning, unpickle
import json
import tqdm


In [75]:
def load_data(directory, exclude=[], file_condition=lambda file_name: True):
    print("Loading data from directory: " + directory)
    df_aggregation = []

    for file_name in os.listdir(directory):
        if file_condition(file_name) and not any(excl in file_name for excl in exclude):
            print("Loading file: " + file_name)
            df = make_can_df(os.path.join(directory, file_name))
            df = add_time_diff_per_aid_col(df)
            df_aggregation.append(df)
            
    return df_aggregation



def load_and_save_training_data(directory):
    if os.path.exists(directory + 'training_data.csv'):
        print("Training data already exists. Loading training data from: " + directory + 'training_data.csv')
        return pd.read_csv(directory + 'training_data.csv')
        
    df_aggregation = load_data(directory, file_condition=lambda file_name: "dyno" in file_name)
    # Concatenate all training datasets on the dyno
    df_training = pd.concat(df_aggregation)
    training_data = df_training[["time", "aid", "time_diffs"]]
    save_file = 'training_data.csv'
    print("Saving training data to: " + save_file)
    training_data.to_csv(directory + save_file)
    return training_data

In [76]:
from sklearn.covariance import EllipticEnvelope
import scipy.stats

def preprocess(df, aid):
    """
    Preprocesses the data by removing outliers.
    """
    time_diffs = df[df.aid==aid].time_diffs.values
    print("before: ", len(time_diffs))

    # identify outliers in the dataset
    ee = EllipticEnvelope(contamination=0.0001, support_fraction=0.999) # support_fraction=0.99
    inliers = ee.fit_predict(time_diffs.reshape(-1, 1))

    # select all rows that are not outliers
    mask = inliers != -1
    outliers = sum(mask == False)
    print("outliers: ", outliers, 100*outliers/len(time_diffs))

    time_diffs = time_diffs[mask]
    # summarize the shape of the updated training dataset
    print("after: ", len(time_diffs))

    return time_diffs


def calculate_statistics(time_diffs):
    """
    Returns a dictionary including the mean of its time_diffs, standard deviation of its time_diffs
    and KDE of its time_diffs
    """
    aid_dict = {'mu': time_diffs.mean(), 'std': time_diffs.std(), 'kde': scipy.stats.gaussian_kde(time_diffs), 'gauss': scipy.stats.norm(loc = time_diffs.mean(), scale = time_diffs.std())}
    aid_dict["y_thresholds_kde"] = {}
    aid_dict["y_thresholds_gauss"] = {}
    return aid_dict


def calculate_statistics_for_each_aid(data):
    # Get a list of unique aids in the data
    unique_aids = data['aid'].unique()
    # Preprocess the data and calculate statistics for each unique aid
    stats = {aid: calculate_statistics(preprocess(data, aid)) for aid in unique_aids}
    # data = [add_kde_val_col(data[i], stats) for i in range(len(data))]
    # data = [add_gauss_val_col(data[i], stats) for i in range(len(data))]
    
    return stats




In [77]:
def annotate_attack_data(attack_data, injection_intervals):
    """
    Annotates the attack data based on the injection intervals.
    """
    for index, row in injection_intervals.iterrows():
        aid = row['aid']
        payload = row['payload']
        intervals = [(row['start_time'], row['end_time'])]
        attack_data = add_actual_attack_col(attack_data, intervals, aid, payload)
    return attack_data

def load_and_annotate_attack_data(directory, metadata_file):
    # Load the attack data
    df_aggregation = load_data(directory, exclude=['masquerade', 'accelerator', 'metadata', metadata_file])

    # Load the injection intervals from the metadata file
    with open(os.path.join(directory, metadata_file), "r") as read_file:
        attack_dict = json.load(read_file)

    attack_metadata = []
    count = 0  # Initialize count here
    for file_name in os.listdir(directory):
        file_base = file_name[:-4]
        if file_base not in attack_dict:
            continue
        if "masquerade" in file_name or "accelerator" in file_name:
            continue

        metadata = attack_dict[file_base]
        if metadata["injection_id"] != "XXX":
            injection_id = int(metadata["injection_id"], 16)
        else:
            injection_id = "XXX"

        # From metadata file
        attack_metadata.append([tuple(metadata["injection_interval"])])

        # Add column to each attack dataframe to indicate attack (True) or non-attack (False) for each signal
        df_aggregation[count] = add_actual_attack_col(df_aggregation[count], attack_metadata[count], injection_id, metadata["injection_data_str"])

        count += 1  # Increment count here, inside the loop where you add items to your lists

    return df_aggregation


In [78]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

def get_results_binning(attack_list, D, n=6):
    """
    Simplified binning detection method that returns the results directly
    """
    results_binning = {}

    for i, attack in enumerate(attack_list):
        confusion_matrix_ = alert_by_bin(attack, D, n)
        precision = confusion_matrix_[1,1] / (confusion_matrix_[1,1] + confusion_matrix_[0,1])
        recall = confusion_matrix_[1,1] / (confusion_matrix_[1,1] + confusion_matrix_[1,0])
        false_positive = confusion_matrix_[0,1] / (confusion_matrix_[0,1] + confusion_matrix_[0,0])

        results_binning[i+1] = {
            'cm': confusion_matrix_,
            'prec': precision,
            'recall': recall,
            'false_pos': false_positive
        }

    return results_binning

def alert_by_bin(df, D, n=6):
    """
    Checks for time windows of length mu*4 (where mu is average time_diff for aid) with 6 or more signals
    """
    cm = np.array([[0,0], [0,0]])
    
    for aid in df.aid.unique():
        df_test = df[df.aid == aid]
        df_test['predicted_attack'] = df_test.time_diffs.rolling(n).sum() <= D[aid]['mu']*4

        cm += confusion_matrix(df_test['actual_attack'], df_test['predicted_attack'], labels = [0,1])
    
    return cm

In [79]:

    
def detect_anomalies(models, attack_data, detection_method):
    if detection_method == 'Binning':
        return detect_anomalies_binning(models, attack_data)
    # elif detection_method == 'Gaussian':
    #     return detect_anomalies_gaussian(models, attack_data)
    else:
        raise ValueError(f"Unknown detection method: {detection_method}")


In [80]:

# Function to visualize the results
def visualize_results(results):
    # Add your visualization code here
    pass


In [81]:

# training_data = load_and_save_training_data('/home/jbc0071/Documents/can-time-based-ids-benchmark/data/ambient/')
# mac
training_data = load_and_save_training_data('/Users/jamescourson/Documents/GAN_play/can-time-based-ids-benchmark/data/ambient/') 

Training data already exists. Loading training data from: /Users/jamescourson/Documents/GAN_play/can-time-based-ids-benchmark/data/ambient/training_data.csv


In [82]:
print(training_data.head())

   Unnamed: 0      time  aid  time_diffs
0         192  0.313438    6    0.210577
1         984  1.307807    6    0.994369
2        1784  2.312546    6    1.004739
3        2581  3.316765    6    1.004219
4        3361  4.291507    6    0.974742


In [83]:
training_data_stats = calculate_statistics_for_each_aid(training_data)

before:  6340
outliers:  1 0.015772870662460567
after:  6339
before:  580454
outliers:  59 0.010164457476389171
after:  580395
before:  7656
outliers:  1 0.013061650992685475
after:  7655
before:  570718
outliers:  58 0.010162637239407203
after:  570660
before:  11572
outliers:  2 0.017283097131005877
after:  11570
before:  64264
outliers:  7 0.010892568156355035
after:  64257
before:  57860
outliers:  6 0.010369858278603527
after:  57854
before:  6441
outliers:  1 0.01552553951249806
after:  6440
before:  11569
outliers:  2 0.017287578874578616
after:  11567
before:  578687
outliers:  58 0.010022689294903808
after:  578629
before:  160636
outliers:  17 0.010582932841953236
after:  160619
before:  285343
outliers:  29 0.01016320708761035
after:  285314
before:  6419
outliers:  1 0.015578750584203147
after:  6418
before:  570727
outliers:  58 0.010162476981113562
after:  570669
before:  57864
outliers:  6 0.010369141435089175
after:  57858
before:  12850
outliers:  2 0.01556420233463035

In [84]:
display(training_data_stats)

{6: {'mu': 1.0155329309177052,
  'std': 0.4947486037052496,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130f2cb50>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x134661190>,
  'y_thresholds_kde': {},
  'y_thresholds_gauss': {}},
 14: {'mu': 0.010000053884418625,
  'std': 0.0004491061208629517,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130f2c3d0>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x130465940>,
  'y_thresholds_kde': {},
  'y_thresholds_gauss': {}},
 37: {'mu': 0.8426860569506046,
  'std': 0.5836586794325013,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130d37e50>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x130d37c10>,
  'y_thresholds_kde': {},
  'y_thresholds_gauss': {}},
 51: {'mu': 0.010000021841499838,
  'std': 0.0014177402701057897,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130d37880>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x130dd26d0>,
  'y_thresholds_kd

In [85]:
attack_data = load_and_annotate_attack_data('../data/attacks', 'capture_metadata.json')
# results = detect_and_save_results('results', models, attack_data, 'mean') # Detect anomalies and save the results
# visualize_results(results)

Loading data from directory: ../data/attacks
Loading file: correlated_signal_attack_1.log
Loading file: correlated_signal_attack_3.log
Loading file: correlated_signal_attack_2.log
Loading file: reverse_light_off_attack_1.log
Loading file: max_speedometer_attack_1.log
Loading file: reverse_light_off_attack_2.log
Loading file: max_speedometer_attack_3.log
Loading file: max_speedometer_attack_2.log
Loading file: reverse_light_off_attack_3.log
Loading file: max_engine_coolant_temp_attack.log
Loading file: fuzzing_attack_3.log
Loading file: fuzzing_attack_2.log
Loading file: fuzzing_attack_1.log
Loading file: reverse_light_on_attack_2.log
Loading file: reverse_light_on_attack_3.log
Loading file: reverse_light_on_attack_1.log


In [86]:
print(attack_data[0].head())
# print length of attack datra that have actual attack
print(len(attack_data[0][attack_data[0].actual_attack == True]))
print(len(attack_data[0][attack_data[0].actual_attack == False]))

# now total number of actual attacks for all 16 attacks
print(len(attack_data[0][attack_data[0].actual_attack == True]) + len(attack_data[1][attack_data[1].actual_attack == True]) + len(attack_data[2][attack_data[2].actual_attack == True]) + len(attack_data[3][attack_data[3].actual_attack == True]) + len(attack_data[4][attack_data[4].actual_attack == True]) + len(attack_data[5][attack_data[5].actual_attack == True]) + len(attack_data[6][attack_data[6].actual_attack == True]) + len(attack_data[7][attack_data[7].actual_attack == True]) + len(attack_data[8][attack_data[8].actual_attack == True]) + len(attack_data[9][attack_data[9].actual_attack == True]) + len(attack_data[10][attack_data[10].actual_attack == True]) + len(attack_data[11][attack_data[11].actual_attack == True]) + len(attack_data[12][attack_data[12].actual_attack == True]) + len(attack_data[13][attack_data[13].actual_attack == True]) + len(attack_data[14][attack_data[14].actual_attack == True]) + len(attack_data[15][attack_data[15].actual_attack == True]))



           time  aid              data  time_diffs  actual_attack
3873   1.618163    6  0800006400000000    0.999845          False
6266   2.618064    6  0800006400000000    0.999901          False
8655   3.617806    6  0800006400000000    0.999742          False
11049  4.617810    6  0800006400000000    1.000004          False
13441  5.618164    6  0800006400000000    1.000354          False
4172
71959
61516


In [87]:
class BinningAttackDetector:
    def __init__(self, d, n=6):
        self.d = d
        self.n = n
        self.frames = {}

    def process_frame(self, frame):
        aid = frame['aid']
        time_diff = frame['time_diffs']

        # Initialize memory for this aid if it doesn't exist
        if aid not in self.frames:
            self.frames[aid] = []

        # Add this frame to memory
        self.frames[aid].append(time_diff)

        # If we don't have enough frames yet, return 'not enough frames'
        if len(self.frames[aid]) < self.n:
            return 'not enough frames'

        # If we have more than n frames, discard the oldest one
        if len(self.frames[aid]) > self.n:
            self.frames[aid].pop(0)

        # Check if the sum of time_diffs for the last n frames is less than or equal to mu*4
        if sum(self.frames[aid]) <= self.d[aid]['mu']*4:
            return 'attack'
        else:
            return 'not attack'

detector = BinningAttackDetector(training_data_stats)

from sklearn.metrics import confusion_matrix

def calculate_cm(df, predictions):
    cm = confusion_matrix(df['actual_attack'], predictions, labels=[False, True])
    return cm


In [93]:
# modified from get_results_binning in helper_functions.py
def get_results_binning_updated(attack_list, D, n=6):
    # Initialize results dictionary
    results_binning = {}
    for i in range(len(attack_list)):
        results_binning[i+1] = {'cm': [0], 'recall': 0, 'prec': 0, 'false_pos': 0}
    results_binning['total'] = {'cm': [0], 'recall': 0, 'prec': 0, 'false_pos': 0}

    for i in range(len(attack_list)):
        # Process each frame in the attack
        predictions = []
        for _, frame in attack_list[i].iterrows():
            result = detector.process_frame(frame.to_dict())
            predictions.append(result == 'attack')

        # Calculate the confusion matrix
        results_binning[i+1]['cm'] = calculate_cm(attack_list[i], predictions)

        # Calculate the other metrics here...
        denominator = (results_binning[i+1]['cm'][1, 1]+results_binning[i+1]['cm'][0, 1])
        if denominator != 0:
            results_binning[i+1]['prec'] = results_binning[i+1]['cm'][1, 1] / denominator
        else:
            results_binning[i+1]['prec'] = 0  # or np.nan, or however you want to handle this case

        results_binning[i+1]['recall'] = results_binning[i+1]['cm'][1, 1] / \
            (results_binning[i+1]['cm'][1, 1]+results_binning[i+1]['cm'][1, 0])
        results_binning[i+1]['false_pos'] = results_binning[i+1]['cm'][0, 1] / \
            (results_binning[i+1]['cm'][0, 1]+results_binning[i+1]['cm'][0, 0])
        results_binning['total']['cm'] += results_binning[i+1]['cm']

    # Calculate the total metrics here...
    results_binning['total']['prec'] = results_binning['total']['cm'][1, 1] / \
        (results_binning['total']['cm'][1, 1] +
         results_binning['total']['cm'][0, 1])
    results_binning['total']['recall'] = results_binning['total']['cm'][1, 1] / \
        (results_binning['total']['cm'][1, 1] +
         results_binning['total']['cm'][1, 0])
    results_binning['total']['f1'] = 2*((results_binning['total']['prec']*results_binning['total']['recall'])/(
        results_binning['total']['prec']+results_binning['total']['recall']))
    results_binning['total']['false_pos'] = results_binning['total']['cm'][0, 1] / \
        (results_binning['total']['cm'][0, 1] +
         results_binning['total']['cm'][0, 0])


    return results_binning



In [94]:
result_binning = get_results_binning(attack_data, training_data_stats)
result_binning_new = get_results_binning_updated(attack_data, training_data_stats)

  precision = confusion_matrix_[1,1] / (confusion_matrix_[1,1] + confusion_matrix_[0,1])


Attack detected at index 75484
Attack detected at index 75676
Attack detected at index 76149
Attack detected at index 22001
Attack detected at index 22021
Attack detected at index 22034
Attack detected at index 22046
Attack detected at index 22054
Attack detected at index 22081
Attack detected at index 22092
Attack detected at index 22105
Attack detected at index 22107
Attack detected at index 22131
Attack detected at index 22144
Attack detected at index 22159
Attack detected at index 22166
Attack detected at index 22188
Attack detected at index 22207
Attack detected at index 22222
Attack detected at index 22224
Attack detected at index 22241
Attack detected at index 22255
Attack detected at index 22265
Attack detected at index 22267
Attack detected at index 22284
Attack detected at index 22298
Attack detected at index 22308
Attack detected at index 22312
Attack detected at index 22334
Attack detected at index 22347
Attack detected at index 22357
Attack detected at index 22359
Attack d

In [None]:
display(attack_data)
display(training_data_stats)

[            time   aid              data  time_diffs  actual_attack
 3873    1.618163     6  0800006400000000    0.999845          False
 6266    2.618064     6  0800006400000000    0.999901          False
 8655    3.617806     6  0800006400000000    0.999742          False
 11049   4.617810     6  0800006400000000    1.000004          False
 13441   5.618164     6  0800006400000000    1.000354          False
 ...          ...   ...               ...         ...            ...
 80160  32.636582  1788  00000738D2B85800    0.099969          False
 80402  32.736450  1788  0000073732B86000    0.099868          False
 80641  32.836612  1788  0000073892B86800    0.100162          False
 80874  32.936532  1788  00000737B2B87000    0.099920          False
 81111  33.036464  1788  00000738B2B87800    0.099932          False
 
 [76131 rows x 5 columns],
             time   aid              data  time_diffs  actual_attack
 2910    1.220904     6  0800006400000000    0.999842          False
 5302

{6: {'mu': 1.0155329309177052,
  'std': 0.4947486037052496,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130e1ab20>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x132db4f10>,
  'y_thresholds_kde': {},
  'y_thresholds_gauss': {}},
 14: {'mu': 0.010000053884418625,
  'std': 0.0004491061208629517,
  'kde': <scipy.stats._kde.gaussian_kde at 0x133d18a90>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x130f2cd30>,
  'y_thresholds_kde': {},
  'y_thresholds_gauss': {}},
 37: {'mu': 0.8426860569506046,
  'std': 0.5836586794325013,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130506910>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x130506c40>,
  'y_thresholds_kde': {},
  'y_thresholds_gauss': {}},
 51: {'mu': 0.010000021841499838,
  'std': 0.0014177402701057897,
  'kde': <scipy.stats._kde.gaussian_kde at 0x130f2ce80>,
  'gauss': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x130f2cbb0>,
  'y_thresholds_kd

In [None]:
results_binning_final = unpickle(os.path.dirname(os.getcwd()) + "/results_binning_final.pkl") 


In [None]:
display(results_binning_final)

{1: {'cm': array([[12182,     0],
         [    3,     0]]),
  'recall': 0.0,
  'prec': nan,
  'false_pos': 0.0},
 2: {'cm': array([[45457,    92],
         [   36,     0]]),
  'recall': 0.0,
  'prec': 0.0,
  'false_pos': 0.0020198028496783683},
 3: {'cm': array([[120692,    175],
         [     2,   3980]]),
  'recall': 0.9994977398292315,
  'prec': 0.9578820697954272,
  'false_pos': 0.0014478724548470632},
 4: {'cm': array([[71954,     5],
         [    1,  4171]]),
  'recall': 0.9997603068072867,
  'prec': 0.9988026819923371,
  'false_pos': 6.948401172890118e-05},
 5: {'cm': array([[141427,    119],
         [     3,   4699]]),
  'recall': 0.9993619736282433,
  'prec': 0.9753009547530096,
  'false_pos': 0.0008407160922950843},
 6: {'cm': array([[188114,     14],
         [     2,  12212]]),
  'recall': 0.9998362534796136,
  'prec': 0.9988548993947325,
  'false_pos': 7.441741792821908e-05},
 7: {'cm': array([[194489,    624],
         [     2,   4886]]),
  'recall': 0.999590834697217

In [None]:
results = detect_and_save_results('results', training_data_stats, attack_data, 'Binning') # Detect anomalies and save the results
# visualize_results(results)

NameError: name 'detect_and_save_results' is not defined