In [1]:

from json import load
from dotenv import load_dotenv
from helpers import *
import os

load_dotenv()

data_path = os.getenv('DATA_PATH')
ambient_dir = f'{data_path}/ambient' 
attack_dir = f'{data_path}/attacks'

ambient_metadata_file = os.path.join(ambient_dir, 'capture_metadata.json')
attack_metadata_file = os.path.join(attack_dir, 'capture_metadata.json')

with open(ambient_metadata_file) as f:
    ambient_metadata = load(f)

with open(attack_metadata_file) as f:
    attack_metadata = load(f)

ambient_keys = [
                "ambient_dyno_drive_benign_anomaly", 
                "ambient_dyno_drive_basic_long",
                "ambient_highway_street_driving_long",
                "ambient_dyno_reverse",
                "ambient_dyno_idle_radio_infotainment",
                "ambient_dyno_drive_radio_infotainment",
                "ambient_dyno_drive_winter",
                "ambient_dyno_exercise_all_bits",
                "ambient_dyno_drive_extended_short",
                "ambient_dyno_drive_basic_short",
                "ambient_dyno_drive_extended_long",
                "ambient_highway_street_driving_diagnostics"
]

attack_keys = [
                "accelerator_attack_reverse_1",
                "accelerator_attack_drive_1",
                "accelerator_attack_drive_2",
                "accelerator_attack_reverse_2",
                "fuzzing_attack_1",
                "fuzzing_attack_2",
                "fuzzing_attack_3",
                "correlated_signal_attack_1",
                "correlated_signal_attack_2",
                "correlated_signal_attack_3",
                "reverse_light_on_attack_1",
                "reverse_light_on_attack_2",
                "reverse_light_on_attack_3",
                "reverse_light_off_attack_1",
                "reverse_light_off_attack_2",
                "reverse_light_off_attack_3",
                "max_speedometer_attack_1",
                "max_speedometer_attack_2",
                "max_speedometer_attack_3",
                "max_engine_coolant_temp_attack",
]

# load parquet files into dataframes
ambient_dfs = {}
for parquet_file in ambient_keys:
    parquet_filepath = os.path.join(ambient_dir, f'{parquet_file}.parquet')
    df = pd.read_parquet(parquet_filepath)
    ambient_dfs[parquet_file] = df

attack_dfs = {}
for parquet_file in attack_keys:
    parquet_filepath = os.path.join(attack_dir, f'{parquet_file}.parquet')
    df = pd.read_parquet(parquet_filepath)
    
    attack_dfs[parquet_file] = df

In [2]:
import pickle
# pickle ambient and attack dataframes
with open('ambient_dfs.pickle', 'wb') as f:
    pickle.dump(ambient_dfs, f)

with open('attack_dfs.pickle', 'wb') as f:
    pickle.dump(attack_dfs, f)

In [3]:
attack_dfs['correlated_signal_attack_1']

Unnamed: 0,time,aid,data
0,0.000000,1505,893FC00B0A013880
1,0.000001,651,0000000000000000
2,0.000003,167,0010FA24D12E00A0
3,0.000004,208,4A7704600201F000
4,0.000997,51,000698000E4207D0
...,...,...,...
76231,33.095972,651,0000000000000000
76232,33.096941,51,177FA9788DC007D0
76233,33.096943,167,00510BA5212BA0A0
76234,33.096944,61,0000020000000000


In [7]:

ambient_dfs_with_time_diff = {} 
for key, ambient_file_df in ambient_dfs.items():
    ambient_dfs_with_time_diff[key] = add_time_diff_per_aid_col(ambient_file_df, True)
    ambient_dfs_with_time_diff[key] = add_time_diff_since_last_msg_col(ambient_file_df, True)


In [5]:
attack_dfs_with_time_diff = {}
for key, attack_file_df in attack_dfs.items():
    attack_dfs_with_time_diff[key] = add_time_diff_per_aid_col(attack_file_df, True)
    attack_dfs_with_time_diff[key] = add_time_diff_since_last_msg_col(attack_file_df, True)


In [1]:
def payload_matches(payload, injection_data_str):
    for i, value in enumerate(payload):
        if injection_data_str[i] == "X":
            continue
        else:
            if value != injection_data_str[i]:
                return False
    return True

for key, attack_file_df in attack_dfs_with_time_diff.items():

    injection_data_str = attack_metadata[key].get("injection_data_str")
    injection_id = attack_metadata[key].get("injection_id")
    injection_interval = attack_metadata[key].get("injection_interval")

    print(injection_data_str, injection_id, injection_interval)
    if injection_data_str and injection_data_str and injection_data_str:
        print(key)
        ambient_file_df['actual_attack'] = False
        if injection_id == "XXX":
            for index, row in attack_file_df.iterrows():
                if injection_interval[0] < row['time'] < injection_interval[1]:
                    if payload_matches(row['data'], injection_data_str):
                        print("found attack")
                        attack_file_df.at[index, 'actual_attack'] = True
        else:
            for index, row in attack_file_df.iterrows():
                # some of the injection ids are in hex some are in decimal
                try:
                    injection_id = int(injection_id, 16)
                except:
                    pass
                if injection_interval[0] < row['time'] < injection_interval[1] \
                    and row['aid'] == injection_id \
                    and payload_matches(row['data'], injection_data_str):
                            attack_file_df.at[index, 'actual_attack'] = True

NameError: name 'attack_dfs_with_time_diff' is not defined

In [11]:
def payload_matches(payload, injection_data_str):
    return all([(v1 == v2 or v2 == "X") for v1, v2 in zip(payload, injection_data_str)])

for key, attack_file_df in attack_dfs_with_time_diff.items():

    injection_data_str = attack_metadata[key].get("injection_data_str")
    injection_id = attack_metadata[key].get("injection_id")
    injection_interval = attack_metadata[key].get("injection_interval")

    print(injection_data_str, injection_id, injection_interval)
    if injection_data_str and injection_data_str and injection_data_str:
        print(key)
        ambient_file_df['actual_attack'] = False
        
        # Move the injection_id conversion outside the loop
        try:
            injection_id = int(injection_id, 16)
        except:
            pass

        mask_time_interval = (attack_file_df['time'] > injection_interval[0]) & (attack_file_df['time'] < injection_interval[1])
        if injection_id == "XXX":
            mask_payload_matches = attack_file_df.loc[mask_time_interval, 'data'].apply(payload_matches, args=(injection_data_str,))
            attack_file_df.loc[mask_payload_matches, 'actual_attack'] = True
        else:
            mask_id_matches = attack_file_df['aid'] == injection_id
            mask_payload_matches = attack_file_df.loc[mask_time_interval & mask_id_matches, 'data'].apply(payload_matches, args=(injection_data_str,))
            attack_file_df.loc[mask_payload_matches, 'actual_attack'] = True
            if mask_payload_matches.any():
                print("found attack")

None None None
None None None
None None None
None None None
FFFFFFFFFFFFFFFF XXX [4.622975, 7.958234]
fuzzing_attack_1


TypeError: unhashable type: 'Series'

In [77]:
attack_dfs_with_time_diff['correlated_signal_attack_1']["actual_attack"].value_counts()

actual_attack
False    74568
True      1668
Name: count, dtype: int64

In [79]:
# safe to parquet
for key, df in attack_dfs_with_time_diff.items():
    df.to_parquet(f'{key}_actual_attack.parquet')

In [33]:
# for key in attack_keys:
    # print(key)
    
    # print(attack_dfs_with_time_diff[key]['actual_attack'].value_counts())
attack_dfs_with_time_diff["fuzzing_attack_2"]['actual_attack']

30315

In [20]:
# ambient_dfs_with_time_diff['ambient_dyno_drive_basic_long']
# show true values for actual_attack column

attack_dfs_with_time_diff['accelerator_attack_drive_1']['actual_attack'].value_counts()

actual_attack
False    192017
Name: count, dtype: int64

In [None]:

for df_keys in ambient_dfs_with_time_diff.keys():
    ambient_parquet_file = os.path.join(ambient_dir, f'{df_keys}_with_time_diffs.parquet')
    ambient_dfs[df_keys].to_parquet(ambient_parquet_file, index=False)


for df_keys in attack_dfs_with_time_diff.keys():
    attack_parquet_file = os.path.join(attack_dir, f'{df_keys}_with_time_diffs.parquet')
    attack_dfs[df_keys].to_parquet(attack_parquet_file, index=False)

In [2]:
display(ambient_dfs['ambient_dyno_drive_benign_anomaly'])

Unnamed: 0,time,aid,data
0,0.000000e+00,737,0000000000000004
1,9.536743e-07,852,1FFF40000003A780
2,1.013994e-03,403,00080803E6280000
3,1.014948e-03,1505,893FE0070A000480
4,1.016021e-03,526,4E2003A0003FAFFF
...,...,...,...
720922,4.564519e+02,651,0000000000000000
720923,4.564529e+02,1760,0000000000000000
720924,4.564529e+02,167,2010FA24F12B30A0
720925,4.564529e+02,61,0001F48000000000


In [2]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from chunkhelper import process_key

N = 10
samples = []
labels = []

# Define the number of processes to be used. 
# You can adjust this number based on the available cores and the desired parallelism.
num_processes = 10

with ProcessPoolExecutor(max_workers=num_processes) as executor:
    results = list(executor.map(process_key, ambient_dfs.keys(), ambient_dfs.values()))

# Extract results from the futures and add them to the main samples and labels lists
for local_samples, local_labels in results:
    samples.extend(local_samples)
    labels.extend(local_labels)

# Convert to DataFrame
df_train = pd.DataFrame(samples, columns=['aid'] + [f'time_diff_{i+1}' for i in range(N)])
df_train['label'] = labels

print(df_train.head())

   aid  time_diff_1  time_diff_2  time_diff_3  time_diff_4  time_diff_5  \
0    6     0.000000     0.999828     0.999973     1.000946     0.999913   
1    6     0.999828     0.999973     1.000946     0.999913     0.999930   
2    6     0.999973     1.000946     0.999913     0.999930     1.000033   
3    6     1.000946     0.999913     0.999930     1.000033     0.999910   
4    6     0.999913     0.999930     1.000033     0.999910     1.000095   

   time_diff_6  time_diff_7  time_diff_8  time_diff_9  time_diff_10     label  
0     0.999930     1.000033     0.999910     1.000095      1.000379  0.999917  
1     1.000033     0.999910     1.000095     1.000379      0.999917  1.000178  
2     0.999910     1.000095     1.000379     0.999917      1.000178  1.000145  
3     1.000095     1.000379     0.999917     1.000178      1.000145  1.000188  
4     1.000379     0.999917     1.000178     1.000145      1.000188  1.000127  


In [3]:
print(df_train)

           aid  time_diff_1  time_diff_2  time_diff_3  time_diff_4  \
0            6     0.000000     0.999828     0.999973     1.000946   
1            6     0.999828     0.999973     1.000946     0.999913   
2            6     0.999973     1.000946     0.999913     0.999930   
3            6     1.000946     0.999913     0.999930     1.000033   
4            6     0.999913     0.999930     1.000033     0.999910   
...        ...          ...          ...          ...          ...   
22607956  1788     0.099996     0.099611     0.100348     0.100021   
22607957  1788     0.099611     0.100348     0.100021     0.099976   
22607958  1788     0.100348     0.100021     0.099976     0.100036   
22607959  1788     0.100021     0.099976     0.100036     0.099986   
22607960  1788     0.099976     0.100036     0.099986     0.099995   

          time_diff_5  time_diff_6  time_diff_7  time_diff_8  time_diff_9  \
0            0.999913     0.999930     1.000033     0.999910     1.000095   
1    

In [5]:
# save parquet file
df_train.to_parquet(f'{data_path}/ambient/ambient_train_chunks.parquet')

In [10]:
def annotate_attack(row, metadata, threshold_percentage=0.05):
    """
    Annotates the row as "attack" or "not attack" based on predicted time difference.
    """
    # Check if the capture exists in metadata and has an injection interval
    capture_name = row['capture_name'] # Assuming your dataframe has a column 'capture_name'
    if capture_name in metadata and metadata[capture_name].get('injection_interval'):
        injection_interval = metadata[capture_name]['injection_interval']
        if injection_interval[0] <= row['time'] <= injection_interval[1]:
            pred_time_diff = row['predicted_time_diff']  # Assuming you have a column 'predicted_time_diff'
            actual_time_diff = row['actual_time_diff']  # Assuming you have a column 'actual_time_diff'
            
            lower_bound = actual_time_diff - (threshold_percentage * actual_time_diff)
            upper_bound = actual_time_diff + (threshold_percentage * actual_time_diff)
            
            if lower_bound <= pred_time_diff <= upper_bound:
                return "not attack"
            else:
                return "attack"
    return "not attack"

In [8]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from chunkhelper import process_key

N = 10
samples = []
labels = []

# Define the number of processes to be used. 
# You can adjust this number based on the available cores and the desired parallelism.
num_processes = 10

with ProcessPoolExecutor(max_workers=num_processes) as executor:
    results = list(executor.map(process_key, attack_dfs.keys(), attack_dfs.values()))

# Extract results from the futures and add them to the main samples and labels lists
for local_samples, local_labels in results:
    samples.extend(local_samples)
    labels.extend(local_labels)

# Convert to DataFrame
df_test = pd.DataFrame(samples, columns=['aid'] + [f'time_diff_{i+1}' for i in range(N)])
df_test['label'] = labels

print(df_test.head())

   aid  time_diff_1  time_diff_2  time_diff_3  time_diff_4  time_diff_5  \
0    6     0.000000     1.001106     1.000699     1.001261     0.999206   
1    6     1.001106     1.000699     1.001261     0.999206     1.000270   
2    6     1.000699     1.001261     0.999206     1.000270     1.001233   
3    6     1.001261     0.999206     1.000270     1.001233     1.000176   
4    6     0.999206     1.000270     1.001233     1.000176     1.000197   

   time_diff_6  time_diff_7  time_diff_8  time_diff_9  time_diff_10     label  
0     1.000270     1.001233     1.000176     1.000197      1.000203  1.000382  
1     1.001233     1.000176     1.000197     1.000203      1.000382  1.000520  
2     1.000176     1.000197     1.000203     1.000382      1.000520  1.000975  
3     1.000197     1.000203     1.000382     1.000520      1.000975  0.999417  
4     1.000203     1.000382     1.000520     1.000975      0.999417  1.000457  


In [11]:
df_test.to_parquet(f'{data_path}/attacks/attacks_test_chunks.parquet')

In [4]:
import numpy as np

# Convert DataFrame to a numpy array
data_array = df_train.values

# Save the numpy array to a .npy file
np.save('ambient_train_chunks.npy', data_array)

In [None]:
import tensorflow as tf
import pandas as pd

# Shuffle the DataFrame
df_train = df_train.sample(frac=1).reset_index(drop=True)

# Split into features and labels
X = df_train.drop(columns=['label'])
y = df_train['label']

# Convert to NumPy arrays
X = X.values
y = y.values

# Convert to tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices((X, y))

# Specify batch size
batch_size = 32
dataset = dataset.batch(batch_size)

# Assuming you have defined a Keras model named "model"
# model.fit(dataset, epochs=10)


In [9]:
from helpers import add_time_diff_since_last_msg_col

ambient_dfs_with_time_diff = {} 
for key, ambient_file_df in ambient_dfs.items():
    ambient_dfs_with_time_diff[key] = add_time_diff_per_aid_col(ambient_file_df, True)
    ambient_dfs_with_time_diff[key] = add_time_diff_since_last_msg_col(ambient_file_df, True)

attack_dfs_with_time_diff = {}
for key, attack_file_df in attack_dfs.items():
    attack_dfs_with_time_diff[key] = add_time_diff_per_aid_col(attack_file_df, True)
    attack_dfs_with_time_diff[key] = add_time_diff_since_last_msg_col(attack_file_df, True)

In [5]:
ambient_dfs_with_time_diff

{'ambient_dyno_drive_benign':                 time   aid              data  time_diffs  \
 0       0.000000e+00   737  0000000000000004 -456.419053   
 1       9.536743e-07   852  1FFF40000003A780 -456.404208   
 2       1.013994e-03   403  00080803E6280000 -456.443165   
 3       1.014948e-03  1505  893FE0070A000480 -456.386205   
 4       1.016021e-03   526  4E2003A0003FAFFF -456.422088   
 ...              ...   ...               ...         ...   
 720922  4.564519e+02   651  0000000000000000    0.019981   
 720923  4.564529e+02  1760  0000000000000000    0.008721   
 720924  4.564529e+02   167  2010FA24F12B30A0    0.007702   
 720925  4.564529e+02    61  0001F48000000000    0.099969   
 720926  4.564539e+02   705  01F32FC7CB1F1CDE    0.104799   
 
         time_diff_since_last_msg  
 0                            NaN  
 1                   9.536743e-07  
 2                   1.013041e-03  
 3                   9.536743e-07  
 4                   1.072884e-06  
 ...                 

In [10]:
for df_keys in ambient_dfs_with_time_diff.keys():
    ambient_parquet_file = os.path.join(ambient_dir, f'{df_keys}_with_time_diffs.parquet')
    ambient_dfs[df_keys].to_parquet(ambient_parquet_file, index=False)


for df_keys in attack_dfs_with_time_diff.keys():
    attack_parquet_file = os.path.join(attack_dir, f'{df_keys}_with_time_diffs.parquet')
    attack_dfs[df_keys].to_parquet(attack_parquet_file, index=False)