#### Load Data

In [1]:
from CANDataset import CANDataset
from dotenv import load_dotenv
import torch
import torch.nn as nn
import os

load_dotenv()
data_path = os.getenv('DATA_PATH')
dataset = CANDataset(data_path, log_verbosity=1)

Found ambient and attack directories.
Loading CAN metadata...
Loading raw can data...
Saving parquet files...
Processing raw can data...
Done processing CAN data.
Saving processed parquet files...
Done saving processed parquet files.
Loading processing data into 'CanData' structure


#### ML model

In [2]:
class CANnoloAutoencoder(nn.Module):
    def __init__(self, embedding_dim, lstm_units, dense_units, dropout_rate, num_embeddings, feature_vec_length):
        super(CANnoloAutoencoder, self).__init__()

        # Encoder
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.encoder_dense = nn.Linear(embedding_dim+feature_vec_length, dense_units)
        self.encoder_dropout = nn.Dropout(dropout_rate)
        self.encoder_lstm = nn.LSTM(input_size=dense_units, hidden_size=lstm_units, num_layers=2, batch_first=True)

        # Decoder
        self.decoder_lstm = nn.LSTM(input_size=lstm_units, hidden_size=lstm_units, num_layers=2, batch_first=True)
        self.decoder_dense = nn.Linear(lstm_units, feature_vec_length)
        self.decoder_output = nn.Sigmoid()  # To reconstruct the original packets

    def forward(self, can_ids, features):
        # Encoding
        embedded_ids = self.embedding(can_ids)
        # You might need to concatenate the embedded IDs with other features
        x = torch.cat([embedded_ids, features], dim=1)
        x = torch.tanh(self.encoder_dense(x))
        x = self.encoder_dropout(x)
        x, _ = self.encoder_lstm(x)

        # Decoding
        x, _ = self.decoder_lstm(x)
        x = self.decoder_dense(x)
        reconstructed = self.decoder_output(x)

        return reconstructed

In [6]:
dataset.attack_data.accelerator_attack_drive_1

Unnamed: 0,time,aid,data,delta_time_last_msg,delta_time_last_same_aid
0,0.000000,852,1FFF40000003C580,0.000000e+00,
1,0.000001,1505,893FE0070A000080,1.192093e-06,
2,0.000002,651,0000000000000000,9.536743e-07,
3,0.000992,167,005108E5112A00A0,9.899139e-04,
4,0.000994,722,0000500000000000,2.145767e-06,
...,...,...,...,...,...
204753,86.459022,560,F700000A7C000E00,9.698868e-04,0.019958
204754,86.461950,339,00000000000C1002,2.928019e-03,0.019914
204755,86.462905,1634,4E60000040000000,9.551048e-04,0.019882
204756,86.462906,412,02FC200002002730,9.536743e-07,0.019881


#### Define config
This is what we feed to the CANDataset object to create a dataloader.

In [7]:
config = {
    "batch_size": 32,
    "delta_time_last_msg": {
        "specific_to_can_id": False,
        "records_back": 30
    },
    "delta_time_last_same_aid": {
        "specific_to_can_id": True,
        "records_back": 15
    },
}

use `get_dataloaders` on CANDataset object to get the data loaders

In [8]:
ambient_loader, validation_loader, attack_loader = dataset.get_dataloaders(config)

#### Example Data
From the config we defined:
    - Batch size of `32`
    - Keep track of the current Can ID.
    - want the last `30` `delta_time_last_msg`
    - want the last `15` `delta_time_last_same_aid`



In [9]:
example_data = ambient_loader.__getitem__(0) # input normally acts as index, but this does not really work as an index. More like get next item.
display(example_data)

(tensor([1760, 1176,  560,  778,  452,  676,  813, 1277,  339,   14, 1634,  737,
          852, 1505,  412,  208,   51,  628,  192,  293,  354,  167, 1760, 1175,
           60,  519,  263, 1225,  470,  208,   14,   51]),
 tensor([[0.0000e+00, 1.9073e-06, 9.5367e-07,  ..., 9.0060e-03, 1.0972e-02,
          1.0019e-02],
         [1.9073e-06, 9.5367e-07, 1.1921e-06,  ..., 2.0578e-02, 1.8947e-02,
          1.9974e-02],
         [9.5367e-07, 1.1921e-06, 9.9492e-04,  ..., 1.9925e-02, 2.0006e-02,
          1.9995e-02],
         ...,
         [1.9073e-06, 9.5367e-07, 9.5367e-07,  ..., 7.9238e-03, 1.2084e-02,
          7.9310e-03],
         [9.5367e-07, 9.5367e-07, 1.0362e-03,  ..., 1.0004e-02, 9.9571e-03,
          1.0008e-02],
         [9.5367e-07, 1.0362e-03, 9.5367e-07,  ..., 7.9730e-03, 1.3045e-02,
          8.0020e-03]]))

#### Example of 1 input

In [8]:
test_batch_can_ids, test_feature_vec = example_data

print(f'Represents Can ID: \n{test_batch_can_ids[0]}\n')
print(f'Represents Feature Vector: \n{test_feature_vec[0]}')

Represents Can ID: 
204

Represents Feature Vector: 
tensor([0.0000e+00, 9.5367e-07, 9.0849e-03, 9.9890e-03, 2.0004e-02, 9.7489e-04,
        1.9014e-02, 9.9111e-04, 6.4800e-03, 1.0200e-03, 9.5367e-07, 9.5367e-07,
        1.2046e-02, 1.0159e-03, 1.8436e-02, 1.0025e-02, 1.0021e-02, 9.9802e-04,
        1.0180e-03, 1.7963e-02, 1.7386e-02, 2.6519e-03, 1.0040e-03, 1.9073e-06,
        1.1921e-06, 1.0140e-03, 1.9073e-06, 9.5367e-07, 1.0171e-03, 9.5367e-07,
        1.0000e+00, 1.0000e+00, 1.0001e+00, 9.9998e-01, 1.0000e+00, 9.9993e-01,
        1.0001e+00, 1.0001e+00, 9.9992e-01, 1.0000e+00, 9.9998e-01, 1.0021e+00,
        9.9996e-01, 9.9994e-01, 1.0002e+00])


The `example_data` is a tuple containing a list of 32 (batch_size) Can ID's and the feature vectors defined in the config.

([`tensor containing Can ID's`],[`tensor containing features`])

In [9]:
unique_can_ids = dataset.get_unique_can_ids()
num_can_ids = len(unique_can_ids)
feature_vec_length = ambient_loader.features_len
print(f"Number of CAN IDs: {num_can_ids}")
print(f"Feature vector length: {feature_vec_length-1}") # minus one because the can id is the first

Number of CAN IDs: 105
Feature vector length: 45


In [25]:
# Hyperparameters
embedding_dim = num_can_ids  # embedding dimension should be equal to the number of CAN IDs
lstm_units = 128 # defined in canolo paper
dense_units = 256 # defined in canolo paper
dropout_rate = 0.2 # defined in canolo paper
num_embeddings = max(unique_can_ids) + 1 # not sure why + 1 rn but it works
print(f"Number of embeddings: {num_embeddings}")

# Model
model = CANnoloAutoencoder(embedding_dim, lstm_units, dense_units, dropout_rate, num_embeddings)

# Training parameters
batch_size = ambient_loader.batch_size
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCELoss()  # Binary Cross-Entropy Loss


Number of embeddings: 1789


In [11]:
import time
# time the training
start = time.time()
for i in range(100):
    ambient_loader.__getitem__(0)
end = time.time()
print(f"Time taken: {end-start}")
print(f"Time per batch: {(end-start)/100}")

Time taken: 6.544073104858398
Time per batch: 0.06544073104858399


In [12]:
# Running a forward pass with a batch of data
reconstructed_output = model(test_batch_can_ids, test_feature_vec)

mse_loss = torch.nn.MSELoss()
error = mse_loss(reconstructed_output, test_feature_vec)
print("Reconstruction Error:", error.item())

RuntimeError: Placeholder storage has not been allocated on MPS device!

#### Defining our loss function and optimizer

In [None]:
loss_fn = torch.nn.MSELoss()  # Example loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Example optimizer

In [None]:
time_in_sec = 0.08151132106781006 * ambient_loader.num_batches
print(f"Time in seconds: {time_in_sec}")
time_in_min = time_in_sec / 60
print(f"Time in minutes: {time_in_min}")
time_in_hours = time_in_min / 60
print(f"Time in hours: {time_in_hours}")

In [26]:
PSEUDO_EPOCH_SIZE = 3000

def validate_model(model, validation_loader, loss_fn):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    num_batches_to_validate = 1000
    with torch.no_grad():  # No need to track gradients during validation
        for i, batch in enumerate(validation_loader):
            can_ids, features = batch
            # can_ids, features = can_ids.to('mps'), features.to('mps')
            
            if i == num_batches_to_validate:
                break
            
            # Forward pass: compute the model output
            reconstructed = model(can_ids, features)
            # Compute the loss
            loss = loss_fn(reconstructed, features)  # Ensure correct target is used
            total_loss += loss.item()

    model.train()  # Revert to training mode
    num_processed_batches = validation_loader.batch_size * num_batches_to_validate
    avg_loss = total_loss / num_processed_batches
    return avg_loss

def train_model(model, train_loader, validation_loader, loss_fn, optimizer, num_epochs):
    total_train_loss = 0
    pseudo_epoch = 1
    num_processed_batches_in_epoch = train_loader.batch_size * PSEUDO_EPOCH_SIZE

    model.train()
    for i, batch in enumerate(train_loader):
        can_ids, features = batch
        # can_ids, features = can_ids.to('mps'), features.to('mps')
        print(f"{i}", end="\r")

        # Forward pass: compute the model output
        reconstructed = model(can_ids, features)

        # Compute the loss
        loss = loss_fn(reconstructed, features)  # Ensure correct target is used
        total_train_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()  # Clear existing gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

        if i % PSEUDO_EPOCH_SIZE == 0:

            if i == 0:
                continue

            # Validate model
            validation_loss = validate_model(model, validation_loader, loss_fn)
            print(f"Psuedo Epoch {pseudo_epoch}, Validation Loss: {validation_loss}")

            # Show training progress
            avg_train_loss = total_train_loss / num_processed_batches_in_epoch
            print(f"Epoch {pseudo_epoch-1}, Average Training Loss: {avg_train_loss}")
            
            if pseudo_epoch > num_epochs:
                break
            

            # Save model
            torch.save(model.state_dict(), f'./saved_model/canolo_model_{pseudo_epoch}.pt')

            # save metadata
            total_batches_processed = i

            metadata = {
                "total_batches_processed": total_batches_processed,
                "total_train_loss": total_train_loss,
                "avg_train_loss": avg_train_loss,
                "validation_loss": validation_loss
            }

            with open(f'training_metadata.tsv', 'a') as f:
                f.write('\t'.join(str(metadata[key]) for key in metadata.keys()) + '\n')

            pseudo_epoch += 1
            total_train_loss = 0



num_epochs = 1
import time
# time the training
start = time.time()
train_model(model, ambient_loader, validation_loader, loss_fn, optimizer, num_epochs)
end = time.time()
print(f"Time taken: {end-start}")
print(f"Time per batch: {(end-start)/100}")


28

KeyboardInterrupt: 

In [16]:
Psuedo Epoch 1, Validation Loss: 0.0014687743630202022
Epoch 0, Average Training Loss: 0.002530826270366864
Psuedo Epoch 2, Validation Loss: 0.001387331632229234
Epoch 1, Average Training Loss: 0.0014076253618695773
Time taken: 964.5090320110321
Time per batch: 9.64509032011032

without cuda:

Psuedo Epoch 1, Validation Loss: 0.0020492164343595505
Epoch 0, Average Training Loss: 0.002164360677629399
Psuedo Epoch 2, Validation Loss: 0.0020707388259470464
Epoch 1, Average Training Loss: 0.002073641937187252
Time taken: 985.8024809360504
Time per batch: 9.858024809360504

SyntaxError: invalid syntax (2092159968.py, line 1)

In [24]:
for i in enumerate(ambient_loader.can_data):
    len(ambient_loader.can_data[i])/32

3341.78125

In [17]:
((len(ambient_loader)/3000) * 6) /60

18.8549

In [60]:
# # Step 1: Initialize the model
# # Hyperparameters
# embedding_dim = num_can_ids  # embedding dimension should be equal to the number of CAN IDs
# lstm_units = 128 # defined in canolo paper
# dense_units = 256 # defined in canolo paper
# dropout_rate = 0.2 # defined in canolo paper
# num_embeddings = max(unique_can_ids) + 1 # not sure why + 1 rn but it works

# # Model
# model2 = CANnoloAutoencoder(embedding_dim, lstm_units, dense_units, dropout_rate, num_embeddings)

# # Step 2: Load the state dictionary
# state_dict = torch.load("./saved_model/canolo_model_1.pt")
# model2.load_state_dict(state_dict)

# # If you want to use the model for inference, switch to evaluation mode
# model2.eval()


CANnoloAutoencoder(
  (embedding): Embedding(1789, 105)
  (encoder_dense): Linear(in_features=150, out_features=256, bias=True)
  (encoder_dropout): Dropout(p=0.2, inplace=False)
  (encoder_lstm): LSTM(256, 128, num_layers=2, batch_first=True)
  (decoder_lstm): LSTM(128, 128, num_layers=2, batch_first=True)
  (decoder_dense): Linear(in_features=128, out_features=45, bias=True)
  (decoder_output): Sigmoid()
)