In [13]:
pip install torch torchvision torchaudio transformers seaborn

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
!pip install numpy scipy pandas scikit-learn

Collecting scipy
  Downloading scipy-1.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.1 scipy-1.11.2 threadpoolctl-3.2.0
[0m

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('UNSW_NB15_training-set.csv')
testdata = pd.read_csv('UNSW_NB15_testing-set.csv')

data = data.dropna()
testdata = testdata.dropna()

data = data.drop("attack_cat", axis=1)
testdata = testdata.drop("attack_cat",axis=1)

selected_columns = ['proto', 'service', 'state', 'sbytes', 'srcip', 'dstip', 'sport','dport','sttl','dttl','service']  # Add more columns as needed.

def row_to_text(row):
    text_pieces = ["[CLS]"]
    for col in row.index:
        if col not in ['attack_cat']:
            text_pieces.append(f"The {col} is {row[col]}")
    text_pieces.append("[SEP]")
    return ' '.join(text_pieces)

data['texts'] = data.apply(row_to_text, axis=1)
testdata['texts'] = testdata.apply(row_to_text, axis = 1)

le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])
testdata['label'] = le.fit_transform(testdata['label'])

train_texts, test_texts, train_labels, test_labels = train_test_split(data['texts'], data['label'], test_size=0.2)

print(train_texts.head())

13285    [CLS] The id is 13286 The dur is 8e-06 The pro...
1622     [CLS] The id is 1623 The dur is 1.591541 The p...
198      [CLS] The id is 199 The dur is 18.44595 The pr...
12051    [CLS] The id is 12052 The dur is 8e-06 The pro...
8778     [CLS] The id is 8779 The dur is 2.167147 The p...
Name: texts, dtype: object


In [25]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [26]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [27]:
texts = list(data['texts'])
labels = list(data['label'])

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))

max_length = 256
batch_size = 32

train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextDataset(list(testdata['texts']), list(testdata['label']), tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# # Setup GPU
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Setup optimizer and scheduler
# optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# total_steps = len(train_loader) * 3  # Number of epochs
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0,
#     num_training_steps=total_steps
# )

# # Training Loop
# for epoch in range(3):  # 3 epochs, modify as needed
#     model.train()
#     for batch in tqdm(train_loader):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label'].to(device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
#         optimizer.step()
#         scheduler.step()
#         optimizer.zero_grad()

#     # Evaluate on validation set (omitted for brevity, please implement this part based on your needs)

# print("Training complete!")

import numpy as np
from sklearn.metrics import accuracy_score
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

N_EPOCHS = 3

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(N_EPOCHS):  
    model.train()
    train_loss = 0.0
    true_labels_train = []
    pred_labels_train = []
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        train_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        
        _, preds = torch.max(outputs.logits, dim=1)
        true_labels_train.extend(labels.cpu().numpy())
        pred_labels_train.extend(preds.cpu().numpy())

    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = accuracy_score(true_labels_train, pred_labels_train)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    
    model.eval()
    test_loss = 0.0
    true_labels_test = []
    pred_labels_test = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            test_loss += loss.item()
            
            _, preds = torch.max(outputs.logits, dim=1)
            true_labels_test.extend(labels.cpu().numpy())
            pred_labels_test.extend(preds.cpu().numpy())

    avg_test_loss = test_loss / len(test_loader)
    test_accuracy = accuracy_score(true_labels_test, pred_labels_test)
    print(f"Epoch {epoch+1}, Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

print("Training complete!")

  0%|          | 0/625 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 625/625 [03:08<00:00,  3.32it/s]


Epoch 1, Train Loss: 0.0468, Train Accuracy: 0.9879


100%|██████████| 157/157 [00:21<00:00,  7.21it/s]


Epoch 1, Test Loss: 0.0420, Test Accuracy: 0.9930


100%|██████████| 625/625 [03:12<00:00,  3.24it/s]


Epoch 2, Train Loss: 0.0131, Train Accuracy: 0.9979


100%|██████████| 157/157 [00:21<00:00,  7.20it/s]


Epoch 2, Test Loss: 0.0329, Test Accuracy: 0.9940


100%|██████████| 625/625 [03:12<00:00,  3.25it/s]


Epoch 3, Train Loss: 0.0101, Train Accuracy: 0.9984


100%|██████████| 157/157 [00:21<00:00,  7.19it/s]

Epoch 3, Test Loss: 0.0322, Test Accuracy: 0.9942
Training complete!





In [None]:
import torch
torch.save(model.state_dict(), "Trained_Model.pth")