In [9]:
import torch
import torch.nn.utils.prune as prune
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_scheduler,DebertaTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn

# Step 1: Load Dataset
csv_file = "USB_Project/usb_capture/filtered_bulk.csv"
data = pd.read_csv(csv_file)

# Extract Benign and Malicious Data
data_benign = data[data['Label_Desc'] == 'Benign'].sample(n=5000, random_state=42)
data_malicious = data[data['Label_Desc'] == 'Attack'].sample(n=5000, random_state=42)
data_combined = pd.concat([data_benign, data_malicious])

# Extract Numeric Features and Labels
numeric_data = data_combined[['fw_fl_byt_s', 'bw_fl_byt_s', 'fw_fl_pkt_s', 'bw_fl_pkt_s', 'fw_pkt_s', 'bw_pkt_s']].values
labels = data_combined['Label_code'].values

# Normalize Data
feature_scaler = MinMaxScaler()
normalized_data = feature_scaler.fit_transform(numeric_data)
scaler = GradScaler()

# Convert Numeric Data to Textual Format
def convert_numeric_to_text(row):
    return " ".join([f"feature_{i}: {val:.6f}" for i, val in enumerate(row)])

text_data = [convert_numeric_to_text(sample) for sample in normalized_data]

# Step 2: Few-Shot Learning Setup
few_shot_size = 1000
few_shot_indices = np.random.choice(len(text_data), few_shot_size, replace=False)
few_shot_texts = [text_data[i] for i in few_shot_indices]
few_shot_labels = [labels[i] for i in few_shot_indices]

train_texts, test_texts, train_labels, test_labels = train_test_split(
    few_shot_texts, few_shot_labels, test_size=0.2, random_state=42, shuffle=True
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42, shuffle=True
)

# Step 3: Tokenization
model_name = "microsoft/deberta-v3-small"
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Step 4: Create DataLoaders
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)
test_dataset = CustomDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Step 5: Load Pretrained Model
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Step 6: Apply Layer-Wise Pruning (40% on Linear Layers)
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        # prune.l1_unstructured(module, name="weight", amount=0.25)
        prune.ln_structured(module, name="weight", amount=0.3, n=1, dim=0)
        prune.remove(module, "weight")

model.train()

# Step 7: Training Setup
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.001)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([0.6, 1.4]).to(device))
scaler = GradScaler()

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss, correct = 0, 0
    for batch in tqdm(data_loader, desc="Training"):
        input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)
        optimizer.zero_grad()
        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
        total_loss += loss.item()
    return correct / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    correct, total_loss = 0, 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            total_loss += loss.item()
    return correct / len(data_loader.dataset), total_loss / len(data_loader)

# Training Loop
for epoch in range(6):
    print(f'Epoch {epoch + 1}/3')
    train_acc, train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    val_acc, val_loss = eval_model(model, val_loader, loss_fn, device)
    print(f'Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}')
    print(f'Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}')

# Step 8: Evaluate on Test Set
test_acc, test_loss = eval_model(model, test_loader, loss_fn, device)
print(f'Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training:   8%|▊         | 15/180 [00:37<06:53,  2.51s/it]


KeyboardInterrupt: 