In [2]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os

### Load data

In [4]:
df = pd.read_csv("dataset/PhiUSIIL.csv")

# Drop rows with missing labels
df = df.dropna(subset=["label"])

In [5]:
class PhishingDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len,
                                  return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Use URL + Title + Domain as combined text
df['text'] = df['URL'].astype(str) + ' ' + df['Domain'].astype(str) + ' ' + df['Title'].astype(str)

# Label should be numeric
df['label'] = df['label'].astype(int)

# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Load tokenizer and model
model_name = "prajjwal1/bert-mini"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = PhishingDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = PhishingDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [11]:
class PhishingDetector(nn.Module):
    def __init__(self, model_name, num_labels):
        super(PhishingDetector, self).__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # Take CLS token
        return self.classifier(cls_output)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PhishingDetector(model_name, num_labels=2)
model.to(device)

PhishingDetector(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elem

In [12]:
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

best_accuracy = 0
model_path = "best_phishing_model.pt"

for epoch in range(4):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, predictions)
    print(f"Epoch {epoch+1}: Train Loss={train_loss/len(train_loader):.4f}, Val Accuracy={acc:.4f}")

    # Save the best model
    if acc > best_accuracy:
        torch.save(model.state_dict(), model_path)
        best_accuracy = acc

Epoch 1: Train Loss=0.0124, Val Accuracy=0.9985
Epoch 2: Train Loss=0.0071, Val Accuracy=0.9983
Epoch 3: Train Loss=0.0057, Val Accuracy=0.9982
Epoch 4: Train Loss=0.0046, Val Accuracy=0.9981


In [13]:
print("\nBest Validation Accuracy:", best_accuracy)
conf_matrix = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:\n", conf_matrix)


Best Validation Accuracy: 0.9984520452087619

Confusion Matrix:
 [[20073    51]
 [   40 26995]]
