In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, confusion_matrix
from transformers import AutoTokenizer, AutoModel
import re
import string
from tqdm import tqdm

In [2]:
MODEL_PATH = "best_phishing_model.pt"
MODEL_NAME = "prajjwal1/bert-mini"
BATCH_SIZE = 16
MAX_LEN = 128

In [3]:
def clean_text(text):
    if not isinstance(text, str):
        return ''
    # Remove non-printable characters
    text = ''.join(char for char in text if char in string.printable)
    # Remove extra whitespace and control characters
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
df = pd.read_csv("dataset/test_urls.csv")

# Convert 'type' to binary label (1 = phishing, 0 = safe)
df['label'] = df['type'].apply(lambda x: 1 if x == "phishing" else 0)

# Extract domain
def extract_domain(url):
    return "https" if url.startswith("https") else "http"

df['url'] = df['url'].apply(clean_text)
df['Domain'] = df['url'].apply(extract_domain)
# Title is missing, use empty string
df['Title'] = ''
df['text'] = df['url'] + ' ' + df['Domain'] + ' ' + df['Title']

In [5]:
class PhishingDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(text,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len,
                                  return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [6]:
class PhishingDetector(nn.Module):
    def __init__(self, model_name, num_labels):
        super(PhishingDetector, self).__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_output)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = PhishingDetector(MODEL_NAME, num_labels=2)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.to(device)
model.eval()

# Create DataLoader
dataset = PhishingDataset(df['text'].tolist(), df['label'].tolist(), tokenizer)
loader = DataLoader(dataset, batch_size=BATCH_SIZE)

predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(loader, desc='Inferencing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

Using device: cuda


Inferencing: 100%|██████████| 40700/40700 [05:52<00:00, 115.41it/s]


In [10]:
df['predicted_label'] = predictions
acc = accuracy_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f"\nInference Accuracy: {acc:.4f}")
print("Confusion Matrix:\n", conf_matrix)


Inference Accuracy: 0.8590
Confusion Matrix:
 [[556832    248]
 [ 91547   2564]]
