In [1]:
import tensorflow

In [2]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

import random
import time
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.utils import shuffle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
URL_CSV = "C:/Users/parth/Desktop/phising_Website Detection/Phish_URL.csv"
EMAIL_CSV = "C:/Users/parth/Desktop/phising_Website Detection/Phishing_Email.csv"

MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 128
BATCH_SIZE = 8
EPOCHS = 3
LR = 2e-5
SEED = 42

torch.manual_seed(SEED)


<torch._C.Generator at 0x1cf90810270>

In [7]:
def load_url_dataset():
    df = pd.read_csv(URL_CSV)

    # Map required columns
    df_url = pd.DataFrame()
    df_url["text"] = df["URL"].astype(str)          # NLP Input
    df_url["label"] = df["label"].astype(int)       # Target

    return df_url

In [8]:
def load_email_dataset():
    df = pd.read_csv(EMAIL_CSV)

    df_email = pd.DataFrame()
    df_email["text"] = df["Email Text"].astype(str)

    df_email["label"] = df["Email Type"].apply(
        lambda x: 1 if "phishing" in x.lower() else 0
    )

    return df_email

In [9]:
df_url = load_url_dataset()
df_email = load_email_dataset()

df = pd.concat([df_url, df_email], ignore_index=True)
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

print(df.head())
print(df["label"].value_counts())

                             text  label
0     https://bikp11-gth.web.app/      0
1       http://www.pornhouse.mobi      0
2          https://www.blocku.com      1
3       https://www.class1895.com      1
4  https://www.countyleague.co.uk      1
label
1    142178
0    112267
Name: count, dtype: int64


In [10]:
texts = df["text"].tolist()
labels = df["label"].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED, stratify=labels
)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_ds = TextDataset(X_train, y_train)
val_ds = TextDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
).to(device)

optimizer = AdamW(model.parameters(), lr=LR)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
best_f1 = 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        ids = batch["input_ids"].to(device)
        msk = batch["attention_mask"].to(device)
        lbl = batch["labels"].to(device)

        out = model(ids, msk, labels=lbl)
        loss = out.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    preds, trues = [], []

    with torch.no_grad():
        for batch in val_loader:
            ids = batch["input_ids"].to(device)
            msk = batch["attention_mask"].to(device)
            lbl = batch["labels"].to(device)

            logits = model(ids, msk).logits
            preds.extend(torch.argmax(logits, 1).cpu().numpy())
            trues.extend(lbl.cpu().numpy())

    p, r, f1, _ = precision_recall_fscore_support(trues, preds, average="binary")
    print("Validation F1:", f1)

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_model.pt")
        print("Model saved with F1 =", f1)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
print("\nFinal Evaluation Report:")
print(classification_report(trues, preds))

## website model