In [1]:
import tensorflow

In [2]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

import random
from pathlib import Path
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

In [3]:
HTML_BASE_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text"
PHISH_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text/phishing_site_1"
GENUINE_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text/genuine_site_0"

MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 1
LR = 2e-5
SEED = 42

MODEL_SAVE_PATH = "website_phishing_model.pt"

torch.manual_seed(SEED)

<torch._C.Generator at 0x2e45d731630>

In [1]:
import random
import numpy as np
import torch

def fix_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [2]:
from bs4 import BeautifulSoup

def html_to_text(content: str) -> str:
    soup = BeautifulSoup(content, "lxml")

    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text = soup.get_text(separator=" ")
    return " ".join(text.split())

In [3]:
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup

def html_to_text(content: str) -> str:
    soup = BeautifulSoup(content, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator=" ")
    return " ".join(text.split())


def read_text_files_from_dir(directory: Path, label: int):
    records = []

    for file in directory.iterdir():
        if not file.is_file():
            continue

        raw = file.read_text(errors="ignore").strip()
        if len(raw) < 20:
            continue

        # Detect HTML vs plain text
        if "<html" in raw.lower() or "<body" in raw.lower():
            clean_text = html_to_text(raw)
        else:
            clean_text = raw

        if len(clean_text) > 20:
            records.append({
                "text": clean_text,
                "label": label
            })

    return pd.DataFrame(records)

In [4]:
from sklearn.utils import shuffle

SEED = 42
fix_seed(SEED)

DATA_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text"
GENUINE_SUBFOLDER = "C:/Users/parth/Desktop/phising_Website Detection/website + Text/genuine_site_0"
PHISH_SUBFOLDER = "C:/Users/parth/Desktop/phising_Website Detection/website + Text/phishing_site_1"

base = Path(DATA_DIR)

df_genuine = read_text_files_from_dir(base / GENUINE_SUBFOLDER, 0)
df_phish = read_text_files_from_dir(base / PHISH_SUBFOLDER, 1)

print("Genuine samples:", len(df_genuine))   # ≈ 1312 + 40
print("Phishing samples:", len(df_phish))    # ≈ 553 + 28

df = pd.concat([df_genuine, df_phish], ignore_index=True)
df = shuffle(df, random_state=SEED).reset_index(drop=True)

print("Final dataset shape:", df.shape)
df.head()

  soup = BeautifulSoup(content, "lxml")


Genuine samples: 1221
Phishing samples: 500
Final dataset shape: (1721, 2)


Unnamed: 0,text,label
0,Ranker vote on everything Watchworthy Weird Hi...,0
1,RFC 1097 - Telnet subliminal-message option Li...,0
2,Sean's Home Page Sean Maschue's personal home ...,0
3,å…¬å¹³å…¬æ­£-ä¸–ç•Œæ¯-NBAå®˜æ–¹èµžåŠ©,1
4,Page not found â€“ Cricketdiane's Weblog Skip ...,0


In [5]:
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 1
LR = 2e-5
SEED = 42

MODEL_SAVE_PATH = "website_phishing_model.pt"

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [10]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

import random
from pathlib import Path
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

In [11]:

from sklearn.model_selection import train_test_split

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=SEED,
    stratify=df["label"]
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))

Train size: 1376
Validation size: 345


In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [14]:
class WebsiteDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [15]:
train_loader = DataLoader(
    WebsiteDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_loader = DataLoader(
    WebsiteDataset(X_val, y_val),
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
).to(device)

optimizer = AdamW(model.parameters(), lr=LR)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
best_f1 = 0.0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    print("Train Loss:", total_loss / len(train_loader))

    # ---------- Validation ----------
    model.eval()
    preds, trues = [], []

    with torch.no_grad():
        for batch in val_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(ids, mask).logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())

    p, r, f1, _ = precision_recall_fscore_support(trues, preds, average="binary")
    acc = accuracy_score(trues, preds)

    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation F1-score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print("✅ Best model saved")

  attn_output = torch.nn.functional.scaled_dot_product_attention(



Epoch 1/1
Train Loss: 0.4352095525191967
Validation Accuracy: 0.8348
Validation F1-score: 0.6919
✅ Best model saved


In [18]:
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
model.eval()

print("\nFinal Website Model Evaluation:\n")
print("Accuracy:", accuracy_score(trues, preds))
print(classification_report(
    trues,
    preds,
    target_names=["Genuine Website", "Phishing Website"]
))


Final Website Model Evaluation:

Accuracy: 0.8347826086956521
                  precision    recall  f1-score   support

 Genuine Website       0.86      0.91      0.89       245
Phishing Website       0.75      0.64      0.69       100

        accuracy                           0.83       345
       macro avg       0.81      0.78      0.79       345
    weighted avg       0.83      0.83      0.83       345

