## Required Libraries

 ## pip uninstall -y tensorflow tensorflow-intel keras
## pip install --upgrade pip
## pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
## pip install transformers==4.36.2 accelerate sentencepiece

In [2]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

import random
import time
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.utils import shuffle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Folder data
HTML_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text"
GENUINE_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text/genuine_site_0"
PHISH_DIR = "C:/Users/parth/Desktop/phising_Website Detection/website + Text/phishing_site_1"

# Separate CSVs
URL_CSV = "C:/Users/parth/Desktop/phising_Website Detection/Phish_URL.csv"
EMAIL_CSV = "C:/Users/parth/Desktop/phising_Website Detection/Phishing_Email.csv"

# Model hyperparams
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 3
LR = 2e-5
VAL_SPLIT = 0.2
SEED = 42

OUT_DIR = "model_output"
os.makedirs(OUT_DIR, exist_ok=True)

In [4]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [5]:
def html_to_text(html):
    if not isinstance(html, str) or len(html.strip()) == 0:
        return ""
    soup = BeautifulSoup(html, "lxml")
    for bad in soup(["script", "style", "noscript"]):
        bad.decompose()
    text = soup.get_text(" ")
    return " ".join(text.split())

In [6]:
def load_website_dataset():
    genuine_path = Path(HTML_DIR) / GENUINE_DIR
    phish_path = Path(HTML_DIR) / PHISH_DIR

    data = []

    # load genuine
    for f in genuine_path.glob("*.txt"):
        html = f.read_text(errors="ignore")
        url = f.stem.split("_")[0]
        data.append([url, html, 0])

    # load phishing
    for f in phish_path.glob("*.txt"):
        html = f.read_text(errors="ignore")
        url = f.stem.split("_")[0]
        data.append([url, html, 1])

    df = pd.DataFrame(data, columns=["url", "website_html", "label"])
    return df

In [10]:
def load_url_dataset():
    path = Path(URL_CSV)
    if not path.exists():
        print("URL dataset missing.")
        return pd.DataFrame(columns=["url","website_html","email_text","label"])

    df = pd.read_csv(path)

    # The real URL column is "URL" (uppercase)
    if "URL" not in df.columns or "label" not in df.columns:
        raise KeyError("Phish_URL.csv must contain 'URL' and 'label' columns")

    df2 = pd.DataFrame()
    df2["url"] = df["URL"].astype(str)
    df2["label"] = df["label"].astype(int)
    df2["website_html"] = ""
    df2["email_text"] = ""

    return df2

In [13]:
def load_email_dataset():
    path = Path(EMAIL_CSV)
    if not path.exists():
        print("Email dataset not found.")
        return pd.DataFrame(columns=["url","website_html","email_text","label"])

    df = pd.read_csv(path)

    # Extract email content
    if "Email Text" not in df.columns:
        raise KeyError("Column 'Email Text' not found in Email CSV.")

    df_out = pd.DataFrame()
    df_out["email_text"] = df["Email Text"].astype(str)

    # Convert label text → numeric
    if "Email Type" not in df.columns:
        raise KeyError("Column 'Email Type' not found in Email CSV.")

    df["Email Type"] = df["Email Type"].astype(str)

    df_out["label"] = df["Email Type"].apply(lambda x: 1 if "phish" in x.lower() else 0)

    # Add placeholder columns to match the format
    df_out["url"] = ""
    df_out["website_html"] = ""

    return df_out[["url", "website_html", "email_text", "label"]]

In [14]:
set_seed(SEED)

df_html = load_website_dataset()
df_url = load_url_dataset()
df_email = load_email_dataset()

df = pd.concat([df_html, df_url, df_email], ignore_index=True).fillna("")
df = shuffle(df, random_state=SEED).reset_index(drop=True)


# Merge all into one master dataframe
frames = []

df_html["email_text"] = ""
frames.append(df_html)

frames.append(df_url)

frames.append(df_email)

df = pd.concat(frames, ignore_index=True).fillna("")
df = shuffle(df, random_state=SEED).reset_index(drop=True)

In [15]:
df["html_text"] = df["website_html"].apply(html_to_text)

df["content_text"] = (
    df["email_text"].str.strip() + " " +
    df["html_text"].str.strip()
).str.strip()

# If empty, fallback to URL text
df["content_text"] = df.apply(
    lambda r: r["content_text"] if len(r["content_text"]) > 5 else r["url"],
    axis=1
)

In [16]:
df = df[df["content_text"].str.len() > 5].reset_index(drop=True)

In [17]:
X_train, X_val, y_train, y_val = train_test_split(
    df["content_text"].tolist(),
    df["label"].tolist(),
    test_size=VAL_SPLIT,
    random_state=SEED,
    stratify=df["label"]
)

In [18]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [19]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_ds = TextDataset(X_train, y_train, tokenizer, MAX_LEN)
val_ds = TextDataset(X_val, y_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
).to(device)

ImportError: Traceback (most recent call last):
  File "C:\Users\parth\AppData\Local\Programs\Python\Python312\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [None]:
optimizer = AdamW(model.parameters(), lr=LR)

train_losses = []
val_losses = []
val_f1s = []
best_f1 = 0
best_model_path = f"{OUT_DIR}/best_model.pt"

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        output = model(
            input_ids=input_ids,
            attention_mask=mask,
            labels=labels
        )

        loss = output.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_losses.append(total_loss / len(train_loader))

    # validation
    model.eval()
    preds = []
    trues = []
    val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            output = model(
                input_ids=input_ids,
                attention_mask=mask,
                labels=labels
            )

            val_loss += output.loss.item()

            logits = output.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())

    val_losses.append(val_loss / len(val_loader))

    p, r, f1, _ = precision_recall_fscore_support(trues, preds, average="binary")
    val_f1s.append(f1)

    print(f"EPOCH {epoch+1} → Train Loss={train_losses[-1]:.4f} | Val F1={f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), best_model_path)
        print("Best Model Saved.")



In [None]:
model.load_state_dict(torch.load(best_model_path))

print("\nValidation Metrics:")
print(classification_report(trues, preds))
