In [None]:
!pip install pandas scikit-learn

In [14]:
import pandas as pd
import numpy as np
import os

In [26]:
# Input locations, given relative to the notebook's working directory.
kaggle_csv_path, openphish_txt_path = (
    "../data/urls/phishing_site_urls.csv",  # Kaggle table, loaded with pd.read_csv
    "../data/urls/feed.txt",                # OpenPhish feed, read as plain text lines
)

In [38]:
# Read the Kaggle URL table into a DataFrame.
kaggle_df = pd.read_csv(kaggle_csv_path)

# Structural summary: a heading, then the dimensions and the column index.
for summary_item in ("Kaggle Dataset:", kaggle_df.shape, kaggle_df.columns):
    print(summary_item)

# Last expression of the cell: rich display of the first rows.
kaggle_df.head()

Kaggle Dataset:
(549346, 2)
Index(['URL', 'Label'], dtype='object')


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [39]:
# Class balance of the raw text labels (before any mapping).
label_counts = kaggle_df["Label"].value_counts()
print(label_counts)

Label
good    392924
bad     156422
Name: count, dtype: int64


In [40]:
# Rename columns to lowercase for consistency (a no-op if already lowercase).
kaggle_df = kaggle_df.rename(columns=str.lower)

# Convert text labels to binary: bad → 1 (phishing), good → 0 (legitimate).
# Any other label value maps to NaN and is dropped below.
label_map = {
    "bad": 1,
    "good": 0
}

# Only map while the column still holds text. This keeps the cell re-runnable:
# on a second execution the labels are already integers, and calling the
# `.str` accessor on them would raise.
labels = kaggle_df["label"]
if not pd.api.types.is_numeric_dtype(labels):
    labels = labels.str.lower().map(label_map)

kaggle_df = (
    kaggle_df
    .assign(label=labels)
    .dropna(subset=["label"])   # drop rows with missing or unrecognized labels
    .astype({"label": int})     # ensure labels are integers
)

# View label distribution
kaggle_df["label"].value_counts()


label
0    392924
1    156422
Name: count, dtype: int64

In [41]:
import re

def normalize_url(url):
    """Return a canonical form of ``url`` for de-duplication.

    The value is stripped of surrounding whitespace, lowercased, has a leading
    ``http://`` or ``https://`` removed, and has trailing slashes removed.

    Args:
        url: The URL text. Non-string values (for example ``None`` or a float
            NaN from a missing cell) are returned unchanged so that a later
            ``dropna`` can remove them instead of this function raising.

    Returns:
        The normalized string, or the original object if it is not a string.
    """
    if not isinstance(url, str):
        # Missing values have no str methods; pass them through untouched.
        return url
    # Remove http/https protocol (only when it is at the very start).
    without_scheme = re.sub(r'^https?://', '', url.strip().lower())
    # Remove trailing slashes
    return without_scheme.rstrip('/')


In [42]:
# After mapping but before combining
print("Before combining:")

# Drop rows with a missing URL or label first, so the normalizer only ever
# receives real values (previously this ran after normalization, too late to help).
kaggle_df = kaggle_df.dropna(subset=["url", "label"])

# Normalize Kaggle URLs
kaggle_df = kaggle_df.assign(url=kaggle_df["url"].apply(normalize_url))

# Drop duplicates within Kaggle before combining.
# The first occurrence of each URL (and its label) is the one kept.
kaggle_df = kaggle_df.drop_duplicates(subset="url")

print("Kaggle dataset after normalization:", kaggle_df.shape)

print(kaggle_df["label"].value_counts(dropna=False))


Before combining:
Kaggle dataset after normalization: (502242, 2)
label
0    392896
1    109346
Name: count, dtype: int64


In [43]:
# Load OpenPhish feed.txt (one URL per line).
# The encoding is explicit so the result does not depend on the platform
# default, and blank lines are skipped so they do not become empty "URLs"
# labelled as phishing.
with open(openphish_txt_path, "r", encoding="utf-8") as f:
    urls = [line for line in f.read().splitlines() if line.strip()]

# Create DataFrame and label them as phishing
openphish_df = pd.DataFrame(urls, columns=["url"])
openphish_df["label"] = 1

print("OpenPhish Dataset:")
print(openphish_df.shape)
openphish_df.head()


OpenPhish Dataset:
(500, 2)


Unnamed: 0,url,label
0,https://instaproapk.su/,1
1,https://soeaung.soeaung311092.workers.dev/,1
2,http://unbouncepages.com/meta-business-support...,1
3,http://tested-polar-ground.glitch.me/,1
4,http://instagram506308.blogspot.com/,1


In [44]:
# Canonicalize the OpenPhish URLs, then keep one row per distinct URL
# (the first occurrence wins).
openphish_df = (
    openphish_df
    .assign(url=lambda frame: frame["url"].map(normalize_url))
    .drop_duplicates(subset="url")
)

print("OpenPhish dataset after normalization:", openphish_df.shape)


OpenPhish dataset after normalization: (499, 2)


In [45]:
# Stack the two sources (Kaggle rows first), then de-duplicate across sources
# and discard rows missing either field. Because the first occurrence is kept,
# a URL present in both sources retains its Kaggle row.
combined_df = (
    pd.concat([kaggle_df, openphish_df], ignore_index=True)
    .drop_duplicates(subset="url")
    .dropna(subset=["url", "label"])
)

print("Combined dataset shape:", combined_df.shape)
print("Label distribution:")
combined_label_counts = combined_df["label"].value_counts()
print(combined_label_counts)


Combined dataset shape: (502741, 2)
Label distribution:
label
0    392896
1    109845
Name: count, dtype: int64


In [46]:

# Show the merged frame's dimensions, then display its class balance
# as the cell's output value.
print("Combined dataset shape:", combined_df.shape)
combined_labels = combined_df["label"]
combined_labels.value_counts()

Combined dataset shape: (502741, 2)


label
0    392896
1    109845
Name: count, dtype: int64

In [47]:
# Persist the merged, cleaned frame as CSV (row index omitted).
combined_path = "../data/urls/phishing_urls_combined_cleaned.csv"
combined_df.to_csv(combined_path, index=False)
print("Saved combined cleaned dataset to " + combined_path)


Saved combined cleaned dataset to ../data/urls/phishing_urls_combined_cleaned.csv


In [None]:
!pip install transformers

In [49]:
import pandas as pd

# Load the final cleaned dataset.
# keep_default_na=False stops pandas from turning URL text such as "null",
# "nan" or "n/a" into float NaN on reload; dtype pins `url` to str so every
# value handed to the tokenizer later is a string.
df = pd.read_csv(
    "../data/urls/phishing_urls_combined_cleaned.csv",
    dtype={"url": str},
    keep_default_na=False,
)

# Confirm structure
print(df.shape)
df.head()

(502741, 2)


Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
3,mail.printakid.com/www.online.americanexpress....,1
4,thewhiskeydregs.com/wp-content/themes/widescre...,1


In [50]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label column so both parts keep the same
# phishing/legitimate ratio; the fixed random_state makes it repeatable.
url_column = df["url"]
label_column = df["label"]

train_texts, test_texts, train_labels, test_labels = train_test_split(
    url_column.tolist(),
    label_column.tolist(),
    stratify=label_column,
    test_size=0.2,
    random_state=42,
)

In [52]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [53]:
# Tokenize both splits with identical settings.
# NOTE(review): padding=True presumably pads every sequence to the longest one
# in the whole call, which may be memory-heavy for a split this large — confirm
# against the transformers padding documentation.
tokenize_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

In [54]:
import torch

class PhishingURLDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is any mapping-like object exposing ``.items()`` whose values
    are indexable per example; ``labels`` is an indexable sequence of the same
    length. Each item is a dict of tensors: one entry per encoding field plus a
    ``"labels"`` entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split, together with its labels, in a dataset object.
train_dataset, test_dataset = (
    PhishingURLDataset(train_encodings, train_labels),
    PhishingURLDataset(test_encodings, test_labels),
)


In [55]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model built on "bert-base-uncased" with a
# two-class output head (phishing vs. legitimate).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
from transformers import TrainingArguments

# Training configuration, collected in one mapping and unpacked below.
training_config = {
    # Where artifacts go
    "output_dir": "../models/url-bert-model",   # where to save the model
    "logging_dir": "../results/url/logs",       # log directory
    # Schedule: evaluate and save once per epoch, log every 50 steps
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_steps": 50,
    "save_total_limit": 2,
    # Optimization
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    "weight_decay": 0.01,
    # Model selection: reload the checkpoint with the best "f1" when done
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
}

training_args = TrainingArguments(**training_config)


In [57]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The logits
            must support ``.argmax(axis=1)``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``; the last three use ``average='binary'``.
    """
    scores, gold = eval_pred
    # Predicted class = index of the largest logit in each row.
    predicted = scores.argmax(axis=1)
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    metrics['f1'] = prf[2]
    return metrics

In [58]:
from transformers import Trainer

# Assemble the Trainer from the model, arguments, datasets and metric function.
# NOTE(review): the evaluation set here is the held-out test split, and the
# training arguments select the best checkpoint on it — consider a separate
# validation split so the final test score stays unbiased.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Start training with the Trainer built in the previous cell. The call's return
# value is the last expression of the cell, so it is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
