In [None]:
!pip install transformers pandas scikit-learn torch

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def batch_tokenize(texts, labels, tokenizer, batch_size=10000):
    all_encodings = {"input_ids": [], "attention_mask": []}
    all_labels = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        batch_enc = tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt")
        all_encodings["input_ids"].extend(batch_enc["input_ids"])
        all_encodings["attention_mask"].extend(batch_enc["attention_mask"])
        all_labels.extend(batch_labels)
    return all_encodings, all_labels

class PhishingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            key: self.encodings[key][idx] for key in self.encodings
        } | {"labels": torch.tensor(self.labels[idx])}
    def __len__(self):
        return len(self.labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [5]:
dataset_path = "/content/drive/MyDrive/phishing_chunks/phishing_urls_part2.csv"
df = pd.read_csv(dataset_path)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["url"].tolist(), df["label"].tolist(),
    test_size=0.2, stratify=df["label"], random_state=42
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings, train_labels_final = batch_tokenize(train_texts, train_labels, tokenizer)
test_encodings, test_labels_final = batch_tokenize(test_texts, test_labels, tokenizer)
train_dataset = PhishingDataset(train_encodings, train_labels_final)
test_dataset = PhishingDataset(test_encodings, test_labels_final)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
model_path = "/content/drive/MyDrive/phishing_model_checkpoint"
try:
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
except:
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir=model_path,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/content/drive/MyDrive/phishing_logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


In [9]:
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,0.0,1.0,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.0,
 'eval_accuracy': 1.0,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_runtime': 223.0391,
 'eval_samples_per_second': 150.27,
 'eval_steps_per_second': 4.699,
 'epoch': 1.0}

In [9]:
trainer.save_model("/content/drive/MyDrive/phishing_model_v2_after_phase2")

In [10]:
import pandas as pd
df2 = pd.read_csv("/content/drive/MyDrive/phishing_chunks/phishing_urls_part2.csv")
print(df2["label"].value_counts())

label
0    167580
Name: count, dtype: int64


In [11]:
import pandas as pd
df2 = pd.read_csv("/content/drive/MyDrive/phishing_chunks/phishing_urls_part3.csv")
print(df2["label"].value_counts())


label
0    138376
1     29205
Name: count, dtype: int64


In [12]:
import pandas as pd

chunk2 = pd.read_csv("/content/drive/MyDrive/phishing_chunks/phishing_urls_part2.csv")
chunk3 = pd.read_csv("/content/drive/MyDrive/phishing_chunks/phishing_urls_part3.csv")

combined = pd.concat([chunk2, chunk3], ignore_index=True)

In [13]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    combined,
    test_size=0.2,
    stratify=combined["label"],
    random_state=42
)

In [14]:
train_df.to_csv("/content/drive/MyDrive/phishing_chunks/phase2_train.csv", index=False)
test_df.to_csv("/content/drive/MyDrive/phishing_chunks/phase2_test.csv", index=False)

In [15]:
combined.to_csv("/content/drive/MyDrive/phishing_chunks/phase2_combined.csv", index=False)