In [33]:
import os

# Turn off Hugging Face tokenizers' internal parallelism. This lives in the
# first cell so the setting is in place before any tokenizer is loaded.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})


In [None]:
!pip -q install "transformers==4.43.3" "datasets==2.20.0" "accelerate==0.33.0" "evaluate==0.4.2" "scikit-learn==1.4.2" "torch==2.3.1"



In [4]:
import os, numpy as np, pandas as pd, torch
from pathlib import Path
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Locations of the pre-split CSV files (train / validation / test)
DATA_DIR = Path("cleaned")
TRAIN_CSV = DATA_DIR.joinpath("train_dataset.csv")
VAL_CSV = DATA_DIR.joinpath("val_dataset.csv")
TEST_CSV = DATA_DIR.joinpath("test_dataset.csv")

# Base checkpoint to fine-tune, and the directory that receives all saved artifacts
MODEL_NAME = "distilbert-base-uncased"
ARTIFACTS = Path("artifacts/distilbert_phishing")
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# Integer class id -> human-readable class name
LABEL_NAMES = {0: "Ham", 1: "Phishing"}

# Report which accelerator PyTorch can see: CUDA first, then Apple MPS, else CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Torch device:", device)

# Quick existence check only; the columns and labels are validated when the CSVs are loaded
print(
    "Splits exist? train:", TRAIN_CSV.exists(),
    "val:", VAL_CSV.exists(),
    "test:", TEST_CSV.exists(),
)

Torch device: mps
Splits exist? train: True val: True test: True


In [5]:
import pandas as pd

# Read the three pre-split CSV files into DataFrames
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)
test_df = pd.read_csv(TEST_CSV)

# Every split must provide TEXT and LABEL columns, and LABEL may only contain 0/1.
# For each split, print its shape and the relative frequency of each label.
for split_name, frame in {"train": train_df, "val": val_df, "test": test_df}.items():
    has_required_columns = {"TEXT", "LABEL"} <= set(frame.columns)
    assert has_required_columns, f"{split_name} split missing TEXT/LABEL"

    observed_labels = set(frame["LABEL"].unique())
    assert observed_labels <= {0, 1}, f"{split_name} has non-binary labels"

    label_share = frame["LABEL"].value_counts(normalize=True).round(3).to_dict()
    print(split_name, frame.shape, label_share)

train_df.head(3)


train (4883, 2) {0: 0.826, 1: 0.174}
val (610, 2) {0: 0.826, 1: 0.174}
test (611, 2) {0: 0.825, 1: 0.175}


Unnamed: 0,LABEL,TEXT
0,0,"geeee ... i miss you already, you know ? your ..."
1,0,i dnt wnt to tlk wid u
2,0,haven't left yet so probably gonna be here til...


In [7]:
# Length peek so you can justify MAX_LEN in your report.
# The lengths are kept in a standalone Series so this cell does not mutate train_df
# (re-running it, or later cells that read train_df, see the same two columns).
char_len = train_df["TEXT"].astype(str).str.len().rename("char_len")
print(char_len.describe(percentiles=[.5, .9, .95, .99]))

# Most SMS fit easily in 128 tokens; this is also the length used for
# tokenisation and for the inference checks later in the notebook.
MAX_LEN = 128

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Build Hugging Face Datasets (Trainer likes these).
# The drop(..., errors="ignore") is a no-op on a clean run; it only guards against a
# stale `char_len` column left on train_df by an earlier version of this cell.
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df.drop(columns=["char_len"], errors="ignore"), preserve_index=False),
    "val":   Dataset.from_pandas(val_df, preserve_index=False),
    "test":  Dataset.from_pandas(test_df, preserve_index=False),
})

# Use the checkpoint named in the config cell instead of repeating the literal
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("Tokenizer loaded.")


count    4883.000000
mean       82.607413
std        59.287982
min         2.000000
50%        65.000000
90%       155.000000
95%       161.000000
99%       276.000000
max       910.000000
Name: char_len, dtype: float64
Tokenizer loaded.


In [14]:
# 1) Reduce sequence length and re-tokenise
MAX_LEN = 128


def tokenize_batch(batch):
    """Tokenize the ``TEXT`` field of a batch.

    Sequences are truncated to ``MAX_LEN`` tokens and no padding is applied here.
    Returns the tokenizer's output for the batch.
    """
    texts = batch["TEXT"]
    return tokenizer(texts, padding=False, truncation=True, max_length=MAX_LEN)


# Tokenize every split, then expose the target column under the name "labels"
tokenized = ds.map(tokenize_batch, batched=True).rename_column("LABEL", "labels")

# Keep only the model inputs and the labels in each split
keep_columns = ("input_ids", "attention_mask", "labels")
for split_name in list(tokenized.keys()):
    split_ds = tokenized[split_name]
    surplus = [col for col in split_ds.column_names if col not in keep_columns]
    tokenized[split_name] = split_ds.remove_columns(surplus)

# Return torch tensors when rows are accessed
tokenized.set_format(type="torch")
tokenized


Map: 100%|██████████| 4883/4883 [00:00<00:00, 8503.59 examples/s] 
Map: 100%|██████████| 610/610 [00:00<00:00, 19540.57 examples/s]
Map: 100%|██████████| 611/611 [00:00<00:00, 24318.61 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 4883
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 610
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 611
    })
})

In [9]:
import numpy as np, torch

# Inverse-frequency class weights from the training labels:
#   weight[c] = n_samples / (n_classes * count[c])
# The weights are positional, i.e. class_weights[i] belongs to classes[i].
train_labels = train_df["LABEL"].astype(int).to_numpy()
classes, counts = np.unique(train_labels, return_counts=True)
total = counts.sum()
balanced = total / (len(classes) * counts)
class_weights = torch.as_tensor(balanced, dtype=torch.float)

count_by_class = {cls: n for cls, n in zip(classes, counts)}
print("Class counts  :", count_by_class)
print("Class weights :", class_weights.tolist(), "  # order matches classes:", classes.tolist())


Class counts  : {0: 4033, 1: 850}
Class weights : [0.6053805947303772, 2.8723528385162354]   # order matches classes: [0, 1]


In [19]:
# --- Clear caches & load a fresh model ---
import os, gc, torch
gc.collect()
if torch.backends.mps.is_available():
    try:
        torch.mps.empty_cache()
    except Exception as exc:
        # Best-effort cleanup: keep going, but report the failure instead of hiding it
        print("Could not empty the MPS cache:", exc)

# Asks PyTorch to run operators that are not implemented for MPS on the CPU instead
# (it does not help when MPS runs out of memory).
# NOTE(review): torch is already imported earlier in the notebook and this flag is
# presumably read when torch is loaded, so setting it here may have no effect. Training
# below is forced onto the CPU anyway; move this to the first cell if MPS is ever used.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

# Fresh model from the checkpoint named in the config cell, with one output per class.
# Gradient checkpointing trades extra compute for lower memory use (saves RAM even on CPU).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABEL_NAMES))
model.gradient_checkpointing_enable()

class WeightedTrainer(Trainer):
    """Trainer that uses class-weighted cross-entropy instead of the model's own loss.

    The per-class weights come from the notebook-level ``class_weights`` tensor,
    whose i-th entry is the weight of class id ``i``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; may be a wrapper around the underlying model.
            inputs: Batch mapping that contains ``"labels"`` plus the model inputs.
            return_outputs: If true, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: Accepted for compatibility with transformers releases
                that pass it to ``compute_loss``; not used here.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Labels are withheld from the forward pass so the model does not compute its own loss
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Take the class count from the logits: `model` can be a wrapper (e.g. DataParallel)
        # that does not expose `num_labels`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Compute binary classification metrics from an evaluation prediction.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has one column per class;
            if the predictions arrive as a tuple, its first element is taken as the logits.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (positive class = 1)
        and ``roc_auc`` as plain Python floats. ``roc_auc`` is NaN when it cannot be
        computed, e.g. when only one class is present in ``labels``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra arrays next to the logits; keep the logits only
        logits = logits[0]
    preds = logits.argmax(-1)
    probs_pos = torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
    # zero_division=0 yields the same 0.0 as the default, without the warning,
    # when no positive predictions (or no positive labels) exist
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )
    acc = accuracy_score(labels, preds)
    try:
        roc_auc = roc_auc_score(labels, probs_pos)
    except ValueError:
        roc_auc = float("nan")
    return {
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "roc_auc": float(roc_auc),
    }

# Conservative settings for CPU
BATCH  = 8
EPOCHS = 4            # CPU will be slower; 3–4 epochs is usually enough for DistilBERT on SMS
LR     = 2e-5

args = TrainingArguments(
    output_dir=str(ARTIFACTS/"training"),
    eval_strategy="epoch",           # replaces the deprecated `evaluation_strategy`
    save_strategy="epoch",           # must match the eval strategy for load_best_model_at_end
    save_total_limit=2,
    learning_rate=LR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    gradient_accumulation_steps=2,   # effective batch ~16
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,     # restore the checkpoint with the best validation F1
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    report_to="none",
    seed=42,
    fp16=False, bf16=False,
    dataloader_num_workers=0,
    eval_accumulation_steps=2,
    use_cpu=True,                    # force CPU (disables CUDA/MPS); replaces the deprecated `no_cuda=True`
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop as soon as one evaluation fails to improve on the best F1 so far
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

train_result = trainer.train()
train_result.metrics


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▋         | 97/1530 [2:43:37<40:17:21, 101.22s/it]
  4%|▍         | 50/1220 [01:40<34:05,  1.75s/it]

{'loss': 0.3141, 'grad_norm': 0.552915632724762, 'learning_rate': 1.918032786885246e-05, 'epoch': 0.16}


  8%|▊         | 100/1220 [03:43<59:35,  3.19s/it] 

{'loss': 0.1126, 'grad_norm': 0.1927800476551056, 'learning_rate': 1.836065573770492e-05, 'epoch': 0.33}


 12%|█▏        | 150/1220 [05:50<40:31,  2.27s/it]  

{'loss': 0.1426, 'grad_norm': 1.103765606880188, 'learning_rate': 1.7540983606557377e-05, 'epoch': 0.49}


 16%|█▋        | 200/1220 [08:17<1:08:32,  4.03s/it]

{'loss': 0.1377, 'grad_norm': 0.6060847640037537, 'learning_rate': 1.6721311475409837e-05, 'epoch': 0.65}


 20%|██        | 250/1220 [10:58<36:31,  2.26s/it]  

{'loss': 0.0432, 'grad_norm': 0.047016218304634094, 'learning_rate': 1.5901639344262295e-05, 'epoch': 0.82}


 25%|██▍       | 300/1220 [12:37<34:31,  2.25s/it]

{'loss': 0.1434, 'grad_norm': 0.3174337148666382, 'learning_rate': 1.5081967213114754e-05, 'epoch': 0.98}


 25%|██▌       | 305/1220 [12:47<29:01,  1.90s/it]
 25%|██▌       | 305/1220 [12:58<29:01,  1.90s/it]

{'eval_loss': 0.09538261592388153, 'eval_accuracy': 0.9885245901639345, 'eval_precision': 0.9714285714285714, 'eval_recall': 0.9622641509433962, 'eval_f1': 0.966824644549763, 'eval_roc_auc': 0.9960879005690326, 'eval_runtime': 10.6462, 'eval_samples_per_second': 57.297, 'eval_steps_per_second': 7.233, 'epoch': 1.0}


 29%|██▊       | 350/1220 [15:52<1:16:11,  5.25s/it]

{'loss': 0.0369, 'grad_norm': 0.04437772557139397, 'learning_rate': 1.4262295081967214e-05, 'epoch': 1.15}


 33%|███▎      | 400/1220 [17:51<27:13,  1.99s/it]  

{'loss': 0.0485, 'grad_norm': 0.04577852040529251, 'learning_rate': 1.3442622950819673e-05, 'epoch': 1.31}


 37%|███▋      | 450/1220 [19:20<17:46,  1.39s/it]

{'loss': 0.0369, 'grad_norm': 0.03798583894968033, 'learning_rate': 1.2622950819672132e-05, 'epoch': 1.47}


 41%|████      | 500/1220 [20:50<24:23,  2.03s/it]

{'loss': 0.0666, 'grad_norm': 0.027329187840223312, 'learning_rate': 1.1803278688524591e-05, 'epoch': 1.64}


 45%|████▌     | 550/1220 [22:21<20:00,  1.79s/it]

{'loss': 0.0662, 'grad_norm': 0.02846544235944748, 'learning_rate': 1.0983606557377052e-05, 'epoch': 1.8}


 49%|████▉     | 600/1220 [23:52<21:58,  2.13s/it]

{'loss': 0.0335, 'grad_norm': 0.027946123853325844, 'learning_rate': 1.0163934426229509e-05, 'epoch': 1.96}


 50%|█████     | 611/1220 [24:12<15:25,  1.52s/it]
 50%|█████     | 611/1220 [24:25<15:25,  1.52s/it]

{'eval_loss': 0.11326508969068527, 'eval_accuracy': 0.9885245901639345, 'eval_precision': 0.9900990099009901, 'eval_recall': 0.9433962264150944, 'eval_f1': 0.966183574879227, 'eval_roc_auc': 0.9982966457023061, 'eval_runtime': 13.1382, 'eval_samples_per_second': 46.429, 'eval_steps_per_second': 5.861, 'epoch': 2.0}


 50%|█████     | 611/1220 [24:33<24:28,  2.41s/it]


{'train_runtime': 1473.3096, 'train_samples_per_second': 13.257, 'train_steps_per_second': 0.828, 'train_loss': 0.09676435241801892, 'epoch': 2.0}


{'train_runtime': 1473.3096,
 'train_samples_per_second': 13.257,
 'train_steps_per_second': 0.828,
 'total_flos': 134099179678116.0,
 'train_loss': 0.09676435241801892,
 'epoch': 2.0}

In [20]:
# Score the held-out test split with the trainer's current model
test_split = tokenized["test"]
test_metrics = trainer.evaluate(eval_dataset=test_split)
test_metrics


100%|██████████| 77/77 [00:10<00:00,  7.29it/s]


{'eval_loss': 0.02986525185406208,
 'eval_accuracy': 0.9885433715220949,
 'eval_precision': 0.9464285714285714,
 'eval_recall': 0.9906542056074766,
 'eval_f1': 0.9680365296803652,
 'eval_roc_auc': 0.9998331108144193,
 'eval_runtime': 12.0174,
 'eval_samples_per_second': 50.843,
 'eval_steps_per_second': 6.407,
 'epoch': 2.0}

In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split
preds = trainer.predict(tokenized["test"])
y_true = preds.label_ids
y_pred = preds.predictions.argmax(-1)

# Pass the label ids explicitly so the report always has one row per known class,
# in the same order as target_names, even if a class is absent from y_true and y_pred.
label_ids = sorted(LABEL_NAMES)
print(classification_report(
    y_true,
    y_pred,
    labels=label_ids,
    target_names=[LABEL_NAMES[i] for i in label_ids],
    zero_division=0,
))


100%|██████████| 77/77 [00:10<00:00,  7.68it/s]

              precision    recall  f1-score   support

         Ham       1.00      0.99      0.99       504
    Phishing       0.95      0.99      0.97       107

    accuracy                           0.99       611
   macro avg       0.97      0.99      0.98       611
weighted avg       0.99      0.99      0.99       611






In [22]:
import json

# Persist the fine-tuned model and its tokenizer side by side
save_dir = ARTIFACTS
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Store the test metrics next to the model; values are cast to plain floats for JSON
with open(save_dir/"metrics_test.json", "w", encoding="utf-8") as f:
    json.dump({k: float(v) for k,v in test_metrics.items()}, f, indent=2)

# Print the path as configured (relative) rather than resolved, so the saved notebook
# output does not embed a machine-specific absolute path or user name.
print("Saved to:", save_dir)


Saved to: /Users/jordancroft/Documents/Documents - Jordan.’s MacBook Air/GitHub/Phishing-detector-backend/ml/classifier/artifacts/distilbert_phishing


In [24]:
from transformers import TextClassificationPipeline
pipe = TextClassificationPipeline(model=trainer.model, tokenizer=tokenizer, return_all_scores=True, truncation=True)

# Look up the name the model config gives to class id 1 (the positive class) instead of
# guessing it, so the lookup below keeps working whatever id2label mapping is configured.
positive_label = trainer.model.config.id2label[1]

# Smoke test on a few hand-written messages
for s in [
    "Royal Mail: Your parcel is waiting, pay £1.99 to release: http://bit.ly/xyz",
    "Hey mate, are we still on for lunch tomorrow?",
    "URGENT: Your bank account is locked. Verify now at www.badsite.ru/login",
    "Friday means one thing...SurPRIZE Guy could be calling YOU! Make sure you answer! > https://eej.at/Q6Z7Gjxw Stop http://oot.rs/BI2GAUar"
]:
    # Truncate to the same length that was used for training
    scores = pipe(s, max_length=MAX_LEN)[0]
    p1 = next(x["score"] for x in scores if x["label"] == positive_label)
    pred = 1 if p1 >= 0.5 else 0
    print(f"{s}\n -> {pred} ({LABEL_NAMES[pred]}), prob_smishing={p1:.3f}\n")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Royal Mail: Your parcel is waiting, pay £1.99 to release: http://bit.ly/xyz
 -> 1 (Phishing), prob_smishing=0.998

Hey mate, are we still on for lunch tomorrow?
 -> 0 (Ham), prob_smishing=0.004

URGENT: Your bank account is locked. Verify now at www.badsite.ru/login
 -> 1 (Phishing), prob_smishing=0.996

Friday means one thing...SurPRIZE Guy could be calling YOU! Make sure you answer! > https://eej.at/Q6Z7Gjxw Stop http://oot.rs/BI2GAUar
 -> 1 (Phishing), prob_smishing=0.998



In [41]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# REPO_ID is not defined in any visible cell of this notebook, so a fresh
# "Restart & Run All" would stop here with a NameError. Use REPO_ID when it exists
# and otherwise fall back to the local directory the model was saved to.
model_source = globals().get("REPO_ID", str(ARTIFACTS))

tok = AutoTokenizer.from_pretrained(model_source)
mdl = AutoModelForSequenceClassification.from_pretrained(model_source)
mdl.eval()

# Same truncation length as training; column 1 of the softmax is the positive class
enc = tok("Royal Mail: pay £1.99 to release your parcel http://bit.ly/xyz", return_tensors="pt", truncation=True, max_length=MAX_LEN)
with torch.no_grad():
    prob = torch.softmax(mdl(**enc).logits, dim=-1)[0,1].item()
print("Prob smishing:", round(prob, 3))


Prob smishing: 0.998
