<a href="https://colab.research.google.com/github/lazeicoder/Spam-and-smishing-detection-/blob/master/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import BertTokenizerFast, BertModel
from torch.optim import AdamW
from tqdm import tqdm
import random

In [None]:
SEED = 42

# Seed every random number generator the notebook relies on, in a fixed
# order: Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cpu


In [None]:
# Load the labelled message corpus. Rows missing TEXT or LABEL are dropped up
# front: `astype(str)` would otherwise turn a missing TEXT into the literal
# string "nan", and a missing LABEL is not a valid class to encode.
df = pd.read_csv("Dataset_10191.csv").dropna(subset=["TEXT", "LABEL"])

texts = df["TEXT"].astype(str).tolist()
labels = df["LABEL"].tolist()

# Encode labels: ham, smishing, spam -> 0,1,2
# (LabelEncoder assigns codes in the sorted order of `classes_`.)
le = LabelEncoder()
y = le.fit_transform(labels)

# Build the mapping from plain Python types so the printed dict is readable
# instead of showing NumPy scalar reprs.
label_mapping = {str(name): int(code) for code, name in enumerate(le.classes_)}
print("Label mapping:", label_mapping)

Label mapping: {np.str_('ham'): np.int64(0), np.str_('smishing'): np.int64(1), np.str_('spam'): np.int64(2)}


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
MAX_LEN = 128  # maximum sequence length handed to the tokenizer

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Both splits are tokenized with exactly the same settings.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=MAX_LEN)

train_encodings = tokenizer(X_train, **tokenize_kwargs)
test_encodings = tokenizer(X_test, **tokenize_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
class BertTextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to a
            per-example sequence of values, indexable by example position.
        labels: Per-example integer class labels, indexable by position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor under its own key and
        the label is added under ``"labels"`` as a ``torch.long`` tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

BATCH_SIZE = 16  # same batch size for training and evaluation

train_dataset = BertTextDataset(train_encodings, y_train)
test_dataset = BertTextDataset(test_encodings, y_test)

# Training batches are shuffled; the test loader keeps the original order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer.

    Args:
        num_classes: Number of target classes (width of the output logits).
        model_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to ``"bert-base-uncased"``.
        dropout: Dropout probability applied to the ``[CLS]`` representation
            before the linear layer. Defaults to ``0.3``.
    """

    def __init__(self, num_classes, model_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width follows the encoder's hidden size, so any
        # BERT checkpoint can be swapped in without touching this layer.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Compute unnormalised class logits for a batch of token sequences.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Logits produced by the linear head, with ``num_classes`` values
            per input sequence.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 ([CLS] token) as the summary
        # representation of each sequence.
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)
        logits = self.classifier(x)
        return logits

# One output logit per class known to the fitted label encoder.
num_classes = len(le.classes_)

model = BertClassifier(num_classes)
model = model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
# Cross-entropy over the raw logits returned by the classifier.
criterion = nn.CrossEntropyLoss()
# AdamW over all model parameters (encoder and classification head).
optimizer = AdamW(model.parameters(), lr=2e-5)

# The training log and test metrics recorded in this notebook come from a
# 3-epoch run ("Epoch 1/3" ... "Epoch 3/3"). Keep the constant in sync with
# them so a fresh Restart & Run All reproduces the reported results.
EPOCHS = 3

In [None]:
for epoch in range(EPOCHS):
    # Put the model in training mode and reset the running loss for this epoch.
    model.train()
    total_loss = 0.0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for batch in loop:
        # Move the tensors the model and loss consume onto the target device.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_batch = batch["labels"].to(device)

        # Forward pass and loss.
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels_batch)

        # Backward pass: clear old gradients, backpropagate, clip the global
        # gradient norm to 1.0, then apply the optimizer update.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Read the scalar loss once and reuse it for both bookkeeping and display.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # Mean per-batch loss over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

Epoch 1/3: 100%|██████████| 510/510 [3:05:18<00:00, 21.80s/it, loss=0.633]


Epoch 1 - Average Loss: 0.2112


Epoch 2/3: 100%|██████████| 510/510 [3:04:23<00:00, 21.69s/it, loss=0.00296]


Epoch 2 - Average Loss: 0.0733


Epoch 3/3: 100%|██████████| 510/510 [3:04:12<00:00, 21.67s/it, loss=0.000249]

Epoch 3 - Average Loss: 0.0502





In [None]:
# Switch to evaluation mode and collect predictions over the test loader.
model.eval()
all_preds, all_true = [], []

with torch.no_grad():  # no gradients are needed for scoring
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels_batch = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels_batch.cpu().numpy())

accuracy = accuracy_score(all_true, all_preds)
print("\nTest Accuracy:", accuracy)

print("\nClassification Report:")
report = classification_report(
    all_true, all_preds, target_names=le.classes_, digits=4
)
print(report)

print("Confusion Matrix:")
print(confusion_matrix(all_true, all_preds))

Evaluating: 100%|██████████| 128/128 [13:45<00:00,  6.45s/it]


Test Accuracy: 0.9774399215301618

Classification Report:
              precision    recall  f1-score   support

         ham     1.0000    0.9912    0.9956       679
    smishing     0.9762    0.9632    0.9697       680
        spam     0.9568    0.9779    0.9673       680

    accuracy                         0.9774      2039
   macro avg     0.9777    0.9774    0.9775      2039
weighted avg     0.9777    0.9774    0.9775      2039

Confusion Matrix:
[[673   1   5]
 [  0 655  25]
 [  0  15 665]]





In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the test predictions, drawn as an annotated heatmap
# with true classes on the y-axis and predicted classes on the x-axis.
cm = confusion_matrix(all_true, all_preds)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()