### Processing Data
Reading the data into pandas

In [1]:
import pandas as pd

def read_my_data(path):
    """Read a ``label,text`` file into a DataFrame, one row per non-blank line.

    Each line is split on its first comma only: everything before it is the
    label and everything after it is the text, so commas inside the message
    survive. Any further comma-separated fields on the line therefore remain
    part of ``text``.

    Parameters
    ----------
    path : str or path-like
        File to read. Decoded as UTF-8; undecodable bytes are replaced rather
        than raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``text`` (both whitespace-stripped strings).
        The two columns are present even when the file has no data rows.
    """
    rows = []
    with open(path, encoding='utf-8', errors='replace') as f:
        for i, line in enumerate(f):
            line = line.rstrip('\n\r')
            if i == 0 and line.lower().startswith('label'):
                # skip header
                continue
            if not line.strip():
                # A blank line would otherwise become a row with an empty
                # label, which later shows up as a spurious class.
                continue
            # Split only on the first comma; with no comma the whole line is
            # the label and the text is empty.
            label, _, text = line.partition(',')
            # Remove one pair of surrounding double quotes if present. This
            # only triggers when the quote is the very last character.
            if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
                text = text[1:-1]
            rows.append({'label': label.strip(), 'text': text.strip()})
    # Naming the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=['label', 'text'])

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Read both source files (Kaggle and Mendeley) with the same line-based reader.
kaggle_data, mendeley_data = (
    read_my_data(csv_path)
    for csv_path in (
        './data/kaggle spam.csv',
        './data/Mendeley Data collection/Dataset_5971.csv',
    )
)

# Keep only the label/text columns of the Mendeley frame.
# NOTE(review): read_my_data already returns just these two columns, so this
# selection looks like a no-op — confirm whether it can be dropped.
mendeley_data = mendeley_data.loc[:, ['label', 'text']]

# Stack the two sources into a single frame with a fresh 0..n-1 index.
data = pd.concat([kaggle_data, mendeley_data], ignore_index=True)

Explore the dataset & clean it up

In [12]:
# NOTE: this cell overwrites `data` and its `label` column. Re-running it
# without first reloading the data operates on already-cleaned,
# already-encoded values.

# Normalise the message text: trim surrounding whitespace, then lowercase.
data['text'] = data['text'].str.strip().str.lower()

# Report missing and blank messages. The counts are computed first so the
# f-strings need no nested quotes (reusing the outer quote inside an f-string
# is a SyntaxError before Python 3.12).
null_count = data['text'].isnull().sum()
empty_count = data['text'].str.strip().eq('').sum()
print(f'number of null rows: {null_count}')
print(f'number of empty rows: {empty_count}\n')

# Report duplicated messages, then keep only the first occurrence of each.
duplicate_count = data['text'].duplicated().sum()
print(f'number of duplicate rows: {duplicate_count}\n')
data = data.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)

# Show the raw label distribution so variants/typos are visible ...
print(f"{data['label'].value_counts()}\n")

# ... then fold the known spam variants into a single 'spam' label.
mapping = {
    'Smishing': 'spam',
    'Spam': 'spam',
    'smishing': 'spam'
}
data['label'] = data['label'].replace(mapping)

# Check label imbalance after the mapping.
print(f"{data['label'].value_counts()}\n")

# Use sklearn preprocessing to convert the labels to integer class ids.
label_encoder = preprocessing.LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'].to_list())

# Check text lengths by class.
# Plain assignment does not copy: `data_with_text_lengths` is the same object
# as `data`, so the new `text_lengths` column is also added to `data` (and is
# carried into the train/test frames below).
data_with_text_lengths = data
data_with_text_lengths['text_lengths'] = data['text'].str.len()
length_summary = data_with_text_lengths.groupby('label')['text_lengths'].describe()
print(f'{length_summary}\n')

print(f'{data.head()}\n')

# Split into train and test sets, stratified on the label so both splits keep
# the same class ratio. A fixed random_state makes the split (and therefore
# every metric reported further down) reproducible across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)

number of null rows: 0
number of empty rows: 0

number of duplicate rows: 0

label
0    9349
1    1774
Name: count, dtype: int64

label
0    9349
1    1774
Name: count, dtype: int64

        count        mean        std   min    25%    50%    75%    max
label                                                                 
0      9349.0   77.460584  56.815853   5.0   40.0   59.0   98.0  919.0
1      1774.0  146.226043  37.746373  16.0  135.0  154.0  164.0  791.0

   label                                               text  text_lengths
0      1  hey guys am looking to sell my students season...           134
1      1  light up your income from side hustle - join t...           791
2      1  my name is debby and we are hiring part-timers...           642
3      0  "go until jurong point, crazy.. available only...           116
4      0                   ok lar... joking wif u oni...,,,            32



### Using DistilBERT for Spam Classification
Tokenization

In [4]:
from datasets import Dataset
from transformers import AutoTokenizer

# Load the tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap the pandas train/test frames as Hugging Face Dataset objects.
hf_train_data, hf_test_data = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data)
)

def tokenize(examples):
    """Tokenize the ``'text'`` field of a batch with the module-level tokenizer.

    ``examples`` is indexed with ``'text'``; the value is passed to
    ``tokenizer`` with ``truncation=True`` and the tokenizer's result is
    returned unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

# Apply `tokenize` to both splits in batched mode (train first, then test).
tokenized_train_data, tokenized_test_data = (
    split.map(tokenize, batched=True)
    for split in (hf_train_data, hf_test_data)
)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 8898/8898 [00:00<00:00, 22292.70 examples/s]
Map: 100%|██████████| 8898/8898 [00:00<00:00, 22292.70 examples/s]
Map: 100%|██████████| 2225/2225 [00:00<00:00, 36873.99 examples/s]
Map: 100%|██████████| 2225/2225 [00:00<00:00, 36873.99 examples/s]


Tune DistilBERT

In [8]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
import torch
import torch.nn as nn

# Pretrained DistilBERT checkpoint with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Batch collator built from the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyperparameters for fine-tuning, gathered in one place; evaluation and
# logging both happen once per epoch.
training_options = {
    'output_dir': './results',
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 5,
    'weight_decay': 0.01,
    'eval_strategy': 'epoch',
    'logging_strategy': 'epoch',
}
training_args = TrainingArguments(**training_options)

# Per-class sample counts in the training split, ordered by class id.
counts = train_data['label'].value_counts().sort_index().to_numpy()

# Inverse-frequency weights: total samples divided by each class count, so the
# rarer class gets the larger weight.
inverse_frequency = counts.sum() / counts
weights = torch.tensor(inverse_frequency, dtype=torch.float)

# Halve the weight of class 1 (spam) so it is not boosted by the full
# inverse-frequency factor.
weights[1] = weights[1] / 2
print(weights)

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level ``weights`` tensor,
    which must be defined before the first training or evaluation step.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute weighted cross-entropy for one batch.

        Parameters
        ----------
        model : the model being trained; called with the non-label inputs.
        inputs : mapping of batch tensors. Must contain the targets under
            ``"labels"`` (or, as a fallback, ``"label"``).
        return_outputs : when true, return ``(loss, outputs)`` instead of
            just ``loss``.
        **kwargs : ignored; accepted because some Trainer/Accelerate versions
            pass extra arguments.

        Raises
        ------
        ValueError
            If neither ``"labels"`` nor ``"label"`` is present in ``inputs``.
        """
        # Support either 'labels' or 'label' key from dataset.
        labels = inputs.get("labels") if "labels" in inputs else inputs.get("label")
        if labels is None:
            raise ValueError(
                "WeightedTrainer.compute_loss requires 'labels' (or 'label') in inputs"
            )

        # Keep the targets out of the forward pass: the loss is computed here,
        # so the model does not need them, and a 'label' key would otherwise be
        # forwarded to the model as an unexpected keyword argument. A new dict
        # is built so the caller's `inputs` is left unmodified.
        model_inputs = {
            key: value for key, value in inputs.items()
            if key not in ("labels", "label")
        }
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Ensure the class weights are on the same device as the logits.
        loss_fct = nn.CrossEntropyLoss(weight=weights.to(logits.device))
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss

# Assemble the trainer. The tokenized test split doubles as the evaluation set.
# NOTE(review): the saved output shows a warning raised at this call whose
# message is cut off; presumably it concerns the `tokenizer=` argument —
# confirm against the installed transformers version.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_train_data,
    'eval_dataset': tokenized_test_data,
    'tokenizer': tokenizer,
    'data_collator': data_collator,
}
trainer = WeightedTrainer(**trainer_kwargs)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
  trainer = WeightedTrainer(


tensor([1.1897, 3.1353])


In [9]:
# Directory the fine-tuned model is written to after training.
SAVED_MODEL_DIR = './spam_transformer_model'

# Run fine-tuning, then persist the result.
trainer.train()
trainer.save_model(SAVED_MODEL_DIR)

Epoch,Training Loss,Validation Loss
1,0.0895,0.033834
2,0.0255,0.043709
3,0.0095,0.036218
4,0.0056,0.026062
5,0.0035,0.038332


Testing Metrics

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Run the trained model over the tokenized test split; the returned
# PredictionOutput carries the raw logits and the true label ids.
preds_output = trainer.predict(tokenized_test_data)
logits = preds_output.predictions
labels = preds_output.label_ids

# Predicted class = index of the largest logit in each row.
preds = np.argmax(logits, axis=1)
print("Confusion matrix:")
print(confusion_matrix(labels, preds))
print("\nClassification report:")
print(classification_report(labels, preds, digits=4))

# If binary, compute ROC AUC from the positive-class probability.
if logits.shape[-1] == 2:
    # Numerically stable softmax: subtracting each row's maximum leaves the
    # result mathematically unchanged but keeps np.exp from overflowing to
    # inf (and the ratio from becoming nan) when logits are large.
    shifted_logits = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(shifted_logits)
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)
    pos_probs = probs[:, 1]
    try:
        print("ROC AUC:", roc_auc_score(labels, pos_probs))
    except Exception as e:
        # Best-effort metric: report the failure instead of aborting the cell.
        print("ROC AUC error:", e)

Confusion matrix:
[[1868    2]
 [   6  349]]

Classification report:
              precision    recall  f1-score   support

           0     0.9968    0.9989    0.9979      1870
           1     0.9943    0.9831    0.9887       355

    accuracy                         0.9964      2225
   macro avg     0.9956    0.9910    0.9933      2225
weighted avg     0.9964    0.9964    0.9964      2225

ROC AUC: 0.9981983881901032


Making Predictions with our Tuned Model

In [32]:
from transformers import pipeline

# Use the first CUDA device when one is available, otherwise the CPU (-1).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Text-classification pipeline backed by the saved fine-tuned model;
# top_k=None requests a score for every label.
classifier = pipeline(
    'text-classification',
    model='./spam_transformer_model',
    tokenizer=tokenizer,
    device=device,
    top_k=None,
)

# Scores for the single input message.
label_scores = classifier("Click here to claim your free prize")[0]
print(f'{label_scores}\n')

# The first entry's label decides the prediction: 'LABEL_0' is reported as
# ham, anything else as spam.
# NOTE(review): assumes class id 0 corresponds to ham and that the first
# entry is the highest-scoring one — confirm against the label encoding.
top_label = label_scores[0]['label']
result = 'spam' if top_label != 'LABEL_0' else 'ham'
print(f'Prediction: {result}')

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.999953031539917}, {'label': 'LABEL_0', 'score': 4.698185875895433e-05}]

Prediction: spam
