In [1]:
# Get data

from datasets import load_dataset
# Load the full corpus (exposed as the "train" split) and keep it under its own name so
# re-running the split below never operates on an already-split object.
raw_dataset = load_dataset("ucirvine/sms_spam", split="train")
# Hold out 20 % for testing. The seed is fixed so the split -- and every number computed
# from it further down -- is reproducible across kernel restarts.
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)

In [2]:
# Display the DatasetDict to check the split names, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 1115
    })
})

In [3]:
# Share of examples labelled 1 in the training split (labels are 0/1, so the mean of the
# column is the positive-class ratio).
train_labels = dataset["train"]["label"]
sum(train_labels) / len(train_labels)

0.13568064588472753

## Note: Unbalanced labels warning.
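The labels are imbalanced: roughly 86–87 % are 0s and 13–14 % are 1s. This may affect the model, so check the results carefully: with this ratio, accuracy alone can look good even when the minority class is never predicted, so also look at precision, recall and F1.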

In [31]:
# Get a model
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "mrm8488/bert-tiny-finetuned-sms-spam-detection"

# The tokenizer (vocabulary) of the checkpoint is reused unchanged.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Lose the pretrained weights and create a fresh model with the same configuration.
# Only the configuration is fetched from the checkpoint: building the model with
# `from_config` does not load any weights, so there is no need to download and
# instantiate the pretrained model first just to throw it away.
config = AutoConfig.from_pretrained(checkpoint, num_labels=2)
model = AutoModelForSequenceClassification.from_config(config)

In [32]:
#print(model)
#model.config

In [33]:
def tokenize_function(example):
    """Tokenize the ``sms`` text of one example or of a batch of examples.

    Args:
        example: Mapping with an ``"sms"`` entry holding either a single string or a
            list of strings (the latter when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the given text(s).
    """
    # Padding is intentionally not applied here; it is left to the data collator, which
    # pads per batch. Truncation is enabled so that an unusually long message cannot
    # exceed the maximum sequence length the tokenizer/model supports.
    return tokenizer(example["sms"], truncation=True)


# batched=True hands the tokenizer lists of texts instead of one text per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [34]:
import evaluate
import numpy as np

# Build the combined metric once; creating it inside compute_metrics would reload all
# four metric modules on every evaluation call.
_classification_metrics = evaluate.combine(["accuracy", "recall", "precision", "f1"])


def compute_metrics(eval_preds):
    """Compute accuracy, recall, precision and F1 for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair as unpacked below; the class with the
            highest logit along the last axis is taken as the prediction.

    Returns:
        The result of the combined metric's ``compute`` call for these predictions.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _classification_metrics.compute(predictions=predictions, references=labels)



In [35]:
from transformers import DataCollatorWithPadding

# Collator that uses the tokenizer to pad the examples when a batch is assembled
# (the tokenization step itself does not pad).
data_collator = DataCollatorWithPadding(tokenizer)

In [36]:
# Display the tokenized splits to check which columns the tokenizer added.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1115
    })
})

In [37]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="mini-bert-sms-spam-fresh",
    # `evaluation_strategy` is the deprecated spelling of this argument; `eval_strategy`
    # is its replacement (requires a transformers release that already knows the new name).
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    # Passing `tokenizer=` to Trainer is deprecated in favour of `processing_class=`
    # (the likely cause of the warning this cell used to emit).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [38]:
# Baseline: score the freshly initialised model on the training split before any
# training has happened (trainer.train() is only called in a later cell).
predictions = trainer.predict(tokenized_dataset['train'])

  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Show the metrics computed by the predict() call above.
predictions.metrics

{'test_loss': 0.6775296926498413,
 'test_model_preparation_time': 0.0008,
 'test_accuracy': 0.8643193541152725,
 'test_recall': 0.0,
 'test_precision': 0.0,
 'test_f1': 0.0,
 'test_runtime': 4.2691,
 'test_samples_per_second': 1044.483,
 'test_steps_per_second': 130.707}

In [40]:
# Train the model; the evaluation split is scored once per epoch, as configured in
# TrainingArguments.
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=1674, training_loss=0.14630703715298196, metrics={'train_runtime': 75.5374, 'train_samples_per_second': 177.091, 'train_steps_per_second': 22.161, 'total_flos': 1804925353920.0, 'train_loss': 0.14630703715298196, 'epoch': 3.0})

In [41]:
# Score the trained model on the held-out test split and show the resulting metrics.
# Note: this rebinds `predictions`, replacing the earlier baseline result.
predictions = trainer.predict(tokenized_dataset['test'])
predictions.metrics

{'test_loss': 0.09566442668437958,
 'test_model_preparation_time': 0.0008,
 'test_accuracy': 0.97847533632287,
 'test_recall': 0.8661971830985915,
 'test_precision': 0.9609375,
 'test_f1': 0.9111111111111111,
 'test_runtime': 1.7623,
 'test_samples_per_second': 632.709,
 'test_steps_per_second': 79.443}

In [42]:
# Build an inference pipeline from a checkpoint saved on disk

from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint directory written during training; replace with your own if it differs
checkpoint = "./mini-bert-sms-spam-fresh/checkpoint-1674"

# Restore the tokenizer and the model from that directory
tokenizer2 = AutoTokenizer.from_pretrained(checkpoint)
model2 = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Wrap both in a text-classification pipeline
nlp_pipeline = pipeline(task="text-classification", model=model2, tokenizer=tokenizer2)

Device set to use cpu


In [43]:
# Test the pipeline on an ordinary conversational message.
print(nlp_pipeline("I think I will take my girlfriend to this movie!"))

[{'label': 'LABEL_0', 'score': 0.9926934242248535}]


In [48]:
# Test the pipeline on a promotional "free phone, call now" style message.
print(nlp_pipeline("07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 8588811730 for delivery tomorrow"))

[{'label': 'LABEL_1', 'score': 0.9921478629112244}]
