In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
from datasets import load_metric
from transformers import Trainer

# Import dataset - we are using the ag_news dataset from Huggingface
dataset = load_dataset("ag_news", split={'train': 'train', 'test': 'test'})

splits = ["train", "test"]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Prepare foundation model

## Tokenize dataset

In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True
    )


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

## Load Pre-trained model

In [3]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,  # AG News has 4 labels
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Unfreeze model params for fine-tuning
for param in model.parameters():
    param.requires_grad = True


# Perform lightweight tuning

In [5]:
# Set up training arguments

training_args = TrainingArguments(
    output_dir="./data/ag_news",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)


In [6]:
# define evaluation metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [7]:
# Create trainer instance

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Train the model

In [8]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1935,0.171599,0.941711
2,0.1311,0.180975,0.946842


TrainOutput(global_step=15000, training_loss=0.18089691670735678, metrics={'train_runtime': 7311.7121, 'train_samples_per_second': 32.824, 'train_steps_per_second': 2.052, 'total_flos': 2.1881285829212544e+16, 'train_loss': 0.18089691670735678, 'epoch': 2.0})

## Evaluate trained model

In [9]:
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.17159855365753174, 'eval_accuracy': 0.9417105263157894, 'eval_runtime': 49.7975, 'eval_samples_per_second': 152.618, 'eval_steps_per_second': 2.39, 'epoch': 2.0}


Model is performing at ~94% accuracy on unseen data. This is a solid performance and likely could be marginally improved with additional training epochs.

## Save trained model

In [10]:
model.save_pretrained("./results/ag_news_fine_tuned")
tokenizer.save_pretrained("./results/ag_news_fine_tuned")


('./results/ag_news_fine_tuned/tokenizer_config.json',
 './results/ag_news_fine_tuned/special_tokens_map.json',
 './results/ag_news_fine_tuned/vocab.txt',
 './results/ag_news_fine_tuned/added_tokens.json',
 './results/ag_news_fine_tuned/tokenizer.json')

# Load fine-tuned model (if necessary)

In [11]:
# Load model
model_path = "./results/ag_news_fine_tuned"

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


# Inference

In [12]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(-1).tolist()
    return [model.config.id2label[prediction] for prediction in predictions]


sample_text = "The stock market closed lower today after a volatile trading session."
print(predict(sample_text, model, tokenizer))


['Business']
