In [1]:
!pip install -q transformers datasets evaluate accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate
import torch


In [3]:
# Fetch the tweet-sentiment dataset (it provides a train and a test split).
dataset = load_dataset("mteb/tweet_sentiment_extraction")

# Inspection only: copy the training split into pandas and preview the first rows.
train_split = dataset["train"]
df = pd.DataFrame(train_split)
preview = df.head()
print(preview)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/240k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3432 [00:00<?, ? examples/s]

           id                                               text  label  \
0  cb774db0d1                I`d have responded, if I were going      1   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!      0   
2  088c60f138                          my boss is bullying me...      0   
3  9642c003ef                     what interview! leave me alone      0   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...      0   

  label_text  
0    neutral  
1   negative  
2   negative  
3   negative  
4   negative  


In [None]:
# The dataset we just downloaded contains two splits: one for training (26,732 rows) and one for
# testing (3,432 rows). Converted to a DataFrame, the training split looks like the preview above.

In [None]:
# Step 3: Tokenizer
# Now that we have our dataset, we need a tokenizer to convert it into a form our model can consume.

# LLMs operate on tokens, so the raw text must be tokenized first. To process the whole dataset in one step,
#  use the Datasets `map` method to apply a preprocessing function to every example.

# This is why this step loads a pre-trained tokenizer and tokenizes our dataset so it can be used for
#  fine-tuning.

In [4]:
# Load the pretrained GPT-2 tokenizer that matches the "gpt2" checkpoint used below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 does not have a pad token, so reuse the end-of-sequence token for padding;
# the tokenization step pads with `padding="max_length"` and needs a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every text is truncated and padded to exactly 128 tokens so all rows
    share one length (a short limit keeps training fast).

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encode_options)

# Tokenize datasets: apply `tokenize_function` to every split of `dataset`.
# With `batched=True`, `map` passes groups of rows to the function per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/26732 [00:00<?, ? examples/s]

Map:   0%|          | 0/3432 [00:00<?, ? examples/s]

In [None]:
# To reduce compute requirements, we can create smaller subsets of the full dataset. The training subset
# will be used to fine-tune our model, while the test subset will be used to
# evaluate it.

In [6]:
# Draw small, reproducible subsets so fine-tuning and evaluation stay cheap.
def _take_shuffled(split, count):
    """Shuffle `split` with the fixed seed 42 and keep its first `count` rows."""
    return split.shuffle(seed=42).select(range(count))


small_train_dataset = _take_shuffled(tokenized_datasets["train"], 1000)
small_eval_dataset = _take_shuffled(tokenized_datasets["test"], 500)

In [None]:
# Step 4: Initialize our base model
# Start by loading the model and specifying the number of expected labels.
#  From the tweet sentiment dataset card, we know there are three labels: negative, neutral, and positive.

In [7]:
# We have 3 labels (positive, negative, neutral). Register their names on the
# model config so predictions and the saved checkpoint report readable classes
# instead of generic LABEL_0/1/2. Ids 0 and 1 match the `label`/`label_text`
# columns previewed earlier; 2 is the remaining class (positive).
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: idx for idx, name in id2label.items()}

model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Set the model's pad token id to its EOS token id, keeping the model config
# consistent with the tokenizer, which also uses EOS as its pad token.
model.config.pad_token_id = model.config.eos_token_id

In [9]:
# Load the metric objects once at module level; `compute_metrics` below calls
# their `.compute(...)` method each time it is invoked.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable: per-class scores and reference labels.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` values.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    shared = {"predictions": predicted_ids, "references": references}
    accuracy_value = accuracy.compute(**shared)["accuracy"]
    f1_value = f1.compute(average="weighted", **shared)["f1"]
    return {"accuracy": accuracy_value, "f1": f1_value}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Step 6: Fine-tune using the Trainer class
# Our final step is to set up the training arguments and start the training process.
# The Transformers library provides the Trainer class, which supports a wide range of training options and
# features such as logging, gradient accumulation, and mixed precision. We first define the training arguments
# together with the evaluation strategy. Once everything is defined, we can train the model by simply calling
#  the train() method.

# from transformers import TrainingArguments, Trainer

In [11]:
# Mixed precision is requested only when a CUDA device is available.
use_mixed_precision = torch.cuda.is_available()

training_args = TrainingArguments(
    # Output locations and logging.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",  # disable external trackers such as wandb
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Small per-device batches for Colab; 2 samples x 4 accumulation steps
    # gives 8 samples per optimizer update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint at the end of every epoch, then restore the
    # best checkpoint once training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=use_mixed_precision,
)

In [12]:
# Assemble the Trainer from the model, training arguments, data subsets and
# metric function. The tokenizer is passed as `processing_class`: the older
# `tokenizer=` keyword is deprecated in `Trainer.__init__` and emitted a
# warning when this cell ran (requires transformers >= 4.46).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [13]:
# Run fine-tuning as configured in `training_args`. The call is the cell's last
# expression, so the notebook displays the value it returns.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0905,0.868507,0.6,0.590821
2,0.6295,0.717595,0.696,0.697146


TrainOutput(global_step=250, training_loss=0.9336239471435547, metrics={'train_runtime': 108.7732, 'train_samples_per_second': 18.387, 'train_steps_per_second': 2.298, 'total_flos': 130649554944000.0, 'train_loss': 0.9336239471435547, 'epoch': 2.0})

In [14]:
# Evaluate on the `eval_dataset` given to the Trainer and print the metrics
# dict (evaluation loss plus the values returned by `compute_metrics`).
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.7175954580307007, 'eval_accuracy': 0.696, 'eval_f1': 0.6971456443069302, 'eval_runtime': 6.2379, 'eval_samples_per_second': 80.155, 'eval_steps_per_second': 40.077, 'epoch': 2.0}


In [15]:
# Write the fine-tuned model and the tokenizer files into the same directory.
export_dir = "./gpt2-sentiment"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


('./gpt2-sentiment/tokenizer_config.json',
 './gpt2-sentiment/special_tokens_map.json',
 './gpt2-sentiment/vocab.json',
 './gpt2-sentiment/merges.txt',
 './gpt2-sentiment/added_tokens.json')

In [None]:
# Reference: https://www.datacamp.com/tutorial/fine-tuning-large-language-models