In [1]:
import pandas as pd
import numpy as np

import torch 
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
##1. Check if GPU is available, if not use cpu
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)

print(f"Using Device: {device}")

Using Device: cpu


In [3]:
dataset = load_dataset("emotion", split='train[:1%]')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
dataset.to_pandas()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
155,i could feel her whimper to the thought of bei...,0
156,im certainly not going to sit and tell you wha...,5
157,im sorry that there wasnt more humor in this p...,5
158,i feel ive got my foot in the door of the fant...,1


In [12]:
class EmotionDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length=512):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the inputs
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>',
                                       truncation=True, 
                                       max_length=max_length, 
                                       padding="max_length")
            
            self.input_ids.append(encodings_dict['input_ids'])
            self.attn_masks.append(encodings_dict['attention_mask'])
            # For language modeling, labels are input_ids shifted by one
            self.labels.append(encodings_dict['input_ids'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.input_ids[idx]),
                'attention_mask': torch.tensor(self.attn_masks[idx]),
                'labels': torch.tensor(self.labels[idx])}
        return item


In [13]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

In [14]:
# Count the total number of parameters
total_params = sum(p.numel() for p in model.parameters()) / 1000000

# Count the number of trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000000

print(f"Total Parameters: {total_params} M")
print(f"Trainable Parameters: {trainable_params} M")


Total Parameters: 124.439808 M
Trainable Parameters: 124.439808 M


In [15]:
data = EmotionDataset(dataset['text'], tokenizer)


In [16]:
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_emotion",
    overwrite_output_dir=True,
    num_train_epochs=1,  # For demonstration, keep it to 1
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    save_steps=1000,
    save_total_limit=1,
    logging_dir='./logs',
    logging_steps=10,
)


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
)

trainer.train()


Step,Training Loss
10,2.8559
20,0.2282
30,0.2185
40,0.1785


TrainOutput(global_step=40, training_loss=0.870284378528595, metrics={'train_runtime': 1110.25, 'train_samples_per_second': 0.144, 'train_steps_per_second': 0.036, 'total_flos': 41806725120000.0, 'train_loss': 0.870284378528595, 'epoch': 1.0})

In [18]:
from transformers import pipeline

# Load the fine-tuned model
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example prompt
prompt = "Text: I felt so happy and joyful.\nEmotion:"

# Generate completion
completion = pipe(prompt, max_length=50, num_return_sequences=1)
print(completion[0]["generated_text"])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: I felt so happy and joyful.
Emotion: It was wonderful to see so many people get the support of so they were able to communicate with the idea of what to do and to enjoy while still in that feeling of so much happiness
