# Part II: Fine-Tuning GPT-2 for Joke Generation

In [3]:
import pandas as pd
from datasets import Dataset

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset
df = df = pd.read_csv('/content/drive/MyDrive/data/data')

# Extract the jokes column
jokes = df['Joke'].tolist()

# View an example of a joke
print(jokes[:1])


Mounted at /content/drive
['What did the bartender say to the jumper cables? You better not try to start anything.']


In [5]:
# Convert the list of jokes into a dataset
jokes_dataset = Dataset.from_dict({"text": jokes})

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add [PAD] token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model’s embedding layer to account for the added special token
model.resize_token_embeddings(len(tokenizer))

# Tokenize the jokes
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_jokes = jokes_dataset.map(tokenize_function, batched=True)

# Convert the dataset into PyTorch DataLoader
tokenized_jokes.set_format(type="torch", columns=["input_ids", "attention_mask"])
dataloader = DataLoader(tokenized_jokes, batch_size=16, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 5
model.train()  # Set the model to training mode

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    loop = tqdm(dataloader, leave=True)  # Progress bar

    for batch in loop:
        # Move batch data to the same device as the model
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Optimize
        optimizer.step()
        optimizer.zero_grad()

        # Update progress bar
        loop.set_description(f"Loss {loss.item():.4f}")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_gpt2_jokes")
tokenizer.save_pretrained("./fine_tuned_gpt2_jokes")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/1622 [00:00<?, ? examples/s]

Epoch 1/5


Loss 0.6922: 100%|██████████| 102/102 [00:52<00:00,  1.93it/s]


Epoch 2/5


Loss 0.5675: 100%|██████████| 102/102 [00:54<00:00,  1.88it/s]


Epoch 3/5


Loss 0.6409: 100%|██████████| 102/102 [00:53<00:00,  1.90it/s]


Epoch 4/5


Loss 0.6135: 100%|██████████| 102/102 [00:53<00:00,  1.89it/s]


Epoch 5/5


Loss 0.5928: 100%|██████████| 102/102 [00:53<00:00,  1.90it/s]


('./fine_tuned_gpt2_jokes/tokenizer_config.json',
 './fine_tuned_gpt2_jokes/special_tokens_map.json',
 './fine_tuned_gpt2_jokes/vocab.json',
 './fine_tuned_gpt2_jokes/merges.txt',
 './fine_tuned_gpt2_jokes/added_tokens.json')

In [8]:
def generate_joke(input_words, max_length=50):
    # Convert the input words to a string (in case they're a list of words)
    input_prompt = ' '.join(input_words)

    # Tokenize the input prompt
    input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to(device)

    # Generate joke continuation
    output = model.generate(
        input_ids,
        max_length=max_length,  # Maximum length of the joke (adjust as needed)
        num_return_sequences=1,  # Generate only one joke at a time
        pad_token_id=tokenizer.eos_token_id,  # Ensure proper padding
        do_sample=True,  # Random sampling for variability
        top_k=50,  # Use top-k sampling for diversity
        top_p=0.95  # Use nucleus sampling
    )

    # Decode the output to text
    generated_joke = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_joke

# Generate joke based on three starting words from the dataset
input_words_from_dataset = ["What", "did", "the"]
print(f"Generated Joke from Dataset Input using: {input_words_from_dataset}")
print(generate_joke(input_words_from_dataset))
print(generate_joke(input_words_from_dataset))
print(generate_joke(input_words_from_dataset))

# Generate joke from randomly chosen words
random_words = ["The", "cute", "cat"]
print(f"\nGenerated Joke from Dataset Input using: {random_words}")
print(generate_joke(random_words))
print(generate_joke(random_words))
print(generate_joke(random_words))


Generated Joke from Dataset Input using: ['What', 'did', 'the']
What did the bartender say to the student? "Oh, this was a good week" but said it to myself and then it was a good
What did the best teacher say when she saw a train coming through a hole in her heart? A well-practicing student says "I have a very keen eye"
What did the Italian knight eat on a m, I would like to say: "Thanks to for the wonderful

Generated Joke from Dataset Input using: ['The', 'cute', 'cat']
The cute cat lays down to his mother who says it's time to kick it off. With no signal.. *Your son's father's son, who's a teacher
The cute cat walks into the bath tub...the other pet is sitting in its bathtub, so we can have a proper wedding...the other's sleeping in an actual bath.
The cute cat walks into a bar and gives it a softie and says, "Hey, I'm allergic to Corgi." He does not give a huff and says, "Hey, I'm allergic to Corgi."
