
# Import libraries and model

The cells below import the libraries and load the model used to train a joke generator with GPT-2. Before running the code, enable the GPU via **Runtime → Change runtime type** in the menu above.

In [None]:
# Install Hugging Face transformers (https://huggingface.co/) straight from the GitHub repository.
# NOTE(review): the install is unpinned, so each fresh runtime may pull a different
# transformers revision — consider pinning a released version once one is confirmed to work.
!pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-kejuljqp
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-kejuljqp
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 26.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading hugging

In [None]:
import logging
# Raise the root logger threshold to CRITICAL so that only critical log records are emitted.
logging.getLogger().setLevel(logging.CRITICAL)

import torch
import numpy as np

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Use the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the pretrained 'gpt2-medium' tokenizer and language-model head,
# then move the model onto the selected device in the same expression.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

# Let's test out the GPT-2 model

In [None]:
def choose_from_top(probs, n=5):
    """Sample one token id from the ``n`` most probable entries of ``probs``.

    The ``n`` largest values of ``probs`` are selected, renormalised so they
    sum to 1, and a single index is drawn at random from that restricted
    distribution using NumPy's global random state.

    Args:
        probs: 1-D array-like of non-negative scores, one per token id.
            The callers in this notebook pass a softmax over the vocabulary.
        n: How many top candidates to sample from. Values larger than
            ``len(probs)`` are clamped to ``len(probs)``.

    Returns:
        int: The sampled index into ``probs`` (i.e. the token id).

    Raises:
        ValueError: If ``n`` is smaller than 1 or ``probs`` is empty.
    """
    probs = np.asarray(probs)
    if n < 1:
        raise ValueError("n must be at least 1")
    if probs.size == 0:
        raise ValueError("probs must not be empty")

    # argpartition raises when asked for more candidates than exist, so clamp first.
    n = min(n, probs.shape[0])

    # Indices of the n largest probabilities (unordered, which is fine for sampling).
    top_ids = np.argpartition(probs, -n)[-n:]
    top_probs = probs[top_ids]
    top_probs = top_probs / np.sum(top_probs)  # renormalise to a valid distribution

    # Draw a position inside the top-n list, then map it back to the token id.
    pick = np.random.choice(n, 1, p=top_probs)
    return int(top_ids[pick][0])

def generate_some_text(input_str, text_len = 50):
    """Print ``input_str`` followed by ``text_len`` tokens sampled from the model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. At every
    step the next token is drawn with ``choose_from_top`` from the 10 most
    probable candidates and appended to the running sequence. The model is
    switched to eval mode as a side effect.

    Args:
        input_str: Prompt text the generation starts from.
        text_len: Number of tokens to sample and append to the prompt.

    Returns:
        None. The decoded text is printed.
    """
    cur_ids = torch.tensor(tokenizer.encode(input_str)).unsqueeze(0).long().to(device)

    model.eval()
    with torch.no_grad():

        for _ in range(text_len):
            # Only the logits are needed for sampling, so no labels are passed
            # (passing labels would make the model compute a loss that is thrown away).
            logits = model(cur_ids)[0]
            # First (and only) batch element, distribution predicted after the last position.
            next_probs = torch.softmax(logits[0, -1], dim=0)
            # Randomly pick the next token among the 10 most probable ones.
            next_token_id = choose_from_top(next_probs.to('cpu').numpy(), n=10)
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

        # Index the batch dimension instead of squeeze() so that a one-token
        # sequence still yields a list of ids.
        output_text = tokenizer.decode(cur_ids[0].tolist())
        print(output_text)

In [None]:
# Baseline 1: before any training on the joke dataset the model simply
# continues the prompt as a story, without humour.
generate_some_text(input_str="Three Economists went into a bar.")

Three Economists went into a bar. One of them said, "What do you think about the economy?" and one of the guys was like, "It's great." I was like, "Well, that's what we thought before, huh?" And he was like, "Well


In [None]:
# Baseline 2: same prompt again; the next token is sampled at random,
# so the continuation differs from run to run.
generate_some_text(input_str="Three Economists went into a bar.")

Three Economists went into a bar. They started asking each other about what their economic theories were. They started to discuss their theory and then they went to the bar.

The economist that was most interested was the bartender who started to ask about his theory.

"What do


In [None]:
# Baseline 3: a different, non-joke prompt.
generate_some_text(input_str="This learning and development session is going")

This learning and development session is going to be a great opportunity for you to get familiar and excited about how we build this system. You will meet other members of the design team, including the technical team, as well as industry experts like the CEO and CTO of the company that is


Training on a joke dataset scraped from Reddit

# Training (fine-tuning) on a joke dataset scraped from Reddit

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package, i.e. a Colab runtime)
from google.colab import drive
drive.mount("/content/drive")

# Change the working directory to the data folder: the dataset CSV, the
# "trained_models" folder and the "output" folder used by later cells are
# all addressed relative to this directory.
%cd '/content/drive/MyDrive/Joke generator/data/'

Mounted at /content/drive
/content/drive/MyDrive/Joke generator/data


In [None]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import csv
import json

class JokesDataset(Dataset):
    """Dataset of joke strings read from a CSV file.

    Each CSV row contributes one sample built from its second column
    (``row[1]``), formatted as ``"JOKE:<text><|endoftext|>"``.

    Args:
        verbose: Stored on the instance as ``self.verbose``; other cells use
            it as a debugging switch.
        jokes_path: Path of the CSV file to read. Point this at your own list
            of jokes to customise training; the notebook previously also used
            './shortjokes.csv' and './reddit_jokes.csv'.
    """

    def __init__(self, verbose = False, jokes_path = './reddit-cleanjokes.csv'):
        super().__init__()
        self.joke_list = []
        self.end_of_text_token = "<|endoftext|>"
        self.verbose = verbose

        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(jokes_path, newline='') as csv_file:
            # NOTE(review): every row is ingested, so if the file starts with a
            # header row it becomes a "joke" too — confirm against the CSV.
            for row in csv.reader(csv_file, delimiter=','):
                # Blank or truncated lines have no joke column; skip them
                # instead of failing with IndexError.
                if len(row) < 2:
                    continue
                self.joke_list.append(f"JOKE:{row[1]}{self.end_of_text_token}")

    def __len__(self):
        """Return the number of jokes loaded."""
        return len(self.joke_list)

    def __getitem__(self, item):
        """Return the formatted joke string at position ``item``."""
        return self.joke_list[item]

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torchsummary import summary

from transformers import get_constant_schedule_with_warmup

# Re-detect the device so this cell does not depend on the value set earlier.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

dataset = JokesDataset()
# One joke per item: the training loop packs several jokes into one sequence itself.
joke_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# model params
BATCH_SIZE = 16        # packed sequences accumulated before each optimizer step
EPOCHS = 5
LEARNING_RATE = 3e-5
# NOTE(review): the training output recorded in this notebook shows no
# "sum loss" line, which the loop prints every 100 optimizer steps. If that
# output belongs to this configuration, fewer than 100 steps ran in total and
# the LR never left the first ~2% of the warm-up ramp — consider lowering.
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400      # maximum number of tokens in one packed training sequence

# initialising model instance
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The total number of optimizer steps is not known up front (it depends on how
# jokes get packed), so hold the LR constant after warm-up. The previous
# linear schedule with num_training_steps=-1 dropped the LR to 0 as soon as
# warm-up finished.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS)

# Running counters shared with the training loop cell.
proc_seq_count = 0   # sequences back-propagated since the last optimizer step
sum_loss = 0.0       # loss accumulated since the last report
batch_count = 0      # optimizer steps since the last report

tmp_jokes_tens = None   # packed sequence currently being filled by the training loop
models_folder = "trained_models"
os.makedirs(models_folder, exist_ok=True)

# for debugging, set dataset.verbose = True when initialising
if dataset.verbose:
    # get_device_name() needs a CUDA device, so only call it when one is in use.
    device_name = torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'
    print(f'Training on: {device_name}\n')
    print(f'Preview model summary:\n{model}')



In [None]:
# Fine-tuning loop. Jokes are packed back-to-back into sequences of at most
# MAX_SEQ_LEN tokens; each full sequence is back-propagated on its own and the
# optimizer steps once every BATCH_SIZE sequences (gradient accumulation).

# The generation cell switches the model to eval mode; make sure training mode
# is active again if this cell is re-run afterwards.
model.train()

for epoch in range(EPOCHS):
    print("-----------------------------------")
    print("Epoch %d" % (epoch+1))
    print("-----------------------------------")

    for idx, joke in enumerate(joke_loader):

        #################### "Fit as many joke sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        joke_tens = torch.tensor(tokenizer.encode(joke[0])).unsqueeze(0).to(device)
        # Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if joke_tens.size()[1] > MAX_SEQ_LEN:
            continue

        # The first joke of a new packed sequence
        if not torch.is_tensor(tmp_jokes_tens):
            tmp_jokes_tens = joke_tens
            continue

        if tmp_jokes_tens.size()[1] + joke_tens.size()[1] <= MAX_SEQ_LEN:
            # The joke still fits: append it and keep filling. The whole joke
            # is appended — the GPT-2 tokenizer does not prepend a start token,
            # so slicing off position 0 would drop the first piece of the
            # "JOKE:" prefix of every appended joke.
            tmp_jokes_tens = torch.cat([tmp_jokes_tens, joke_tens], dim=1)
            continue

        # The next joke does not fit, so the packed sequence is processed and
        # the current joke becomes the start of the next sequence.
        work_jokes_tens = tmp_jokes_tens
        tmp_jokes_tens = joke_tens
        ################## Sequence ready, process it through the model ##################

        outputs = model(work_jokes_tens, labels=work_jokes_tens)
        loss = outputs[0]   # with labels supplied, the first output is the LM loss
        loss.backward()
        sum_loss = sum_loss + loss.detach()

        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer holds all model parameters, so this clears every gradient.
            optimizer.zero_grad()

        # Report the accumulated loss every 100 optimizer steps.
        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_reddit_clean_{epoch}.pt"))


models_folder = "trained_models"

-----------------------------------
Epoch 1
-----------------------------------
-----------------------------------
Epoch 2
-----------------------------------
-----------------------------------
Epoch 3
-----------------------------------
-----------------------------------
Epoch 4
-----------------------------------
-----------------------------------
Epoch 5
-----------------------------------


# Generating jokes with/without start words

In [None]:
models_folder = "trained_models"
output_folder = "output"
num_jokes = 100
max_new_tokens = 100   # give up on a joke that has not ended after this many tokens

# change the model path if you trained a new model
model_epoch = 0        # checkpoint index; 0 is the file saved after the first epoch
output_run_id = 6      # suffix of the output file name
model_path = os.path.join(models_folder, f"gpt2_medium_reddit_clean_{model_epoch}.pt")
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load(model_path, map_location=device))

# open(..., 'a') does not create missing directories, so make sure the folder exists.
os.makedirs(output_folder, exist_ok=True)
jokes_output_file_path = os.path.join(output_folder, f'bar_generated_clean_{output_run_id}.jokes')

# determines how the joke should start
start_words = "JOKE: An infinite number of mathematicians walk into a bar."

model.eval()
# Start from an empty file: jokes are appended one at a time below.
if os.path.exists(jokes_output_file_path):
    os.remove(jokes_output_file_path)

# Loop-invariant values, computed once instead of on every step.
eos_token_ids = tokenizer.encode('<|endoftext|>')
prompt_ids = torch.tensor(tokenizer.encode(start_words)).unsqueeze(0).to(device)

joke_num = 0
with torch.no_grad():
    for joke_idx in range(num_jokes):
        joke_finished = False
        cur_ids = prompt_ids
        for i in range(max_new_tokens):
            # Only the logits are needed for sampling, so no labels are passed.
            logits = model(cur_ids)[0]
            # First (and only) batch element, distribution predicted after the last position.
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            # Sample from a wider pool for the first three tokens, then narrow it down.
            n = 20 if i < 3 else 3
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            # torch.cat returns a new tensor, so prompt_ids itself is never modified.
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

            if next_token_id in eos_token_ids:
                joke_finished = True
                break

        # Jokes that never reached the end-of-text token are discarded.
        if joke_finished:
            joke_num = joke_num + 1
            output_text = tokenizer.decode(cur_ids[0].tolist())

            with open(jokes_output_file_path, 'a') as f:
                f.write(f"{output_text} \n\n")

In [None]:
# Show the last joke that was written out. Be warned: the generated jokes can be really insensitive.
output_text

"JOKE:What's the difference between a gay and a racist? A black man can't get a job at a bank<|endoftext|>"

# Credits

**Data source:**
1. https://github.com/amoudgl/short-jokes-dataset
2. https://www.kaggle.com/datasets/abhinavmoudgil95/short-jokes

**Code adapted from:**
1. https://www.kaggle.com/code/leekeonshin/gru-jokes
2. https://towardsdatascience.com/teaching-gpt-2-a-sense-of-humor-fine-tuning-large-transformer-models-on-a-single-gpu-in-pytorch-59e8cec40912