In [None]:
#https://github.com/rahul-jha98/JustJoking.ai

In [None]:
## Quietly installing transformers package to import
## the GPT2Tokenizer and TFGPT2LMHeadModel
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 4.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstall

## Imports

In [None]:
import tensorflow as tf

from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

import os
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

## Downloading the Data

The dataset is available at https://www.kaggle.com/abhinavmoudgil95/short-jokes/data

Sign in to Kaggle and begin the download process for the **shortjokes.csv** file. Then copy the link address of the download and update the `_URL` variable with it.

Once done, run all the cells in the notebook. You can also cancel the download process for the file. :)

In [None]:
#_URL = 'https://storage.googleapis.com/kaggle-data-sets/781%2F1457%2Fcompressed%2Fshortjokes.csv.zip?GoogleAccessId=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com&Expires=1591507341&Signature=FyX9Byp7qpIpWTuJi028%2F74JCoCSyi0r8%2FtCGTfx9H0jewKbi%2FGvXRt46owax54aYyDFCfzPmCAWPUKmm%2FKMYgZ%2BqsskQH%2F92PiuQlIT4fttjvKUNEpy14Dcd%2BNy4NCpqLUlU0TCoLgsYEak53yU23QbWBgus1HpFn7UXY1Az8TOjNRBQYk%2FXajaV1qlrrNKRC13K6v6WR2qTsL3tLbzalQWiPxfPv1TwQnqicmYdPxRkhiuv19iX7Y1qpp22ZSzUW6w80e9A5R%2BcAItllI43OrN9HRMljVyJMrDIHP%2FqLAsvTmc2yYBa2muwN4wNDwxOBGbYpsbC0I%2B2F2J1HZbAA%3D%3D'

In [None]:
# path_to_zip = tf.keras.utils.get_file('shortjokes.csv.zip', origin=_URL, extract=True)

# FILE_PATH = os.path.join(os.path.dirname(path_to_zip), 'shortjokes.csv')

## Preparing the Dataset

### Extracting jokes list from CSV

In [None]:
# Show full cell contents (no truncation) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the jokes CSV into a DataFrame and preview the first rows.
# The file is read from the hard-coded path /content/shortjokes.csv, so it
# must be present there before this cell is run.
jokes = pd.read_csv('/content/shortjokes.csv')
jokes.head()

FileNotFoundError: ignored

In [None]:
# Pull the 'Joke' column out of the DataFrame as a plain Python list and
# preview its first five entries.
jokeslist = jokes['Joke'].tolist()
jokeslist[:5]

### Creating the Tokenizer for word tokenization

In [None]:
# Load the tokenizer from the same 'gpt2' checkpoint name that the model in
# "The Model" section is loaded from, so both always come from one checkpoint.
Tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Register a padding token on the tokenizer so sequences can be padded to a
# fixed length; the return value is kept in num_added_toks.
# NOTE(review): the model's embeddings are not resized anywhere in the visible
# code — confirm that 'pad' already exists in the vocabulary (i.e. that
# num_added_toks is 0).
special_tokens_dict = {'pad_token': 'pad'}
num_added_toks = Tokenizer.add_special_tokens(special_tokens_dict)

# Text markers wrapped around every joke. They are plain strings: only the
# pad token above is passed to add_special_tokens.
START_TOKEN = '<|start|> '
END_TOKEN = ' <|end|>'

### Create Dataset from List

In [None]:
# A utility method to create a tf.data dataset from a List of jokes
def jokeslist_to_dataset(jokeslist, tokenizer, 
                  shuffle=True, batch_size=4, MAX_LEN = 64):
  """Tokenize a list of jokes and wrap the result in a batched tf.data dataset.

  Every joke is wrapped as START_TOKEN + joke + END_TOKEN and encoded to
  exactly MAX_LEN token ids: shorter jokes are padded with the tokenizer's pad
  token, longer ones are truncated (which can cut off the END_TOKEN marker).

  Args:
    jokeslist: list of joke strings.
    tokenizer: tokenizer used for encoding; it must have a pad token set.
    shuffle: when True, shuffle with a buffer covering the whole list.
    batch_size: number of jokes per batch.
    MAX_LEN: fixed number of tokens per encoded joke.

  Returns:
    A tf.data.Dataset yielding dicts with the keys 'input_ids',
    'attention_mask' and 'token_type_ids', each batched to batch_size.
  """
  wrapped_jokes = [START_TOKEN + joke + END_TOKEN for joke in jokeslist]

  ## `padding` / `truncation` replace the deprecated `pad_to_max_length`
  ## flag and make the truncation to MAX_LEN explicit instead of implicit.
  encodings = [tokenizer(joke,
                         add_special_tokens = True,
                         max_length = MAX_LEN,
                         padding = 'max_length',
                         truncation = True,
                         return_token_type_ids = True)
              for joke in wrapped_jokes]

  ## Regroup the per-joke encodings into one list per model input.
  inputs = {key: [encoding[key] for encoding in encodings]
            for key in ('input_ids', 'attention_mask', 'token_type_ids')}

  ds = tf.data.Dataset.from_tensor_slices(inputs)

  if shuffle:
    ## Buffer as large as the dataset gives a full uniform shuffle.
    ds = ds.shuffle(buffer_size=len(wrapped_jokes))

  ds = ds.batch(batch_size)

  return ds

Note: This is a costly process since all the tokenization is done up front, so it is expected to be slow. The advantage is that everything is processed once and kept in memory, which saves repeated operations while training.

In [None]:
# pip list

In [None]:
## In case you wish to only test the model do not run this cell
## Tokenizes every joke in jokeslist and builds the batched training dataset
## (default shuffle, batch size and maximum length).
jokes_dataset = jokeslist_to_dataset(jokeslist, Tokenizer)

In [None]:
# Sanity check: print the three encoded tensors of the first joke in the
# first batch.
for first_batch in jokes_dataset.take(1):
  print(first_batch['input_ids'][0],
        first_batch['attention_mask'][0],
        first_batch['token_type_ids'][0])

## The Model

In [None]:
# Load the pretrained 'gpt2' checkpoint as a TensorFlow GPT-2 model with a
# language-modelling head; this is the model that gets fine-tuned below.
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
model.summary()

## Loss Function and Optimizer

In [None]:
# Cross-entropy against integer token ids; from_logits=True because the
# training step feeds it the model's raw logits rather than probabilities.
loss_function = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Adam with a small fine-tuning learning rate; clipnorm=1.0 enables gradient
# norm clipping.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08, clipnorm=1.0)

## Checkpointing

In [None]:
# Colab only: mount Google Drive at /content/gdrive so that checkpoints and
# weights written under that path are stored on Drive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Directory (on the mounted Drive) where training checkpoints are kept.
CHECKPOINT_PATH = "/content/gdrive/MyDrive/jokes"

In [None]:
checkpoint_path = CHECKPOINT_PATH

# Only the model is tracked by the checkpoint (the optimizer is not passed in).
ckpt = tf.train.Checkpoint(model = model)

# The manager writes into checkpoint_path and keeps at most 5 checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

## Training

In [None]:
@tf.function
def train_step(data_dict):  
  """Run one optimisation step on a batch and return the batch loss.

  Args:
    data_dict: batch dict that is passed to the model as-is; it must contain
      'input_ids' and 'attention_mask'.

  Returns:
    Scalar loss tensor of the next-token objective for this batch.
  """
  with tf.GradientTape() as tape:

    ## training=True so that dropout is active while fine-tuning.
    outputs = model(data_dict, training=True)

    lm_logits = outputs[0]
    labels = data_dict['input_ids']

    ## Next-token objective: the logits at position t are scored against the
    ## token at position t+1, so we drop the last logits and the first label.
    shift_logits = lm_logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    ## The attention mask is 0 on padding. Shifted like the labels and used
    ## as per-token weight, it makes padding positions contribute zero loss
    ## instead of training the model to predict the pad token.
    ## With the loss object's default reduction the weighted sum is still
    ## divided by the total number of positions in the batch.
    shift_mask = tf.cast(data_dict['attention_mask'][..., 1:],
                         shift_logits.dtype)

    loss = loss_function(tf.reshape(shift_labels, (-1,)),
         tf.reshape(shift_logits, 
                   (-1, shift_logits.shape[-1])),
         sample_weight=tf.reshape(shift_mask, (-1,)))

  gradients = tape.gradient(loss, model.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return loss

In [None]:
EPOCHS = 2

for epoch in range(EPOCHS):

  # Wrap the dataset itself (not the enumerate object) so that tqdm can pick
  # up the number of batches, when it is known, and show a total / ETA.
  progress = tqdm(jokes_dataset, desc='Epoch {}'.format(epoch+1))

  for batch, data in enumerate(progress):
    loss = train_step(data)
    if batch % 100 == 0:
      # float(loss) logs the plain number instead of the Tensor repr.
      print('Epoch : {0} Batch : {1} ---- Loss : {2}'.format(epoch+1, batch+1, float(loss)))
      # Intermediate checkpoint every 100 batches so a disconnected session
      # loses little progress.
      ckpt_save_path = ckpt_manager.save()
      print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                        ckpt_save_path))

  # Checkpoint at the end of every epoch.
  ckpt_save_path = ckpt_manager.save()
  print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                        ckpt_save_path))

NameError: ignored

In [None]:
# assign location
# NOTE(review): the path has no '.h5' suffix, so despite the 'h5jokes' name
# the weights are presumably written in TensorFlow checkpoint format rather
# than HDF5 — confirm this is intended.
path='/content/gdrive/MyDrive/h5jokes'
 
# save
# Writes only the model weights (not the optimizer) to the Drive path above.
model.save_weights(path)

NameError: ignored

## Inference

### Generating Joke from Scratch

In [None]:
def exploit_best_token_while_exploring(probabilites, exploration_len=5):
    """Sample the next token id from the most probable candidates.

    Only the ``exploration_len`` highest-probability tokens are considered;
    one of them is drawn at random, weighted by its (renormalised) probability.

    Args:
        probabilites: 1-D array-like of probabilities, indexed by token id.
        exploration_len: number of top candidates to sample from. Values
            larger than the number of entries are clamped to that number.

    Returns:
        The chosen token id as a Python int.

    Raises:
        ValueError: if ``probabilites`` is empty or ``exploration_len`` < 1.
    """
    ## Accept lists as well as arrays; indexing below needs an ndarray
    probabilites = np.asarray(probabilites)

    if probabilites.size == 0:
        raise ValueError("probabilites must not be empty")
    if exploration_len < 1:
        raise ValueError("exploration_len must be at least 1")

    ## argpartition raises when asked for more candidates than exist,
    ## so never explore more tokens than the distribution contains
    exploration_len = min(int(exploration_len), probabilites.shape[0])

    ## Get the top k probabilites indices where k is exploration_len 
    top_indices = np.argpartition(probabilites, -exploration_len)\
                            [-exploration_len:]

    ## Getting top probabilities value
    top_probabilities = probabilites[top_indices]

    ## Normalizing it so that they sum to 1
    top_probabilities = top_probabilities / np.sum(top_probabilities)

    ## Although we choose next token randomly from our options we 
    ## pass the probabilities associated with each to account for the
    ## model's confidence for the token also. 
    choice = np.random.choice(exploration_len, 1, p = top_probabilities)

    ## Return next token id based on choice
    next_token_id = int(top_indices[choice][0])
    return next_token_id

In [None]:
def generate_joke(joke_length = 64):
  """Sample a joke from the model, starting from START_TOKEN alone.

  Args:
    joke_length: maximum number of tokens to generate after the start marker.

  Returns:
    The decoded text (start marker included) as soon as a generated token is
    one of the token ids END_TOKEN encodes to, or None when no such token is
    produced within joke_length steps.
  """

  ## Token ids that END_TOKEN is encoded to. This never changes between
  ## steps, so it is computed once instead of on every iteration.
  end_token_ids = set(Tokenizer.encode(END_TOKEN))

  ## Begin by appending the START_TOKEN to our current joke
  current_joke = tf.expand_dims(tf.convert_to_tensor(Tokenizer.encode(START_TOKEN)), 0)
  
  for pos in range(joke_length):

    ## Get output of model for the current_joke
    output = model(current_joke)

    ## Getting the logits value from output tuple i.e. (logits)
    logits = output[0]

    ## Logits is indexed as (BATCH, LEN_INPUT, VOCAB) 
    ## So, since batch size is 1 we get 0th index
    ## and the softmax for only the next possible word i.e. -1
    softmax_logits = tf.nn.softmax(logits[0, -1], axis=0).numpy()

    ## Depending on whether we are in initial or final stages of joke
    ## we determine how many options we should explore to make sure we have
    ## variety in jokes generated
    if pos == 0:
      # If we are predicting first word we need maximum exploration
      exploration_len = 50

    elif pos < 4:
      # The next three words have exploration length 15
      exploration_len = 15

    else:
      # As we move further we narrow our exploration length
      exploration_len = 10

    ## Get the token we should append to current joke
    token_to_append = exploit_best_token_while_exploring(softmax_logits, 
                                                         exploration_len)

    ## Append the token to current joke
    current_joke = tf.concat([current_joke, 
                              tf.ones((1,1), dtype = tf.int32)*token_to_append], 
                             axis = 1)
    
    ## In case the token belongs to the END_TOKEN we return it as complete joke.
    ## Any single token id of the encoded END_TOKEN ends the joke.
    if token_to_append in end_token_ids:
      return Tokenizer.decode(list(tf.squeeze(current_joke).numpy()))
  
  ## If we did not get end token it means no joke is formed
  return None

In [None]:
generate_joke()

NameError: ignored

In [None]:
generate_joke()

NameError: ignored

In [None]:
generate_joke()

NameError: ignored

In [None]:
generate_joke()

NameError: ignored

In [None]:
generate_joke()

NameError: ignored

In [None]:
generate_joke()

NameError: ignored

### Completing Sentences in Humorous Way

In [None]:
def complete_joke(initial_string = '', joke_length = 64, max_attempts = 20):
  """Let the model continue initial_string until it emits the end marker.

  Args:
    initial_string: text the joke must start with (placed after START_TOKEN).
    joke_length: maximum number of tokens generated per attempt.
    max_attempts: how many times generation is restarted from the prompt when
      an attempt does not reach the end marker. This bounds the retries, which
      were previously done by unbounded recursion.

  Returns:
    The decoded text (start marker and initial_string included) as soon as a
    generated token is one of the token ids END_TOKEN encodes to, or None when
    no attempt produced such a token.
  """

  ## Token ids that END_TOKEN is encoded to and the encoded prompt. Neither
  ## changes between steps or attempts, so both are computed once.
  end_token_ids = set(Tokenizer.encode(END_TOKEN))
  prompt_ids = Tokenizer.encode(START_TOKEN + initial_string)

  for attempt in range(max_attempts):

    ## Begin by appending the START_TOKEN along with initial string to 
    ## our current joke sentence
    current_joke = tf.expand_dims(tf.convert_to_tensor(prompt_ids), 0)

    for pos in range(joke_length):

      ## Get output of model for the current_joke
      output = model(current_joke)

      ## Getting the logits value from output tuple i.e. (logits)
      logits = output[0]

      ## Logits is indexed as (BATCH, LEN_INPUT, VOCAB) 
      ## So, since batch size is 1 we get 0th index
      ## and the softmax for only the next possible word i.e. -1
      softmax_logits = tf.nn.softmax(logits[0, -1], axis=0).numpy()

      ## Depending on whether we are in initial or final stages of joke
      ## we determine how many options we should explore to make sure we have
      ## variety in jokes generated
      if pos == 0:
        # If we are predicting first word we need maximum exploration
        exploration_len = 50

      elif pos < 4:
        # The next three words have exploration length 15
        exploration_len = 15

      else:
        # As we move further we narrow our exploration length
        exploration_len = 10

      ## Get the token we should append to current joke
      token_to_append = exploit_best_token_while_exploring(softmax_logits, 
                                                           exploration_len)

      ## Append the token to current joke
      current_joke = tf.concat([current_joke, 
                                tf.ones((1,1), dtype = tf.int32)*token_to_append], 
                               axis = 1)

      ## In case the token belongs to the END_TOKEN we return it as complete
      ## joke. Any single token id of the encoded END_TOKEN ends the joke.
      if token_to_append in end_token_ids:
        return Tokenizer.decode(list(tf.squeeze(current_joke).numpy()))

    ## If we did not get end token it means no joke is formed and we try again
  
  ## Every attempt ran out of tokens without reaching the end marker
  return None

In [None]:
complete_joke("Jokes are")

NameError: ignored

In [None]:
complete_joke("I am trained enough")

NameError: ignored

In [None]:
complete_joke("I can do this all day")

NameError: ignored

In [None]:
complete_joke("Hope is a good thing")

NameError: ignored

In [None]:
complete_joke("dog is running in the grass")

NameError: ignored

In [None]:
complete_joke("Why do")

NameError: ignored

In [None]:
complete_joke("Don't lie ")

"<|start|> Don't lie!!!<|"