In [None]:
!pip install transformers
!pip3 install sentencepiece

In [None]:
import datetime
import os
import time
import sys

import numpy as np
import random
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(seed=42)

In [None]:
from transformers import Trainer, TrainingArguments
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load model checkpoint from huggingface Library

Load the pretrained model you want to fine-tune, together with the matching tokenizer for that model.


In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Checkpoint identifier shared by the tokenizer and the model.
model_name = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load the weights first, then move the model onto the selected device.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

# Download and Prepare Data

Download the data from the GitHub repo. Load the datasets from the .json files and remove the unwanted columns. Split the combined data into training and validation sets. The categories in the validation data can later be used as prompts to generate jokes.

In [None]:
!git clone https://github.com/taivop/joke-dataset.git

In [None]:
# Load the Wocka jokes and discard the 'id' and 'title' columns.
y = pd.read_json('joke-dataset/wocka.json').drop(columns=['id', 'title'])

In [None]:
# Load the StupidStuff jokes and discard the 'id' and 'rating' columns.
z = pd.read_json('joke-dataset/stupidstuff.json').drop(columns=['id', 'rating'])

In [None]:
# Stack both joke sources into one frame. ignore_index=True renumbers the rows
# so index labels stay unique instead of repeating each source's own labels.
sum_data = pd.concat([y, z], ignore_index=True)
sum_data

In [None]:
# Shuffle every row with a fixed seed, then keep the first 99% for training
# and the remaining rows for validation.
sum_data = sum_data.sample(n=len(sum_data), random_state=20)
train_sub = int(0.99 * len(sum_data))

train_df = sum_data.iloc[:train_sub]
val_df = sum_data.iloc[train_sub:]

# The joke categories are the texts fed to the model ...
train_texts, val_texts = (list(frame['category']) for frame in (train_df, val_df))

# ... and the joke bodies are the texts it should produce.
train_decode, val_decode = (list(frame['body']) for frame in (train_df, val_df))

# Tokenize

Tokenize the data and wrap it in a PyTorch `Dataset` object for training.

In [None]:
def _encode_categories(texts):
    """Tokenize category names (model inputs): at most 16 tokens, padded to the longest."""
    return tokenizer(texts, max_length=16, truncation=True, padding='longest')


def _encode_jokes(texts):
    """Tokenize joke bodies (targets): at most 512 tokens, padded to the longest."""
    return tokenizer(texts, max_length=512, truncation=True, padding='longest')


train_encodings = _encode_categories(train_texts)
val_encodings = _encode_categories(val_texts)

train_labels = _encode_jokes(train_decode)
val_labels = _encode_jokes(val_decode)

In [None]:
class Summary_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            per-example sequence of values for the model inputs.
        labels: Mapping whose ``'input_ids'`` entry holds the per-example
            target token ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field of ``encodings`` is converted to a tensor, and the
        target token ids are stored under the ``'labels'`` key.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # NOTE(review): target ids are passed through as tokenized, so any
        # padding ids in them are kept as-is — confirm that is intended.
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Return the number of examples in the dataset.

        The count is taken from the target ids because ``len(self.encodings)``
        is the number of fields in the mapping, not the number of examples.
        """
        return len(self.labels['input_ids'])

In [None]:
# Wrap the tokenized inputs and targets of each split in a Summary_dataset.
train_dataset = Summary_dataset(encodings=train_encodings, labels=train_labels)
val_dataset = Summary_dataset(encodings=val_encodings, labels=val_labels)

# Training

In [None]:
# Fine-tuning hyperparameters, written as a mapping and expanded into
# TrainingArguments.
training_args = TrainingArguments(**{
    # Where results and logs are written.
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 5,
    # Training length and batch sizes (per device).
    'num_train_epochs': 100,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
    'eval_accumulation_steps': 1,
    # Optimizer: Adafactor instead of Adam, with an explicit learning rate.
    'learning_rate': 1e-4,
    'adafactor': True,
})

In [None]:
# Bundle the model, its training arguments and both data splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the model using the `trainer` built in the previous cell.
trainer.train()

# Generate Text

Generate text using different sets of arguments. You can find more on generating text here: [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate)

In [None]:
# Switch to inference mode so dropout is disabled while generating; the
# training run earlier in the notebook may have left the model in train mode.
model.eval()

# Encode a single category name as the prompt and move it to the model's device.
batch = tokenizer('Medical', truncation=True, padding='longest', return_tensors="pt").to(torch_device)

# Sampled beam search: 8 beams, nucleus sampling with top_p=0.92 (top_k=0),
# no repeated 2-grams, and a minimum length of 32 tokens.
generated = model.generate(**batch, min_length=32, do_sample=True, top_p=0.92, top_k=0, num_beams=8, no_repeat_ngram_size=2)

# Convert the generated token ids back to text, dropping special tokens.
tgt_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
tgt_text

# Save Model

In [None]:
# Write the fine-tuned model to the 'pegasus_jokes_2' directory.
trainer.save_model(output_dir='pegasus_jokes_2')

# Load saved model

In [None]:
from transformers import PegasusConfig

In [None]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The tokenizer was not changed by fine-tuning, so reload the original one.
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

# Directory written by `trainer.save_model(...)` earlier in the notebook.
# Loading from the directory lets the library find the config and weight
# files itself; a literal '*.config' path is never glob-expanded and an
# explicit weight-file name depends on the format the model was saved in.
saved_model_dir = 'pegasus_jokes_2'
config = PegasusConfig.from_pretrained(saved_model_dir)
model = PegasusForConditionalGeneration.from_pretrained(saved_model_dir, config=config).to(torch_device)