In [None]:
%%time
%%capture
# Install Hugging Face Transformers into the *kernel's* environment.
# `%pip` (rather than `!pip`) guarantees the package lands in the interpreter
# this notebook is running on.
# TODO(review): pin the version that was used for the recorded run so that
# Restart & Run All stays reproducible.
%pip install transformers

CPU times: user 42.6 ms, sys: 109 ms, total: 151 ms
Wall time: 4.2 s


## Requirements Installation

In [None]:
#Requirements Installation 
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

# Record the PyTorch build in the notebook output for reproducibility.
print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.13.0+cu116


## Configurations

In [None]:
# ---------------------------------------------------------------------------
# Global configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# NOTE(review): DEBUG and INPUT_DIR are not referenced by any cell visible in
# this notebook - confirm whether they are still needed.
DEBUG           = False

INPUT_DIR       = 'articles'

# Mixed-precision settings (APEX_OPT_LEVEL is forwarded to TrainingArguments).
USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2-medium' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

# Extra tokens added to the tokenizer; BOS/SEP/EOS delimit
# "<BOS>keywords<SEP>sonnet<EOS>" training samples.
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}

# Every sample is padded/truncated to this many tokens.
# NOTE(review): must not exceed the model's n_positions (1024 in the logged
# gpt2-medium config), so 1280/1600 below would not be valid sequence lengths.
MAXLEN          = 768  #{768, 1024, 1280, 1600}

# Fraction of the sonnets used for training; the rest is the validation set.
TRAIN_SIZE      = 0.9

# Per-device batch size and gradient-accumulation steps.
# Both branches are currently identical; the split is kept so the two modes
# can be tuned independently.
if USE_APEX:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 20
# NOTE(review): 5e-3 is a large learning rate for fine-tuning; the recorded
# run shows an unstable validation loss during the first epochs.
LR              = 5e-3
EPS             = 1e-8
# Warm-up is a step *count*, so keep it an int (was the float 1e2).
# NOTE(review): the recorded run has only 40 optimization steps in total, so
# a 100-step warm-up never completes - confirm this is intended.
WARMUP_STEPS    = 100

SEED            = 2020

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of interpreters started after this
    # point; the running kernel's hash seed is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU (silently ignored when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run,
    # which defeats the deterministic setting above, so it must be off.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any shuffling or sampling happens below.
seed_everything(SEED)

## Clean Dataset

In [None]:
# Colab-only: mount Google Drive so the sonnet text files can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the raw sonnets; individual sonnets are separated by a blank line.
# The encoding is stated explicitly so the result does not depend on the
# runtime's default locale.
with open('/content/drive/MyDrive/COMP SCI 496: Gen Deep Models Final Project/Sonnets.txt',
          encoding='utf-8') as f:
    lines = f.read().split('\n\n')

In [None]:
# Word-level TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english', analyzer = 'word')

In [None]:
# Number of blank-line-separated chunks read from the sonnets file.
len(lines)

154

In [None]:
# Pick n candidate keywords for every sonnet.
n = 5
keywords = []
for line in lines:
    # Every space-separated token of the sonnet is fed to the vectorizer as
    # its own "document".
    X = tfidf.fit_transform(line.lower().split(' '))

    # get_feature_names() was removed from scikit-learn;
    # get_feature_names_out() is the supported replacement.
    feature_array = np.array(tfidf.get_feature_names_out())
    # NOTE(review): argsort runs row-wise on the (tokens x features) matrix
    # and is then flattened, so after the reversal the leading indices come
    # from the LAST token's row only - this does not rank words by a
    # sonnet-level score. Confirm whether a per-sonnet ranking was intended.
    tfidf_sorting = np.argsort(X.toarray()).flatten()[::-1]

    # Slice the index first so only n entries are gathered.
    top_n = feature_array[tfidf_sorting[:n]].tolist()
    keywords.append(top_n)


In [None]:
# Sanity check: repeat the keyword extraction for the first sonnet only.
X = tfidf.fit_transform(lines[0].lower().split(' '))

# get_feature_names() was removed from scikit-learn;
# get_feature_names_out() is the supported replacement.
feature_array = np.array(tfidf.get_feature_names_out())
# Ascending order here, so the last n entries are taken below (the loop
# version reverses the order instead).
tfidf_sorting = np.argsort(X.toarray()).flatten()

n = 5
top_n = feature_array[tfidf_sorting[-n:]].tolist()
top_n

['fresh', 'fuel', 'decease', 'world', 'thee']

In [None]:
# Keywords produced by the loop for the first sonnet (same words as top_n, reversed order).
keywords[0]

['thee', 'world', 'decease', 'fuel', 'fresh']

### Use Dataset


In [None]:
# Read the annotated dataset: each blank-line-separated entry is a keyword
# line followed by the sonnet text. The encoding is stated explicitly so the
# result does not depend on the runtime's default locale.
with open('/content/drive/MyDrive/COMP SCI 496: Gen Deep Models Final Project/Sonnets with keywords.txt',
          encoding='utf-8') as f:
    lines = f.read().split('\n\n')
len(lines)

154

In [None]:
# Split the first entry into its keyword line and the sonnet body.
lines[0].split('\n', 1)

['beauty, heir, memory, famine, grave',
 "From fairest creatures we desire increase,\nThat thereby beauty's rose might never die,\nBut as the riper should by time decease,\nHis tender heir might bear his memory:\nBut thou contracted to thine own bright eyes,\nFeed'st thy light's flame with self-substantial fuel,\nMaking a famine where abundance lies,\nThy self thy foe, to thy sweet self too cruel:\nThou that art now the world's fresh ornament,\nAnd only herald to the gaudy spring,\nWithin thine own bud buriest thy content,\nAnd, tender churl, mak'st waste in niggarding:\nPity the world, or else this glutton be,\nTo eat the world's due, by the grave and thee."]

In [None]:
%%time

# Map a running integer id to [keywords, text] for every entry.
data = {}

for sonnet_id, entry in enumerate(lines):
    # First line holds the ", "-separated keywords, the rest is the sonnet.
    # Dedicated local names avoid shadowing the builtin `id` and overwriting
    # the `keywords` list computed in the TF-IDF section.
    keyword_line, sonnet_text = entry.split('\n', 1)
    data[sonnet_id] = [keyword_line.split(', '), sonnet_text]


print(f"Number of articles: {len(data) :,}")

Number of articles: 154
CPU times: user 459 µs, sys: 0 ns, total: 459 µs
Wall time: 525 µs


In [None]:
# Print the id of every entry whose text does not consist of exactly 14 lines.
for sonnet_id, entry in data.items():
    body = entry[1]
    line_count = body.count('\n') + 1
    if line_count != 14:
        print(sonnet_id)

152


### Dataset and Loaders

In [None]:
class myDataset(Dataset):
    """Keyword-conditioned text dataset for causal language-model fine-tuning.

    Every item is the tokenized form of
    ``<bos>kw1,kw2,...<sep>text<eos>`` padded/truncated to ``MAXLEN`` tokens.
    """

    def __init__(self, data, tokenizer, randomize=True):
        """
        Args:
            data: mapping ``id -> [keywords, text]`` where ``keywords`` is a
                list of strings and ``text`` is a string.
            tokenizer: callable that encodes a string and returns a mapping
                with ``input_ids`` and ``attention_mask``.
            randomize: when True, the keywords of an item are randomly
                truncated and shuffled each time the item is fetched.
        """
        self.randomize = randomize
        self.tokenizer = tokenizer
        self.text      = [entry[1] for entry in data.values()]
        self.keywords  = [entry[0] for entry in data.values()]

    #---------------------------------------------#

    @staticmethod
    def join_keywords(keywords, randomize=True):
        """Join keywords with commas, optionally sub-sampling and shuffling.

        With ``randomize`` a random prefix of length 0..len(keywords) is
        kept and shuffled. The caller's list is never modified.

        Returns:
            The comma-joined keyword string (possibly empty).
        """
        N = len(keywords)

        #random sampling and shuffle
        if randomize:
            M = random.choice(range(N+1))
            # Slicing creates a new list, so the in-place shuffle below does
            # not touch the argument.
            keywords = keywords[:M]
            random.shuffle(keywords)

        return ','.join(keywords)

    #---------------------------------------------#

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    #---------------------------------------------#

    def __getitem__(self, i):
        """Encode sample ``i``.

        Returns:
            dict with tensors ``label``, ``input_ids`` and ``attention_mask``.
            ``label`` equals ``input_ids`` except that padding positions are
            set to -100 so they are ignored by the language-model loss.
        """
        keywords = self.keywords[i].copy()
        kw = self.join_keywords(keywords, self.randomize)

        sample = SPECIAL_TOKENS['bos_token'] + kw + SPECIAL_TOKENS['sep_token'] + \
                 self.text[i] + SPECIAL_TOKENS['eos_token']

        # Use the tokenizer handed to the constructor rather than whatever
        # the global name `tokenizer` currently refers to.
        encodings_dict = self.tokenizer(sample,
                                        truncation=True,
                                        max_length=MAXLEN,
                                        padding="max_length")

        input_ids = torch.tensor(encodings_dict['input_ids'])
        attention_mask = torch.tensor(encodings_dict['attention_mask'])

        # Mask padding out of the loss; otherwise most of each MAXLEN-long
        # target would consist of pad tokens.
        label = input_ids.clone()
        label[attention_mask == 0] = -100

        return {'label': label,
                'input_ids': input_ids,
                'attention_mask': attention_mask}

In [None]:
def split_data(data, S=TRAIN_SIZE):
    """Randomly partition ``data`` into a training and a validation dict.

    Args:
        data: mapping ``id -> sample``.
        S: fraction of the entries assigned to the training split.

    Returns:
        ``(train_data, val_data)``; the first ``int(S * len(data))`` shuffled
        ids form the training dict, the remainder the validation dict.
    """
    shuffled_ids = list(data.keys())
    random.shuffle(shuffled_ids)

    cut = int(S * len(data))

    train_data = {key: data[key] for key in shuffled_ids[:cut]}
    val_data = {key: data[key] for key in shuffled_ids[cut:]}

    return train_data, val_data


## Loading Tokenizer, Config and Model 

In [None]:
def get_tokenizer(special_tokens=None):
    """Load the pretrained tokenizer for ``MODEL``.

    Args:
        special_tokens: optional dict passed to ``add_special_tokens``; when
            given, a confirmation line is printed.

    Returns:
        The tokenizer instance.
    """
    tok = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if not special_tokens:
        return tok

    tok.add_special_tokens(special_tokens)
    print("Special tokens added")
    return tok

def get_model(tokenizer, special_tokens=None, load_model_path=None):
    """Build the ``MODEL`` network, optionally restoring fine-tuned weights.

    Args:
        tokenizer: tokenizer whose special-token ids are written into the
            model config.
        special_tokens: when truthy, the bos/eos/sep/pad ids are taken from
            the tokenizer and the embedding matrix is resized to
            ``len(tokenizer)``; otherwise the eos id is reused for padding.
        load_model_path: optional path to a state dict saved with
            ``torch.save``.

    Returns:
        The model, moved to the GPU when one is available, else left on CPU.
    """
    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL,
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else:
        config = AutoConfig.from_pretrained(MODEL,
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # Load onto the CPU first so a checkpoint written on a GPU machine
        # can also be restored on a CPU-only runtime.
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        state_dict = torch.load(load_model_path, map_location="cpu")
        model.load_state_dict(state_dict)

    # Same as model.cuda() on a GPU runtime, but no crash without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model

In [None]:
%%time

# Build the tokenizer with the custom special tokens, then the matching model.
# To resume from saved weights instead, additionally pass
# load_model_path='pytorch_model.bin' to get_model.
tokenizer = get_tokenizer(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, special_tokens=SPECIAL_TOKENS)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",


Special tokens added


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "pad_token_id": 50260,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "sep_token_id": 50261,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specifi

CPU times: user 5.82 s, sys: 2.3 s, total: 8.12 s
Wall time: 16.3 s


In [None]:
# Freeze everything, then re-enable training for:
#   - the last UNFREEZE_LAST_N transformer blocks,
#   - the final layer norm,
#   - the language-model head.
for parameter in model.parameters():
    parameter.requires_grad = False

# Derive the block count from the model itself: a hard-coded 12 is wrong for
# gpt2-medium, whose logged config reports n_layer = 24.
num_blocks = len(model.transformer.h)
first_trainable = max(0, num_blocks - UNFREEZE_LAST_N)

for i, m in enumerate(model.transformer.h):
    #Only un-freeze the last n transformer blocks
    if i >= first_trainable:
        for parameter in m.parameters():
            parameter.requires_grad = True

for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

In [None]:
# Random train/validation split of the id -> [keywords, text] mapping.
train_data, val_data = split_data(data)

# Training items get randomly truncated/shuffled keywords on every access;
# validation items always use the full keyword list in its original order.
train_dataset = myDataset(train_data, tokenizer)
val_dataset = myDataset(val_data, tokenizer, randomize=False)

# Show one encoded sample and the split sizes.
print(train_dataset[0])
f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

{'label': tensor([50257, 23701, 50261,  3792,   340, 11906,   481,    11, 11906,  2939,
          815,  1394,  1280,   198,  3666,  4334, 29708,  2340,   284,   262,
        34730,  1755,    30,   198,    35,   455, 14210,  6227,   616,  1017,
        17024,   815,   307,  5445,    11,   198,  3633, 16187,   588,   284,
        17903,   466, 15290,   616,  6504,    30,   198,  3792,   340, 11906,
         4437,   326, 14210,  3758,   338,    83,   422, 17903,   198,  2396,
         1290,   422,  1363,   656,   616, 23777,   284,   279,   563,    11,
          198,  2514,  1064,   503,   427,  1047,   290, 21696,  2250,   287,
          502,    11,   198,   464,  8354,   290,  3478,   273,   286, 11906,
        35394,    30,   198,    46,    11,   645,     0, 11906,  1842,    11,
          996,   881,    11,   318,   407,   523,  1049,    25,   198,  1026,
          318,   616,  1842,   326,  7622,  6164,  4151, 21693,    25,   198,
        24461,   898,  2081,  1842,   326,   288,   84

'There are 138 samples for training, and 16 samples for validation testing'

## Finetune GPT-2 using Trainer 

In [None]:
%%time

# Hyper-parameters for the Hugging Face Trainer; values come from the
# configuration cell.
training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=True,
    # NOTE(review): the recorded log reports the "cuda_amp" backend, so this
    # Apex level is presumably unused - confirm.
    fp16_opt_level=APEX_OPT_LEVEL,
    # Warm-up is a step count; cast guards against a float such as 1e2.
    warmup_steps=int(WARMUP_STEPS),
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=False,
    logging_steps=20
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
# Fine-tune, then write the final weights to output_dir.
trainer.train()
trainer.save_model()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 138
  Num Epochs = 20
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 32
  Total optimization steps = 40
  Number of trainable parameters = 253009920
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,25.735849
1,No log,25.735849
2,No log,30.652786
3,No log,4.018051
4,No log,6.492215
5,No log,19.363756
6,No log,1.651411
7,No log,1.509197
8,No log,1.239179
9,13.338900,1.109463


***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 16
  Batch size = 2
***** Running Evaluation *****
  Num examples = 

CPU times: user 11min 3s, sys: 1min 36s, total: 12min 39s
Wall time: 12min 50s


In [None]:
# Save to G-Drive ----------------------------------#
# Copies the weights file from the current working directory to Drive.
# NOTE(review): the commented-out restore command in the next section reads
# from '.../Colab Notebooks/Text Generation/', a different folder than the
# one written here - confirm which path is intended.
!cp -r 'pytorch_model.bin' '/content/drive/MyDrive/Colab Notebooks/pytorch_model_V2.bin'

## Generating Text with Fine-tuned GPT-2 Model 

In [None]:
# !cp -r '/content/drive/MyDrive/Colab Notebooks/Text Generation/pytorch_model_V2.bin' 'pytorch_model.bin' 

In [None]:
# Rebuild the tokenizer and model with the special tokens, then restore the
# fine-tuned weights from the file written by trainer.save_model().
tokenizer = get_tokenizer(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='/content/pytorch_model.bin')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",


Special tokens added


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50257,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "pad_token_id": 50260,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "sep_token_id": 50261,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specifi

In [None]:
# Conditioning keywords, joined in the given order (no sampling/shuffling).
keywords = ['love', 'milk', 'nice', 'sacrifice', 'heart']
kw = myDataset.join_keywords(keywords, randomize=False)

# Prompt layout mirrors the training samples: <BOS>keywords<SEP>
prompt = f"{SPECIAL_TOKENS['bos_token']}{kw}{SPECIAL_TOKENS['sep_token']}"
print(prompt)

# Encode as a batch containing a single sequence.
prompt_ids = tokenizer.encode(prompt)
generated = torch.tensor([prompt_ids])
print(generated)
device = torch.device("cuda")
generated = generated.to(device)

# Inference mode; the trailing semicolon suppresses the model repr.
model.eval();

<|BOS|>love,milk,nice,sacrifice,heart<|SEP|>
tensor([[50257, 23205,    11, 25433,    74,    11, 44460,    11, 30584, 31932,
            11, 11499, 50261]])


In [None]:
# Top-p (nucleus) text generation (10 samples):
sampling_options = dict(
    do_sample=True,
    min_length=50,
    max_length=MAXLEN,
    top_k=30,
    top_p=0.7,
    temperature=0.9,
    repetition_penalty=2.0,
    num_return_sequences=10,
)
sample_outputs = model.generate(generated, **sampling_options)

# The decoded text starts with the joined keywords; skip that prefix.
a = len(','.join(keywords))
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i+1}: {text[a:]}\n\n")

1: So then do not despair
When I have seen thee frown upon my deeds: yet be wise; thou know'st mine eye is wakened. Time and fortune will bear this out straight!  And when in doubt thy heart begins to groan with griefs-- 'tis folly so vile that it cannot go well.' Three winters cold were the summer of Love lived for love alone on his loving breast-razing days thus far from home... but three summers' pride had stol'.
And now thine eyes are raven black as hellish night doth live ere long hence still green leaves can see through these wastes quite ill? Is't winter a joy where sweets meet each hour dyed gay'? Proud monarch how happy you make me think ye look day byday!'


2: O! how I love thee so;
I hate that which thou hast not had: yet do allow me this slander thus. 'Thou usurer' wilt be mine heir and my glory to triumph in joy? And though thy heart doth teach it is thine inward Muse's fault--  If such a soul live with others but as an eye hath taught them ''To shun', what shall they say

In [None]:
# Beam-search text generation with sampling enabled
# (num_beams=5 together with do_sample=True):
beam_options = dict(
    do_sample=True,
    max_length=MAXLEN,
    num_beams=5,
    repetition_penalty=10.0,
    early_stopping=True,
    num_return_sequences=2,
)
sample_outputs = model.generate(generated, **beam_options)

# The decoded text starts with the joined keywords; skip that prefix.
a = len(','.join(keywords))
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i+1}: {text[a:]}\n\n")

1: Thy love is more than enough to make my heart groan;
For then I think on thee, and weep afresh:
And yet this sorrow doth not vex me so much,
Nor mine eye well knows what conscience is,
To say the least of thy worth.
I have sworn thee fair, but thou know'st
Thou art fairest in all men's eyes,
And 'gainst thy self thine own sweet graces add a greater grief!
O! none loves her that she fears will be wrongfully mistaking,
The loss of whom she thinks best disposed wilt bear ill.
Thus vainly thinking on thee, whilst others look elsewhere,
My thoughts are as black as night, till heaven clears,
And makes them see new faces every hour,
As those mourning eyes which mourn for their loved one.


2: No longer mourn for me when I am gone;
I have left my love alone, and found a better state
Than that which mine own heart hath in store:
So thou know'st thy self to be woe,
And will bear thine eye straight, no matter how it turns
In loving spite of ill-wresting hands,
When others' graces do themselves

## Generating Text with Raw GPT-2

In [None]:
# Baseline: the pretrained model without special tokens or fine-tuned weights.
# This rebinds the global `tokenizer` and `model` names.
tokenizer = get_tokenizer()
model = get_model(tokenizer)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",


In [None]:
prompt = "Create a shakespearean sonnet for me: "

# Encode the prompt as a batch of one sequence and move it to the GPU.
device = torch.device("cuda")
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)

model.eval()
generation_options = dict(
    do_sample=True,
    max_length=MAXLEN,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=1,
)
sample_outputs = model.generate(generated, **generation_options)

for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i}: {decoded}\n\n")

0: Create a shakespearean sonnet for me:  I'm afraid I'll have to write it from memory, but here's the first draft.
This is my version of Shakespeare's "A Midsummer Night's Dream" in its original form (with some minor changes).




In [None]:
prompt = "Generate a shakespearean sonnet that starts with: O! lest the world should task you to recite "

# Encode the prompt as a batch of one sequence and move it to the GPU.
device = torch.device("cuda")
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)

model.eval()
generation_options = dict(
    do_sample=True,
    max_length=MAXLEN,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=1,
)
sample_outputs = model.generate(generated, **generation_options)

for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i}: {decoded}\n\n")

0: Generate a shakespearean sonnet that starts with: O! lest the world should task you to recite __________.

I don't know about you, but I'm not going to be writing my own sonnets any time soon.


