#Source
https://github.com/falloutdurham/beginners-pytorch-deep-learning/blob/master/chapter9/Chapter9.5.ipynb
https://medium.com/swlh/fine-tuning-gpt-2-for-magic-the-gathering-flavour-text-generation-3bafd0f9bb93

## Install dependencies

In [None]:
!pip install --quiet transformers

In [None]:
import os

from tqdm import tqdm, trange
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

## Data

In [None]:
# Toy training corpus: one tweet-length string per sample.
# NOTE: every entry needs a trailing comma -- adjacent string literals without
# one are silently concatenated by Python into a single sample.
data = [
    "Do one scary thing every day.Today, I reached out to 40+ PermanentLink trial users who didn't convert to paying customers.Writing those emails was harder than I thought, but the responses I already received show that it's worth it.",
    "In all my years of sales, nothing can stand up to the breadth of enrichment data Clearbit can provide. Still, there are gaps, especially outside the US. Are there any similar enrichment tools you trust?",
    "How to design almost any UI element.",
    "Our cold email warm-up system is rockin. 750 accounts warming up right now and 71,800 warm-up emails exchanged between these accounts in the last 10 days. You can now manage your settings and see stats from the dashboard.",
    "You can’t hack building relationships. It’s not like a podcast that you can run at 2x speed. The hack is consistency. Show up every day, contribute every day, and give people time to fall in love with your generosity.",
    "I could spend hours playing with dark mode UIs. There's just something awesome about dark mode.",
]

## Loader
We need to create a structured dataset and dataloader to appropriately feed into the model.
We will use in-built PyTorch classes to define the dataset and dataloader, which will feed the neural network.
* For each text, the dataset object yields a tuple of two tensors.
* The first tensor is the encoded flavour text, wrapped in a start of text token, an end of text token and padded up to a maximum embedding length
* The second tensor is an attention mask, which is a list of 1's and 0's that tells the model which tokens are important

This CustomDataSet can be generalized to fit any tokenizer and datalist.

We're going to be using gpt2-small in this chapter, which has that limitation due to its hidden dimensionality of 768 (if you want to use larger pre-trained models, then you can increase this: gpt2-medium/1024, gpt2-large/1280, gpt2-xl/1600). Of course, because this dataset is only tweets, we're never going to bump up against the limit, but I thought I'd include it so that you are aware of the limitation.

In [None]:
class CustomDataset(Dataset):
    """Tokenized text samples for language-model fine-tuning.

    Every text is wrapped as ``'<|startoftext|>' + text + '<|endoftext|>'``,
    encoded with ``tokenizer.encode_plus`` and stored as two 1-D tensors: the
    token ids and the matching attention mask. Samples are NOT padded, so
    their lengths differ; batch them one at a time or use a padding collate
    function.

    Args:
        data: Iterable of raw text strings.
        tokenizer: Object whose ``encode_plus(text, truncation=...,
            max_length=...)`` returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: Maximum number of tokens kept per sample. Longer
            encodings are truncated by the tokenizer (which may drop the
            trailing ``<|endoftext|>`` marker of an over-long sample).
        verbose: When true (the default, matching the historical behavior),
            print every encoded tensor once the dataset is built. Pass
            ``False`` for anything but tiny corpora.
    """

    def __init__(self, data, tokenizer, max_length=768, verbose=True):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

        for text in data:
            encs = tokenizer.encode_plus(
                '<|startoftext|>' + text + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
            )
            self.input_ids.append(torch.tensor(encs['input_ids']))
            self.attn_masks.append(torch.tensor(encs['attention_mask']))

        if verbose:
            print('--- input_ids')
            print(self.input_ids)
            print('--- attn_masks')
            print(self.attn_masks)

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of sample ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

## Global objects

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2's stock vocabulary only knows '<|endoftext|>'. Register the
# '<|startoftext|>' marker used by the dataset and the generation prompt as a
# real special token so it maps to a single id instead of being split into
# several ordinary BPE pieces.
tokenizer.add_special_tokens({'bos_token': '<|startoftext|>'})

model = GPT2LMHeadModel.from_pretrained('gpt2')
# The vocabulary grew by one entry, so the embedding matrix must grow with it;
# otherwise the new token id would index past the end of the embeddings.
model.resize_token_embeddings(len(tokenizer))

dataset = CustomDataset(data, tokenizer)

## Training

Training GPT-2 involves passing our input text into the transformer model…and training the model to get the text back as output.

As for our training loop, given that our labels are our input, all we're really doing is:

```
outputs = model(input)
loss = loss_function(outputs, input)
loss.backward()
optimizer.step()
```


In [None]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into ``packed_tensor`` along dimension 1.

    Returns a 3-tuple ``(tensor, packed, leftover)``:

    * nothing packed yet (``packed_tensor is None``):
      ``(new_tensor, True, None)``
    * the combined length along dim 1 would exceed ``max_seq_len``:
      ``(packed_tensor, False, new_tensor)`` -- nothing is merged
    * otherwise: ``new_tensor`` followed by ``packed_tensor`` minus its first
      column, i.e. ``(merged, True, None)``
    """
    # First tensor of a pack: it simply becomes the pack.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        # Drop the first column of the existing pack before appending it.
        tail = packed_tensor[:, 1:]
        merged = torch.cat((new_tensor, tail), dim=1)
        return merged, True, None

    # No room left: hand both tensors back untouched.
    return packed_tensor, False, new_tensor

In [None]:
def train(
    dataset,
    model,
    tokenizer,
    batch_size=16,
    epochs=4,
    lr=2e-5,
    max_seq_len=400,
    warmup_steps=5000,
    gpt2_type="gpt2",
    device="cuda",
    output_dir=".",
    output_prefix="wreckgar",
    test_mode=False,
    save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with gradient accumulation.

    Samples are fed one at a time (they are not padded to a common length);
    gradients are accumulated over ``batch_size`` samples before each
    optimizer update. Gradients left over at the end of an epoch are applied
    as a final, smaller update so no sample is wasted.

    Args:
        dataset: Map-style dataset whose items are
            ``(input_ids, attention_mask)`` tensor pairs.
        model: Model called as ``model(input_ids, labels=...,
            attention_mask=...)`` whose first output is the loss.
        tokenizer: Unused; kept for interface compatibility.
        batch_size: Number of samples accumulated per optimizer update.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Unused; kept for interface compatibility.
        warmup_steps: Number of optimizer updates over which the learning
            rate ramps linearly from 0 to ``lr``. If this exceeds the total
            number of updates, the peak rate is never reached.
        gpt2_type: Unused; kept for interface compatibility.
        device: Device the model and the batches are moved to.
        output_dir: Directory for per-epoch checkpoints (created on demand).
        output_prefix: File-name prefix for checkpoints.
        test_mode: Unused; kept for interface compatibility.
        save_model_on_epoch: Save ``model.state_dict()`` after every epoch.

    Returns:
        The trained model (on ``device``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # One optimizer update per `batch_size` samples, plus one for the partial
    # group at the end of each epoch (ceil division).
    updates_per_epoch = -(-len(train_dataloader) // batch_size)
    total_updates = updates_per_epoch * epochs

    optimizer = AdamW(model.parameters(), lr=lr)
    # The schedule needs the real number of updates: with a negative value the
    # learning rate collapses to 0 as soon as the warmup phase is over.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_updates,
    )

    def apply_update():
        """Apply the accumulated gradients and reset them."""
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Start from clean gradients in case the model was trained before.
    optimizer.zero_grad()

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        pending = 0  # samples whose gradients are waiting to be applied
        for batch in train_dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            # Language modelling: the labels are the inputs themselves.
            outputs = model(
                input_ids, labels=input_ids, attention_mask=attention_mask
            )
            loss = outputs[0]
            # Gradients are summed (not averaged) over the accumulated
            # samples, as in the original implementation.
            loss.backward()

            pending += 1
            if pending == batch_size:
                apply_update()
                pending = 0

        # Flush the partial group so the checkpoint reflects every sample.
        if pending:
            apply_update()

        if save_model_on_epoch:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [None]:
!mkdir -p trained_models

In [None]:
# Fine-tune on the toy corpus and keep a checkpoint per epoch.
model = train(
    dataset,
    model,
    tokenizer,
    batch_size=16,
    epochs=1,
    lr=3e-5,
    max_seq_len=140,
    warmup_steps=5000,
    gpt2_type='gpt2',
    # Fall back to the CPU on machines without a CUDA device instead of
    # failing when the model is moved to "cuda".
    device="cuda" if torch.cuda.is_available() else "cpu",
    output_dir="./trained_models",
    output_prefix="twitter",
    save_model_on_epoch=True,
)

Training epoch 0


In [None]:
def _top_p_filter(logits, top_p, filter_value):
    """Mask (in place) every logit outside the top-p nucleus and return it.

    Tokens are sorted by descending logit; the smallest prefix whose
    cumulative probability exceeds ``top_p`` is kept (the most likely token
    is always kept) and every other position is set to ``filter_value``.
    Written for a single-row ``logits`` tensor, as used by ``generate``.
    """
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    sorted_indices_to_remove = cumulative_probs > top_p
    # Shift right so the first token that crosses the threshold is kept too.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    logits[:, indices_to_remove] = filter_value
    return logits


def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=100,
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` texts from ``model`` with top-p sampling.

    Each entry starts from ``prompt`` and grows one sampled token at a time
    until ``<|endoftext|>`` is produced or ``entry_length`` tokens have been
    added. Entries that hit the length limit get ``<|endoftext|>`` appended
    to their decoded text.

    Args:
        model: Language model whose first output (when called without
            labels) is the logits tensor.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        prompt: Text every entry starts from; must encode to >= 1 token.
        entry_count: Number of texts to generate.
        entry_length: Maximum number of new tokens per text.
        top_p: Cumulative-probability threshold of the nucleus.
        temperature: Logit divisor; non-positive values are treated as 1.0.

    Returns:
        List of decoded strings (prompt and special tokens included).

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    model.eval()

    # Build inputs on the model's own device so a model left on the GPU by
    # training can be used directly.
    device = next(model.parameters()).device

    prompt_ids = tokenizer.encode(prompt)
    if not prompt_ids:
        raise ValueError("prompt must encode to at least one token")

    # Loop invariants, computed once instead of on every sampling step.
    eos_ids = set(tokenizer.encode("<|endoftext|>"))
    scale = temperature if temperature > 0 else 1.0
    filter_value = -float("Inf")

    generated_list = []

    with torch.no_grad():
        for _ in trange(entry_count):
            entry_finished = False
            generated = torch.tensor(
                prompt_ids, dtype=torch.long, device=device
            ).unsqueeze(0)

            # Using top-p (nucleus sampling): https://github.com/huggingface/transformers/blob/master/examples/run_generation.py
            for _ in range(entry_length):
                # No labels are passed: only the logits are needed here, so
                # computing a loss would be wasted work.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / scale
                logits = _top_p_filter(logits, top_p, filter_value)

                next_token = torch.multinomial(
                    F.softmax(logits, dim=-1), num_samples=1
                )
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            # Index the single batch row rather than squeeze(), which turns a
            # one-token sequence into a 0-d tensor; tolist() also copies the
            # ids off the GPU.
            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

In [None]:
# Sample ten texts, each starting from the start-of-text marker.
generated_tweets = generate(
    model,
    tokenizer,
    "<|startoftext|>",
    entry_count=10,
)

100%|██████████| 10/10 [05:14<00:00, 31.47s/it]


In [None]:
# Show the raw list of generated strings.
print(generated_tweets)

['<|startoftext|>endofdocument</span> ".catch(title, endofdocument) <|replace "<|replace "#", " "> ".format(endofdocument, ",replace))} "<?xml version="1.0" encoding="utf-8"?> </body> </html> <?xml version="1.0" encoding="utf-8"?> </html>\n\nTutorial, here.<|endoftext|>', '<|startoftext|>=> []\n\nFor example, we could use Yarrow which exposes the QC_FAST__HS_FUNCTION_OPTS API.\n\nyarrow Yarrow :: CwdChar -> String\n\nReturn a vector which describes how to construct an edge edge. The position and outer values must be of type {W } with the Haskell type of Yarrow.\n\nclass Yarrow with M :: CwdChar -> S instance Yarrow a where X : [K ]<|endoftext|>', '<|startoftext|> <span>Hey Hey</span></body> </html>\n\n</html>\n\n<html>\n\n<head> <title>Enter the main html element</title>\n\n<link rel="stylesheet" href="css/main.css" />\n\n</head>\n\n<body>\n\n<div id="head"></div>\n\n<script type="text/javascript">\n\n(function() {\n\n(window) {\n<|endoftext|>', '<|startoftext|>|<|printable|>|<|firstla