Complementary notebook for finetuning a model. Refer to the full paper for a complete walkthrough.

In [1]:
import random
import torch
import csv
import spacy
import re
import json

import pandas as pd
import numpy as np
import torch.nn.functional as F

from tqdm import trange
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2", pad_token="<|endoftext|>")
configuration = GPT2Config.from_pretrained("distilgpt2", output_hidden_states=True, output_attention=True)
model = GPT2LMHeadModel.from_pretrained("distilgpt2", config=configuration)

Downloading: 100%|██████████| 0.99M/0.99M [00:00<00:00, 17.2MB/s]
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 14.5MB/s]
Downloading: 100%|██████████| 762/762 [00:00<00:00, 164kB/s]
Downloading: 100%|██████████| 336M/336M [00:26<00:00, 13.5MB/s]


Example output without finetuning

In [None]:
input_ids = tokenizer.encode(
    "Neural network models can use attention mechanisms to direct their focus. This can help with",
    return_tensors="pt",
)

sample_outputs = model.generate(
    input_ids, 
    max_length=len(input_ids[0]) + 12,
    do_sample=True, 
    top_k=50,
    no_repeat_ngram_size=2, 
    top_p=0.8, 
    temperature=0.9,
    num_return_sequences=3,
    return_dict_in_output=True,
    output_scores=True
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Output:
----------------------------------------------------------------------------------------------------
0: Neural network models can use attention mechanisms to direct their focus. This can help with the ability to predict the level of interest of a particular group
1: Neural network models can use attention mechanisms to direct their focus. This can help with spatial memory and other learning mechanisms. The goal of this paper
2: Neural network models can use attention mechanisms to direct their focus. This can help with the ability to identify and identify potential patterns in a variety of


In [None]:
with open("./data.json", "r") as read_file:
    data = json.load(read_file)

In [None]:
max_length = max([len(tokenizer.encode(content)) for content in data]) + 2

max_length

172

In [None]:
import torch

torch.manual_seed(42)
from torch.utils.data import Dataset


class FinetuneDataset(Dataset):
    def __init__(self, data, tokenizer, gpt2_type="distilgpt2", max_length=max_length):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for content in data:
            encodings_dict = tokenizer(
                tokenizer.bos_token + content + tokenizer.eos_token,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )

            self.input_ids.append(torch.tensor(encodings_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encodings_dict["attention_mask"]))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


In [None]:
from torch.utils.data import random_split

dataset = FinetuneDataset(data, tokenizer, max_length=max_length)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

f'There are {train_size} samples for training, and {val_size} samples for validation testing'

'There are 1681 samples for training, and 187 samples for validation testing'

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

bs = 32
train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset),
            batch_size = bs
        )

validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset),
            batch_size = bs 
        )

In [None]:
import random
import numpy as np

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7f46889e58b0>

In [None]:
epochs = 3
warmup_steps = 1e2
sample_every = 100

In [None]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-4, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [None]:
import random
import time
import datetime


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))


total_t0 = time.time()

training_stats = []

for epoch_i in range(0, epochs):

    print(f"Beginning epoch {epoch_i + 1} of {epochs}")
    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0]
        b_labels = batch[0]
        b_masks = batch[1]

        model.zero_grad()

        outputs = model(
            b_input_ids, labels=b_labels, attention_mask=b_masks, token_type_ids=None
        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every 100 batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print(
                f"Batch {step} of {len(train_dataloader)}. Loss:{batch_loss}. Time:{elapsed}"
            )

            model.eval()

            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1,
            )
            for i, sample_output in enumerate(sample_outputs):
                print(
                    f"Example output: {tokenizer.decode(sample_output, skip_special_tokens=True)}"
                )

            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print(f"Average Training Loss: {avg_train_loss}. Epoch time: {training_time}")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0]
        b_labels = batch[0]
        b_masks = batch[1]

        with torch.no_grad():

            outputs = model(b_input_ids, attention_mask=b_masks, labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print(f"Validation loss: {avg_val_loss}. Validation Time: {validation_time}")

    # Record all statistics from this epoch.
    training_stats.append(
        {
            "epoch": epoch_i + 1,
            "Training Loss": avg_train_loss,
            "Valid. Loss": avg_val_loss,
            "Training Time": training_time,
            "Validation Time": validation_time,
        }
    )

print(f"Total training took {format_time(time.time()-total_t0)}")


Beginning epoch 1 of 6
Average Training Loss: 2.480187913156905. Epoch time: 0:08:39
Validation loss: 0.9046754837036133. Validation Time: 0:00:18
Beginning epoch 2 of 6
Average Training Loss: 0.8645765084140705. Epoch time: 0:10:08
Validation loss: 0.8574422101179758. Validation Time: 0:00:19
Beginning epoch 3 of 6
Average Training Loss: 0.7577598690986633. Epoch time: 0:08:27
Validation loss: 0.8441044290860494. Validation Time: 0:00:17
Beginning epoch 4 of 6
Average Training Loss: 0.6670659690533044. Epoch time: 0:08:31
Validation loss: 0.8529025316238403. Validation Time: 0:00:18
Beginning epoch 5 of 6
Average Training Loss: 0.6000708830806444. Epoch time: 0:08:19
Validation loss: 0.8662164111932119. Validation Time: 0:00:19
Beginning epoch 6 of 6
Average Training Loss: 0.5509948156914621. Epoch time: 0:08:28
Validation loss: 0.885182112455368. Validation Time: 0:00:18
Total training took 0:54:21


In [None]:
model.save_pretrained('./modelv2/')
tokenizer.save_pretrained('./modelv2/')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.json',
 './model/merges.txt',
 './model/added_tokens.json')

In [None]:
finetuned_tokenizer = GPT2Tokenizer.from_pretrained('./model/')
finetuned_model = GPT2LMHeadModel.from_pretrained('./model/')

Inferences from the fine-tuned model. Notice how the network has learned to capitalize keywords such as "Neural Networks" and "Memory Systems".

In [None]:
input_ids = finetuned_tokenizer.encode(
    "Neural network models can use attention mechanisms to direct their focus. This can help with",
    return_tensors="pt",
)

sample_outputs = finetuned_model.generate(
    input_ids, 
    max_length=len(input_ids[0]) + 12,
    do_sample=True, 
    top_k=50,
    no_repeat_ngram_size=2, 
    top_p=0.8, 
    temperature=0.9,
    num_return_sequences=3,
    return_dict_in_output=True,
    output_scores=True
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Output:
----------------------------------------------------------------------------------------------------
0: Neural network models can use attention mechanisms to direct their focus. This can help with the interpretation of Neural Networks.
1: Neural network models can use attention mechanisms to direct their focus. This can help with the index on Memory Systems.
2: Neural network models can use attention mechanisms to direct their focus. This can help with Attentional Neural Networks to learn how to train a trained model


To run this model on the client, I've converted it to onnx with `python -m transformers.onnx --model ../model --framework --feature=causal-lm pt .`

All od the following code will later be converted to javscript with [`onnx-runtime-node`](https://www.npmjs.com/package/onnxruntime-node).

In [7]:
from onnxruntime import InferenceSession
session = InferenceSession("onnx/model.onnx")
finetuned_tokenizer = GPT2Tokenizer.from_pretrained('./model/')

In [88]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

def normalize(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def generate(
    model,
    tokenizer,
    prompt,
    max_length=16,
    temperature=0.9,
    repetition_penalty=0.8,
    top_k=50,
    top_p=0.9,
    max_context_length=1024,
):
    with torch.no_grad():
        break_tokens = [764, 50256, 198, 13]

        generated = tokenizer(prompt, return_tensors="pt")

        new_tokens = generated["input_ids"][0]
        new_logits = []
        temperature = temperature
        repetition_penalty = repetition_penalty
        top_k = top_k
        top_p = top_p


        for _ in trange(max_length):
            outputs = model.run(
                None,
                input_feed={
                    "input_ids": generated["input_ids"].cpu().numpy(),
                    "attention_mask": generated["attention_mask"].cpu().numpy(),
                },
            )

            logits = torch.tensor(outputs[0][0])
            attention = outputs[-1][0]

            next_token_logits = logits[-1, :] / (
                temperature if temperature > 0 else 1.0
            )

            new_logits.append(next_token_logits)
            for _ in set(generated["input_ids"].view(-1).tolist()):
                next_token_logits[_] /= repetition_penalty
            if temperature == 0:  # greedy sampling:
                next_token = torch.argmax(next_token_logits).unsqueeze(0)
            else:
                filtered_logits = top_k_top_p_filtering(
                    next_token_logits, top_k=top_k, top_p=top_p
                )
                next_token = torch.multinomial(
                    F.softmax(filtered_logits, dim=-1), num_samples=1
                )

            generated["input_ids"] = torch.cat(
                (generated["input_ids"], next_token.unsqueeze(0)), dim=1
            )
            generated["attention_mask"] = torch.cat(
                (generated["attention_mask"], torch.tensor([1]).unsqueeze(0)), dim=1
            )
            new_tokens = torch.cat((new_tokens, next_token), 0)

            next_token_logit = next_token.item()
            if next_token_logit in break_tokens:
                break

        normalized_attentions = normalize(np.mean(attention, axis=(1)))
        print("\n\nNormalized attentions\n---")
        for i, a in enumerate(normalized_attentions):
            print(finetuned_tokenizer.decode(new_tokens[i]).strip(), a)

        return tokenizer.decode(new_tokens, skip_special_tokens=True)


In [90]:
input = "Neural network models can use attention mechanisms to direct their focus. This can help with"
output = generate(session, finetuned_tokenizer, input)

output


 88%|████████▊ | 14/16 [00:01<00:00,  8.57it/s]

Normalized attentions
---
Ne 0.39964992
ural 0.5156584
network 0.6587709
models 0.9730898
can 0.0834508
use 0.10494143
attention 0.73390186
mechanisms 0.8171208
to 0.22891888
direct 0.4344021
their 0.22744419
focus 0.36719105
. 1.0
This 0.88731384
can 0.22770312
help 0.0
with 0.033685226
the 0.07412371
Attention 0.7709657
Distribution 0.50096774
Map 0.7513429
, 0.5973961
which 0.749522
shows 0.11179085
how 0.20706418
activation 0.2883157
of 0.2310062
these 0.43793496
neural 0.43947598
networks 0.55677927
is 0.7408395



'Neural network models can use attention mechanisms to direct their focus. This can help with the Attention Distribution Map, which shows how activation of these neural networks is.'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ad10b37e-1254-49b1-9814-3334468ab840' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>