In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import trange
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
# GPUを使う場合の設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

## Load Model

In [4]:
models_address = "./models/gpt2-large"

In [5]:
# !mkdir $models_address
# !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin -O $models_address/pytorch_model.bin
# !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json -O $models_address/config.json

In [6]:
model = GPT2LMHeadModel.from_pretrained(models_address)
model.to(device)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at ./models/gpt2-large and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'h.12.attn.masked_bias', 'h.13.attn.masked_bias', 'h.14.attn.masked_bias', 'h.15.attn.masked_bias', 'h.16.attn.masked_bias', 'h.17.attn.masked_bias', 'h.18.attn.masked_bias', 'h.19.attn.masked_bias', 'h.20.attn.masked_bias', 'h.21.attn.masked_bias', 'h.22.attn.masked_bias', 'h.23.attn.masked_bias', 'h.24.attn.masked_bias', 'h.25.attn.masked_bias', 'h.26.attn.masked_bias', 'h.27.attn.masked_bias', 'h.28.attn.masked_bias', 'h.29.attn.masked_bias', 'h.30.attn.masked_bias', 'h.31.attn.masked_bias', 'h.32.attn.masked_bias', 'h.33.attn.masked_bias', 'h.34.attn.masked_bias', 'h.35

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [7]:
# !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json -O $models_address/vocab.json
# !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt -O $models_address/merges.txt

In [9]:
tokenizer =  GPT2Tokenizer.from_pretrained(models_address)

## 文章生成に用いるパラメータ

- Seed, int num for getting a random num
- Temperature, int num to treat as a magic num to make the generating process unpredictable
- Max_len, int num to define how long the text the model will generate
- Top_k, int num help the model only pick top K possible candidate token base on the context for each run of next token prediction
- Top_p, float num to filter the next predict token, only when the next token's possibility higher than this num, can be taken into consideration for the predicting process

In [10]:
seed = 4

np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

temperature = 1.0
max_len = 30
top_k = 100
top_p = 0.8

## 文章生成テスト

In [11]:
# Demo test text, the generation will base on this sentence
raw_text = "Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976."

### トークンへの変換

In [12]:
context_tokens = tokenizer.encode(raw_text)
context_tokens

[16108,
 373,
 9393,
 416,
 6542,
 19161,
 11,
 6542,
 370,
 8590,
 8461,
 461,
 11,
 290,
 14430,
 13329,
 287,
 3035,
 15408,
 13]

### 文章生成

In [13]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

In [14]:
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0,  device='cpu'):
    '''Method to generate text with GPT-2 '''
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    
    with torch.no_grad():
        for _ in trange(length):
            inputs = {'input_ids': generated}
            
            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / temperature
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return generated

In [15]:
# Generate 
out = sample_sequence(model=model,length=max_len,context=context_tokens,num_samples=1,temperature=temperature,top_k=top_k,top_p= top_p,device=device)

100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


In [16]:
# Paraser result 
out = out[0, len(context_tokens):].tolist()
text = tokenizer.decode(out, clean_up_tokenization_spaces=True)

In [17]:
text

' It has a headquarters in Cupertino, California, with more than 11,000 employees.\n\nMore about Apple\n\nMore from AOL:'