<a href="https://colab.research.google.com/github/learn2Pro/rl_learning/blob/master/llm/gpt/gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Image
# Render figures at 150 dpi (matplotlib's default is 100).
mpl.rcParams.update({'figure.dpi': 150})
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

## 1. summary
- transformer-based language model
    - 目前最核心的一个能力：text generation，尤其对于 gpt 而言；
- openai gpt2
    - https://openai.com/research/better-language-models
- 预训练之后，被 prompt（context）激活的广泛的能力

## 1.1 pretrained model
- on english language
- causal language modeling (CLM) objective (多分类问题)
- Language Models are Unsupervised Multitask Learners
    - https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf


## decoding strategies

- converting the model’s probabilistic output（vocab size classification） to text
    - iteratively，意味着更多的计算量
    - quality & diversity
- greedy search decoding：搜狗输入法，每次都用top1的候选
- beam search decoding
- sampling methods
- top-k & nucleus sampling

- (autoregressive or causal) language model
- $x = x_1,x_2,...,x_n$, $y = y_1,y_2,...,y_n$
- chain rule of probability to factorize it as a product of conditional probabilities
    - $P(y|x) = P(y_1,y_2,...,y_n|x) = \prod_{t=1}^{n} P(y_t|y_{<t},x)$
- 单向的，从左至右的，（BERT 的 B 表示的含义就是 bidirectional）
- 具体解码过程：
- $p(y_t=w_i|y_{<t},x) = \mathrm{softmax}(z_{t,i})$
- $\hat{y} = \arg\max_y P(y|x)$

## gpt2



|model|参数量|hidden dim|block 数量|
|-|-|-|-|
|gpt2|124M|768 (64*12)|12|
|gpt2-medium|355M|1024 (64*16)|24|
|gpt2-large|774M|1280 (64*20)|36|
|gpt2-xl|1.56B|1600 (64*25)|48|

In [6]:
def get_params(model):
    """Count every scalar held by the parameters of ``model``.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs whose tensors provide ``numel()``.

    Returns:
        The sum of ``numel()`` over all yielded tensors (0 if there are none).
    """
    return sum(param.numel() for _, param in model.named_parameters())

In [7]:
from transformers import AutoConfig,AutoModel,AutoTokenizer
from transformers import AutoModelForCausalLM
# Checkpoint shared by every loader in this cell.
model_name = 'gpt2-large'

config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Bare transformer backbone vs. the same checkpoint loaded with a causal-LM head.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model_cls = AutoModelForCausalLM.from_pretrained(model_name)
model_cls = model_cls.to(device)

# Report the parameter count of each variant, backbone first.
for loaded in (model, model_cls):
    print(get_params(loaded))

774030080
774030080


In [8]:
# Show the module tree of the bare backbone loaded via AutoModel (no LM head).
model

GPT2Model(
  (wte): Embedding(50257, 1280)
  (wpe): Embedding(1024, 1280)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-35): 36 x GPT2Block(
      (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)

In [9]:
# Show the causal-LM variant: the same backbone wrapped together with an lm_head projection.
model_cls

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

## 2.1 tokenizer

In [10]:
# Special tokens: a single marker serves as bos, eos and unk.
print(tokenizer.special_tokens_map)
print(tokenizer.special_tokens_map_extended)

# Round-trip the end-of-text marker through encode/decode.
print(tokenizer.encode('<|endoftext|>'))
print(tokenizer.decode(50256))

# Whitespace on its own is tokenized as well.
print(tokenizer.encode('  '))
print(tokenizer.decode(220))

# Encoding is case sensitive, and a leading space yields a different token id.
for text in ('Hello', 'hello', ' hello'):
    print(tokenizer.encode(text))
for token_id in (23748, 15496):
    print(tokenizer.decode(token_id))
# Compare one, two and three leading spaces in front of the same word.
for text in (' hello', '  hello', '   hello'):
    print(tokenizer.encode(text))

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}
{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}
[50256]
<|endoftext|>
[220, 220]
 
[15496]
[31373]
[23748]
 hello
Hello
[23748]
[220, 23748]
[220, 220, 23748]


|Token ID|	String|
|-|-|
|39177|	ItemThumbnailImage|
|30210|	guiActiveUnfocused|
|39755|	isSpecialOrderable|
|31576|	externalActionCode|
|39753|	quickShipAvailable|
|39757|	channelAvailability|
|36174|	RandomRedditorWithNo|
|30899|	cloneembedreportprint|
|40242|	BuyableInstoreAndOnline|
|30906|	rawdownloadcloneembedreportprint|

In [11]:
from transformers import GPT2LMHeadModel,GPT2Tokenizer

# Load the base 'gpt2' checkpoint; note that this rebinds the notebook-level `tokenizer`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize one prompt into PyTorch tensors and display the resulting
# input_ids / attention_mask pair.
context = tokenizer('It will rain in the', return_tensors='pt')
context
# Optional: generate a continuation of the prompt and decode it (left disabled).
# prediction = gpt2.generate(**context, max_length=10)
# tokenizer.decode(prediction[0])

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

{'input_ids': tensor([[1026,  481, 6290,  287,  262]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [13]:
# Pad on the left so every prompt's last real token lands in the final column
# of the batch (visible in the printed input_ids).
tokenizer.padding_side = "left"
# tokenizer.padding_side = "right"
# The special-token map printed earlier lists no pad token, so reuse the
# end-of-text token for padding.
tokenizer.pad_token = tokenizer.eos_token

sentences = ["It will rain in the",
            "I want to eat a big bowl of",
            "My dog is"]
# padding=True pads each prompt up to the longest one in the batch.
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
# Padded positions appear as id 50256 with attention_mask 0.
print(inputs.input_ids)
print(inputs.attention_mask)
# Optional: batched generation over the padded prompts (left disabled).
# output_sequences = gpt2.generate(**inputs)

# for seq in output_sequences:
#     print(tokenizer.decode(seq))

tensor([[50256, 50256, 50256,  1026,   481,  6290,   287,   262],
        [   40,   765,   284,  4483,   257,  1263,  9396,   286],
        [50256, 50256, 50256, 50256, 50256,  3666,  3290,   318]])
tensor([[0, 0, 0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1]])


## 3 forward
- GPT2Model
  - wte: word token embedding
  - wpe: word position embedding
- LMHead
  - Linear (no bias): hidden_size => vocab size

In [17]:
# Tokenize a single, unpadded prompt into PyTorch tensors and display the encoding.
input_text = "A long long time ago"
model_inputs = tokenizer(input_text, return_tensors='pt')
model_inputs

{'input_ids': tensor([[  32,  890,  890,  640, 2084]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [21]:
# Move both tokenizer outputs to the model's device. Tensor.to() does not move
# a tensor in place -- it returns the result -- so the return value must be kept.
input_ids = model_inputs['input_ids'].to(device)
attention_mask = model_inputs['attention_mask'].to(device)
# Pass the mask explicitly: it is all ones for this single unpadded prompt, but
# forgetting it would silently attend to padding once padded batches are used.
output = model_cls(input_ids=input_ids, attention_mask=attention_mask)
output

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ 1.6154,  2.7667,  0.7119,  ..., -7.2549, -5.9910,  0.4480],
         [ 1.3235,  1.4639, -3.3896,  ..., -6.4218, -2.7806, -1.6063],
         [ 2.6349,  2.4169, -1.4526,  ..., -7.6764, -2.2451, -0.3095],
         [ 4.8530,  4.0522, -2.4927,  ..., -6.4525, -4.7340,  1.2123],
         [ 3.9016,  4.9969, -1.3059,  ..., -7.5989, -5.0314,  2.0330]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.4480,  0.3063,  0.6135,  ...,  0.2141, -0.9180,  1.0187],
          [-0.4511, -0.0476, -0.3777,  ...,  0.5957, -0.5705,  0.8082],
          [-0.4955, -0.0556, -0.5869,  ...,  0.0962, -0.6235,  0.6053],
          [-0.1995, -0.1685, -0.5340,  ...,  0.3959, -0.2088,  0.7400],
          [-0.4999, -1.0959, -0.4615,  ...,  0.0572, -0.5673, -0.6184]],

         [[-0.3943,  0.6006, -0.2194,  ..., -1.3892,  0.0673, -0.3629],
          [-0.4305,  0.4791,  0.5663,  ..., -0.1362,  0.3677,  0.5704],
          [

In [23]:
# Logits are (batch, sequence length, vocabulary size): one score per vocabulary entry at each position.
output.logits.shape

torch.Size([1, 5, 50257])

### 3.1 model.transformer()

In [26]:
# Switch to inference mode (turns the Dropout layers off) before reading hidden states.
model_cls.eval()
# Run only the transformer backbone, bypassing lm_head, and show its final hidden states.
model_cls.transformer(input_ids).last_hidden_state

tensor([[[-0.0085,  0.0312, -0.2349,  ...,  0.5693,  0.4657, -0.2162],
         [-0.1532, -0.1076,  0.6295,  ..., -1.0053,  0.8251, -0.0173],
         [ 0.0575, -0.4767,  0.6039,  ..., -0.7278,  0.8265, -0.1439],
         [-0.2359, -0.1584,  1.0381,  ..., -0.2890,  0.0971,  0.4442],
         [-0.7065,  0.3486,  0.2827,  ..., -0.0195,  0.5571, -0.8061]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [27]:
# Hidden states are (batch, sequence length, hidden dim); the hidden dim is 1280 for gpt2-large.
model_cls.transformer(input_ids).last_hidden_state.shape

torch.Size([1, 5, 1280])