In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Image
# Raise the figure resolution; matplotlib's default figure.dpi is 100.
mpl.rcParams['figure.dpi'] = 150
# Pick the device string: 'cuda' when torch reports CUDA is available, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression so the chosen device is shown as the cell output.
device

'cpu'

## 1. summary
- transformer-based language model
    - 目前最核心的一个能力：text generation，尤其对于 gpt 而言；
- openai gpt2
    - https://openai.com/research/better-language-models
- 预训练之后，被 prompt（context）激活的广泛的能力

In [None]:
# Display the image file pretrain.png, referenced through a relative path.
Image('../../image/pretrain.png')

## 1.1 pretrained model
- on english language
- causal language modeling (CLM) objective (多分类问题)
- Language Models are Unsupervised Multitask Learners
    - https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf


## decoding strategies

- converting the model’s probabilistic output（vocab size classification） to text
    - iteratively，意味着更多的计算量
    - quality & diversity
- greedy search decoding：搜狗输入法，每次都用top1的候选
- beam search decoding
- sampling methods
- top-k & nucleus sampling

- (autoregressive or causal) language model
- $x = x_1, x_2, \ldots, x_k$，$y = y_1, y_2, \ldots, y_N$
- chain rule of probability to factorize it as a product of conditional probabilities
    - $P(y|x) = P(y_1, y_2, \ldots, y_N | x) = \prod_{t=1}^{N} P(y_t | y_{<t}, x)$
- 单向的，从左至右的，（BERT 的 B 表示的含义就是 bidirectional）
- 具体解码过程：
- $p(y_t = w_i | y_{<t}, x) = \mathrm{softmax}(z_{t,i})$
- $\hat{y} = \arg\max_{y} P(y|x)$

## 2. gpt2


|model|参数量|hidden dim|block 数量|
|-|-|-|-|
|gpt2|124M|768 (64×12)|12|
|gpt2-medium|355M|1024 (64×16)|24|
|gpt2-large|774M|1280 (64×20)|36|
|gpt2-xl|1.56B|1600 (64×25)|48|

In [None]:
def get_params(model):
    """Count the parameters of ``model``.

    Adds up ``numel()`` for every tensor yielded by
    ``model.named_parameters()``.

    Args:
        model: Object exposing ``named_parameters()``, which yields
            ``(name, tensor)`` pairs.

    Returns:
        The summed element count over all yielded tensors (``0`` when
        nothing is yielded).
    """
    return sum(weights.numel() for _name, weights in model.named_parameters())

In [None]:
# Checkpoint to inspect. AutoConfig / AutoTokenizer / AutoModel are already
# imported in the first cell, so they are not re-imported here.
model_name = 'gpt2-xl'

# Load the configuration, tokenizer and model weights for the chosen checkpoint.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Total parameter count of the loaded model, shown as the cell output.
get_params(model)

In [None]:
# Bare expression: show the loaded model's repr as the cell output.
model

### 2.1 tokenizer

In [None]:
# Bare expression: show the config object loaded via AutoConfig for model_name.
config

In [None]:
# Special tokens defined by the tokenizer (plain mapping, then the extended form).
print(tokenizer.special_tokens_map)
print(tokenizer.special_tokens_map_extended)

# Encode the '<|endoftext|>' string and decode id 50256
# (presumably the same token -- confirm against the printed output).
print(tokenizer.encode('<|endoftext|>'))
print(tokenizer.decode(50256))

# Encode a string of two spaces and decode id 220
# (presumably the space token -- confirm against the printed output).
print(tokenizer.encode('  '))
print(tokenizer.decode(220))

# Case sensitivity: compare the ids of 'Hello', 'hello' and ' hello',
# then decode ids 23748 and 15496 back to text.
print(tokenizer.encode('Hello'))
print(tokenizer.encode('hello'))
print(tokenizer.encode(' hello'))
print(tokenizer.decode(23748))
print(tokenizer.decode(15496))
# Leading whitespace: encode 'hello' preceded by one, two and three spaces.
print(tokenizer.encode(' hello'))
print(tokenizer.encode('  hello'))
print(tokenizer.encode('   hello'))

|Token ID|	String|
|-|-|
|39177|	ItemThumbnailImage|
|30210|	guiActiveUnfocused|
|39755|	isSpecialOrderable|
|31576|	externalActionCode|
|39753|	quickShipAvailable|
|39757|	channelAvailability|
|36174|	RandomRedditorWithNo|
|30899|	cloneembedreportprint|
|40242|	BuyableInstoreAndOnline|
|30906|	rawdownloadcloneembedreportprint|

In [None]:
# Decode id 39177, which the table above lists as 'ItemThumbnailImage'.
print(tokenizer.decode(39177))

# Encode that same string back into ids.
print(tokenizer.encode('ItemThumbnailImage'))

# Encode the word 'chartreuse' as well.
print(tokenizer.encode('chartreuse'))

### 2.2 attention mask
- 结构化，批次化的tensor

In [None]:
from transformers import GPT2LMHeadModel,GPT2Tokenizer

# Load the tokenizer and the LM-head model for the 'gpt2' checkpoint.
# Note: this rebinds `tokenizer`, which an earlier cell bound to the
# tokenizer loaded for model_name.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize a single prompt with return_tensors='pt' and display the result.
context = tokenizer('It will rain in the', return_tensors='pt')
context
# Generation step, currently disabled:
# prediction = gpt2.generate(**context, max_length=10)
# tokenizer.decode(prediction[0])

In [None]:
# Padding side: right padding is active; the left-padding alternative is kept
# commented out for comparison.
# tokenizer.padding_side = "left"
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Three prompts tokenized together as one batch with padding=True.
sentences = ["It will rain in the",
            "I want to eat a big bowl of",
            "My dog is"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
# Print the batch's token ids and the matching attention mask.
print(inputs.input_ids)
print(inputs.attention_mask)
# Batched generation, currently disabled.
# NOTE(review): if this is re-enabled, check whether padding_side should be
# "left" -- Hugging Face's generation docs recommend left padding for
# decoder-only models; confirm before relying on the outputs.
# output_sequences = gpt2.generate(**inputs)

# for seq in output_sequences:
#     print(tokenizer.decode(seq))

## forward
- GPT2Model
    - wte: word token embedding
    - wpe: word position embedding
- LMHead
    - linear (lm_head): hidden_state => vocab_size