# 第10章: 事前学習済み言語モデル（GPT型）

## 90. 次単語予測

In [69]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch

# Fix the random seed so the sampling-based generation later in the notebook
# is reproducible.
set_seed(42)
text = "The movie was full of"
model_name = 'gpt2'
# Load the tokenizer and the model from the same checkpoint name so that
# changing `model_name` can never pair a tokenizer with a different model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Encode the prompt as a (1, seq_len) tensor of token ids.
input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
print("input_ids:", input_ids)

# Run a forward pass without gradient tracking and keep only the logits at
# the last prompt position, i.e. the scores for the next token.
with torch.no_grad():
  output = model(input_ids)
  next_token_logits = output.logits[0, -1, :]

# Turn the logits into a probability distribution over the vocabulary.
scores = torch.softmax(next_token_logits, dim=-1)

# Print the 10 most probable next tokens together with their probabilities.
topk = 10
topk_scores, topk_ids = torch.topk(scores, topk)
# Convert to plain Python scalars once instead of handling 0-dim tensors
# inside the loop.
for topk_score, topk_id in zip(topk_scores.tolist(), topk_ids.tolist()):
  pred_token = tokenizer.decode([topk_id])
  print(f'{pred_token}: {topk_score:.4f}')

# How to use GPT-style models: https://qiita.com/suzuki_sh/items/acf276b55085647bdd75
# CausalLMOutput reference:    https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.CausalLMOutput

input_ids: tensor([[ 464, 3807,  373, 1336,  286]])
 jokes: 0.0219
 great: 0.0186
 laughs: 0.0115
 bad: 0.0109
 surprises: 0.0107
 references: 0.0105
 fun: 0.0100
 humor: 0.0074
 ": 0.0074
 the: 0.0067


## 91. 続きのテキストの予測

In [70]:
# Sampling settings swept in lockstep: temperature 0.2 .. 1.0 paired with
# top-k 10 .. 50 (five settings in total).
temp_list = [0.2 * step for step in range(1, 6)]
topk_list = list(range(10, 60, 10))

for temp, topk in zip(temp_list, topk_list):
  # Keyword arguments shared by every call; only temperature/top_k vary.
  sampling_kwargs = dict(
    do_sample=True,
    temperature=temp,
    top_k=topk,
    pad_token_id=tokenizer.eos_token_id,
  )
  with torch.no_grad():
    output_ids = model.generate(input_ids, **sampling_kwargs)
  # A single prompt was given, so decode the first (only) returned sequence.
  preds = tokenizer.decode(output_ids[0].tolist())
  print(f'temp={temp:.1f}, topk={topk}: {preds}')

# GPT temperature parameter: https://qiita.com/suzuki_sh/items/8e449d231bb2f09a510c

temp=0.2, topk=10: The movie was full of great moments, but it was also full of bad moments.

The
temp=0.4, topk=20: The movie was full of good, bad, and ugly jokes. I was so happy with this movie
temp=0.6, topk=30: The movie was full of great characters and all of the great roles that were played by great actresses.
temp=0.8, topk=40: The movie was full of jokes, but it did have some serious undertones of sexism and racism.
temp=1.0, topk=50: The movie was full of cringe. I didn't even get a reaction from any of the movie's


## 92. 予測されたテキストの確率を計算

In [71]:
# Generate a continuation, then re-score the full sequence to read off the
# probability the model assigned to each generated token.
prompt_len = input_ids.shape[1]
with torch.no_grad():
  output_ids = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)
  output = model(output_ids)

# Tokens that follow the prompt.
generated_tokens_ids = output_ids[0, prompt_len:]
# The logits at position p score the token at position p + 1, so the slice
# starts one position before the first generated token.
next_text_logits = output.logits[0, prompt_len - 1:, :]

# Probability of each generated token under the model.
scores = torch.softmax(next_text_logits, dim=-1)
# zip stops at the last generated token, so the final logits row (which would
# score a token beyond the sequence) is ignored.
for step_probs, token_id in zip(scores, generated_tokens_ids):
  print(f'{tokenizer.decode([token_id])}: {step_probs[token_id]:.4f}')

 jokes: 0.0219
 and: 0.2892
 jokes: 0.0985
 about: 0.2056
 how: 0.0997
 the: 0.0846
 movie: 0.0364
 was: 0.2963
 a: 0.0677
 joke: 0.1735
.: 0.2804
 It: 0.1230
 was: 0.5197
 a: 0.1493
 joke: 0.2690


## 93. パープレキシティ

In [72]:
texts = [
  "The movie was full of surprises",
  "The movies were full of surprises",
  "The movie were full of surprises",
  "The movies was full of surprises"
]

# Reuse EOS as the padding token so the sentences can be padded into one batch
# (presumably the tokenizer has no pad token of its own; verify if the
# checkpoint changes).
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors='pt', padding=True)
with torch.no_grad():
  # `labels` is intentionally not passed: the model's built-in loss is never
  # read, because the per-sentence loss is computed explicitly below.
  outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Perplexity computation.
# Align predictions with targets: logits at position p score the token at p + 1.
shift_logits = outputs.logits[:, :-1, :]
shift_labels = inputs['input_ids'][:, 1:]
# Mask aligned with the targets; padded target positions get weight 0.
shift_mask = inputs['attention_mask'][:, 1:]

loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
# CrossEntropyLoss takes the class dimension second, so pass logits as
# (batch, vocab, seq) against targets of shape (batch, seq); the result is a
# per-token loss of shape (batch, seq).
token_loss = loss_fn(shift_logits.transpose(1, 2), shift_labels)
# Mean negative log-likelihood per sentence over its non-padded targets.
loss = (token_loss * shift_mask).sum(dim=1) / shift_mask.sum(dim=1)
ppl = torch.exp(loss).tolist()

for sentence, sentence_ppl in zip(texts, ppl):
  print(f'{sentence}: {sentence_ppl:.4f}')

# Perplexity computation reference: https://gotutiyan.hatenablog.com/entry/2022/02/23/133414

The movie was full of surprises: 99.3548
The movies were full of surprises: 126.4808
The movie were full of surprises: 278.8784
The movies was full of surprises: 274.6573


## 94. チャットテンプレート

## 95. マルチターンのチャット

## 96. プロンプトによる感情分析

## 97. 埋め込みに基づく感情分析

## 98. ファインチューニング

## 99. 選好チューニング