# 第10章: 事前学習済み言語モデル（GPT型）

## 90. 次単語予測

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch

set_seed(42)
text = "The movie was full of"
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights through `model_name` too, so tokenizer and model always come from the same checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name)

input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
print("input_ids:", input_ids)

# Logits for the token that follows the prompt: batch item 0, last sequence position
with torch.no_grad():
  output = model(input_ids)
  next_token_logits = output.logits[0, -1, :]

# Turn the logits into a probability distribution over the vocabulary
scores = torch.softmax(next_token_logits, dim=-1)

# Print the 10 most probable next tokens together with their probabilities
topk = 10
topk_scores, topk_ids = torch.topk(scores, topk)
for topk_score, topk_id in zip(topk_scores, topk_ids):
  pred_token = tokenizer.decode([topk_id])
  print(f'{pred_token}: {topk_score:.4f}')

# GPT型の使い方    https://qiita.com/suzuki_sh/items/acf276b55085647bdd75
# CausalLMOutput  https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.CausalLMOutput

  from .autonotebook import tqdm as notebook_tqdm


input_ids: tensor([[ 464, 3807,  373, 1336,  286]])
 jokes: 0.0219
 great: 0.0186
 laughs: 0.0115
 bad: 0.0109
 surprises: 0.0107
 references: 0.0105
 fun: 0.0100
 humor: 0.0074
 ": 0.0074
 the: 0.0067


## 91. 続きのテキストの予測

In [3]:
# Sampling settings swept together: temperature 0.2..1.0 paired with top-k 10..50
temp_list = [t * 0.2 for t in range(1, 6)]
topk_list = [k * 10 for k in range(1, 6)]

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing the mask explicitly avoids the "attention mask is not set" warning that
# generate() emits when the pad token id equals the eos token id.
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
  for temp, topk in zip(temp_list, topk_list):
    output_ids = model.generate(
      input_ids,
      attention_mask=attention_mask,
      do_sample=True,
      temperature=temp,
      top_k=topk,
      pad_token_id=tokenizer.eos_token_id
    )
    # Batch size is 1: decode the only returned sequence (prompt + continuation)
    preds = tokenizer.decode(output_ids[0])
    print(f'temp={temp:.1f}, topk={topk}: {preds}')

# gptのtemperature  https://qiita.com/suzuki_sh/items/8e449d231bb2f09a510c

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


temp=0.2, topk=10: The movie was full of great moments, but the most memorable was when the villain, the villainous villain, was killed by
temp=0.4, topk=20: The movie was full of laughs, but I was disappointed that it didn't get a lot of attention. The movie was a
temp=0.6, topk=30: The movie was full of a lot of weird, bizarre and terrible things to say, and I think that's why I love
temp=0.8, topk=40: The movie was full of jokes.

"I was playing in the bathtub, and I felt a little nervous,"
temp=1.0, topk=50: The movie was full of strong performances and a big smile.

When I saw the synopsis for this movie, I expected


## 92. 予測されたテキストの確率を計算

In [4]:
# Generate a continuation of the prompt, then re-score the whole sequence in one forward pass
prompt_len = input_ids.shape[1]
with torch.no_grad():
  output_ids = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)
  output = model(output_ids)

# Tokens appended after the prompt
generated_tokens_ids = output_ids[0, prompt_len:]
# Logits at position t score the token at t+1, so start one step before the first generated token
next_text_logits = output.logits[0, prompt_len - 1:, :]

# Probability the model assigned to each generated token
scores = torch.softmax(next_text_logits, dim=-1)
# zip() stops at the last generated token, ignoring the trailing row that scores a token never produced
for step_probs, token_id in zip(scores, generated_tokens_ids):
  print(f'{tokenizer.decode([token_id])}: {step_probs[token_id]:.4f}')

 jokes: 0.0219
 and: 0.2892
 jokes: 0.0985
 about: 0.2056
 how: 0.0997
 the: 0.0846
 movie: 0.0364
 was: 0.2963
 a: 0.0677
 joke: 0.1735
.: 0.2804
 It: 0.1230
 was: 0.5197
 a: 0.1493
 joke: 0.2690
 about: 0.4242
 how: 0.1742
 the: 0.1236
 movie: 0.6161
 was: 0.6350


## 93. パープレキシティ

In [5]:
texts = [
  "The movie was full of surprises",
  "The movies were full of surprises",
  "The movie were full of surprises",
  "The movies was full of surprises"
]

# Reuse EOS as the padding token so the sentences can be batched; padded positions are masked out below
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors='pt', padding=True)
with torch.no_grad():
  # `labels=` is intentionally not passed: outputs.loss was never read, the per-sentence loss is computed below
  outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Perplexity per sentence = exp(mean negative log-likelihood over the non-padded target tokens)
shift_logits = outputs.logits[:, :-1, :]      # logits at position t predict the token at t+1
shift_labels = inputs['input_ids'][:, 1:]     # targets: the input shifted left by one
shift_mask = inputs['attention_mask'][:, 1:]  # 1 for real target tokens, 0 for padding
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
# CrossEntropyLoss wants the class dimension second: (batch, vocab, seq) -> per-token loss of shape (batch, seq)
token_loss = loss_fn(shift_logits.transpose(1, 2), shift_labels)
loss = (token_loss * shift_mask).sum(dim=1) / shift_mask.sum(dim=1)
ppl = torch.exp(loss).tolist()

for sentence, sentence_ppl in zip(texts, ppl):
  print(f'{sentence}: {sentence_ppl:.4f}')

# パープレキシティの計算  https://gotutiyan.hatenablog.com/entry/2022/02/23/133414

The movie was full of surprises: 99.3538
The movies were full of surprises: 126.4832
The movie were full of surprises: 278.8803
The movies was full of surprises: 274.6648


## 94. チャットテンプレート

In [27]:
import os

# The Hugging Face access token is read from the environment rather than written into the notebook
token = os.environ["HUGGING_FACE_TOKEN"]

# From here on `tokenizer` and `model` are rebound to the instruction-tuned chat model
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=token)

# Conversation so far: a system instruction followed by the first user question
prompt = "What do you call a sweet eaten after dinner?"
system_turn = {"role": "system", "content": "You are a helpful assistant."}
user_turn = {"role": "user", "content": prompt}
messages = [system_turn, user_turn]

# Render the conversation as one prompt string that ends with the assistant generation prompt
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512, pad_token_id=128001)

# Each returned sequence starts with the prompt tokens; slice them off to keep only the reply
generated_ids = [
  full_seq[prompt_seq.shape[0]:]
  for prompt_seq, full_seq in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

A dessert is typically referred to as a sweet treat that is eaten after dinner.


## 95. マルチターンのチャット

In [28]:
# Second turn: store the assistant's previous answer in the history, then ask a follow-up that refers to it
prompt = "Please give me the plural form of the word with its spelling in reverse order."
messages.extend([
  {"role": "assistant", "content": response},
  {"role": "user", "content": prompt},
])

# Re-render the whole conversation and generate the next assistant reply
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512, pad_token_id=128001)

# Each returned sequence starts with the prompt tokens; slice them off to keep only the reply
generated_ids = [
  full_seq[prompt_seq.shape[0]:]
  for prompt_seq, full_seq in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

The plural form of "dessert" is "desserts". The word "desserts" spelled in reverse order is "sretsseD".


## 96. プロンプトによる感情分析

In [20]:
!wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip -P data/
!unzip -o data/SST-2.zip -d data/
!rm data/SST-2.zip

--2025-05-01 18:21:40--  https://dl.fbaipublicfiles.com/glue/data/SST-2.zip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 54.230.130.72, 54.230.130.97, 54.230.130.59, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|54.230.130.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7439277 (7.1M) [application/zip]
Saving to: ‘data/SST-2.zip’


2025-05-01 18:21:40 (35.7 MB/s) - ‘data/SST-2.zip’ saved [7439277/7439277]

Archive:  data/SST-2.zip
   creating: data/SST-2/
  inflating: data/SST-2/dev.tsv      
   creating: data/SST-2/original/
  inflating: data/SST-2/original/README.txt  
  inflating: data/SST-2/original/SOStr.txt  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



  inflating: data/SST-2/original/STree.txt  
  inflating: data/SST-2/original/datasetSentences.txt  
  inflating: data/SST-2/original/datasetSplit.txt  
  inflating: data/SST-2/original/dictionary.txt  
  inflating: data/SST-2/original/original_rt_snippets.txt  
  inflating: data/SST-2/original/sentiment_labels.txt  
  inflating: data/SST-2/test.tsv     
  inflating: data/SST-2/train.tsv    


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting aiohttp
  Downloading aiohttp-3.11.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 KB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hColle

In [39]:
import pandas as pd
import re

# Load the SST-2 dev split (tab-separated); the evaluation reads its `sentence` and `label` columns
file_name = './data/SST-2/dev.tsv'
df = pd.read_csv(filepath_or_buffer=file_name, delimiter='\t')

# Sentiment analysis for a single sentence
def sentiment_analysis(text, max_new_tokens=512):
  """Ask the chat model to label one sentence as positive (1) or negative (0).

  Uses the module-level `tokenizer` and `model`, so whichever chat model is
  currently bound to those names produces the answer.

  Args:
    text: The sentence to classify.
    max_new_tokens: Upper bound on the number of generated tokens. The default
      (512) is the value that used to be hard-coded; a much smaller budget is
      enough when only the single-digit label is wanted.

  Returns:
    The decoded reply with the prompt tokens and special tokens removed.
    The reply is free-form text, so the caller has to extract the 0/1 label.
  """
  instructions = """
    Please determine the positive and negative aspects of the text. 
    If it's positive, output 1, if negative, output 0.
    You can only output 0 or 1.
  """
  prompt = f"""
    Instructions: {instructions},
    Text: {text}
  """
  messages = [
    {"role": "system", "content": "You are a helpful assistant. You can only output 0 or 1."},
    {"role": "user", "content": prompt}
  ]

  # Keep the templated prompt under its own name instead of overwriting the `text` argument
  chat_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
  )
  model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)
  # pad_token_id is hard-coded (presumably a Llama 3 special-token id - TODO confirm)
  generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens, pad_token_id=128001)
  # Each returned sequence starts with the prompt tokens; drop them so only the reply is decoded
  reply_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
  ]
  return tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]

# Accuracy: a reply counts as correct when its first standalone 0/1 equals the gold label
label_pattern = re.compile(r"\b[01]\b")
correct = 0
for sentence, label in zip(df['sentence'], df['label']):
  response = sentiment_analysis(sentence)
  first_digit = label_pattern.search(response)
  # Replies that contain no standalone 0/1 are counted as wrong
  if first_digit is not None and int(first_digit.group()) == label:
    correct += 1
print(f"accuracy: {correct / len(df) * 100:.2f}%")

accuracy: 54.01%


## 97. 埋め込みに基づく感情分析

## 98. ファインチューニング

## 99. 選好チューニング