# 第9章: 事前学習済み言語モデル（BERT型）

## 80. トークン化

In [4]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.tokenize("The movie was full of incomprehensibilities.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['the',
 'movie',
 'was',
 'full',
 'of',
 'inc',
 '##omp',
 '##re',
 '##hen',
 '##si',
 '##bilities',
 '.']

## 81. マスクの予測

In [5]:
from transformers import pipeline

fill_mask = pipeline("fill-mask", model=model_name)
masked_text = "The movie was full of [MASK]."
outputs = fill_mask(masked_text)
print(outputs[0])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


{'score': 0.10711909830570221, 'token': 4569, 'token_str': 'fun', 'sequence': 'the movie was full of fun.'}


## 82. マスクのtop-k予測

In [6]:
import pandas as pd

fill_mask = pipeline("fill-mask", model=model_name, top_k=10)
masked_text = "The movie was full of [MASK]."
outputs = fill_mask(masked_text)
display(pd.DataFrame(outputs))

# 上位10個  https://kazuhira-r.hatenablog.com/entry/2024/01/03/221331

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Unnamed: 0,score,token,token_str,sequence
0,0.107119,4569,fun,the movie was full of fun.
1,0.066345,20096,surprises,the movie was full of surprises.
2,0.044684,3689,drama,the movie was full of drama.
3,0.027217,3340,stars,the movie was full of stars.
4,0.025413,11680,laughs,the movie was full of laughs.
5,0.019517,2895,action,the movie was full of action.
6,0.019038,8277,excitement,the movie was full of excitement.
7,0.01829,2111,people,the movie was full of people.
8,0.015031,6980,tension,the movie was full of tension.
9,0.014646,2189,music,the movie was full of music.


## 83. CLSトークンによる文ベクトル

In [27]:
from transformers import AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

texts = [
  "The movie was full of fun.",
  "The movie was full of excitement.",
  "The movie was full of crap.",
  "The movie was full of rubbish."
]

# 文のエンコード化
model = AutoModel.from_pretrained(model_name)
encoded_texts = tokenizer.batch_encode_plus(texts, padding=True, add_special_tokens=True)

# 文ベクトル化
input_ids = torch.tensor(encoded_texts["input_ids"])
outputs = model(input_ids)
sentencevec = outputs[0][:,0,:]

# コサイン類似度
cs_array = cosine_similarity(sentencevec.detach().cpu().numpy(), sentencevec.detach().cpu().numpy())
print(cs_array)

# CLSトークンによる文ベクトル https://qiita.com/ichiroex/items/6e305a5d5bed7d715c2f
# コサイン類似度              https://analysis-navi.com/?p=688

[[0.99999976 0.9880607  0.9557659  0.9475323 ]
 [0.9880607  1.0000001  0.9541274  0.9486636 ]
 [0.9557659  0.9541274  0.9999999  0.98069316]
 [0.9475323  0.9486636  0.98069316 1.        ]]


## 84. 平均による文ベクトル

In [35]:
# 文ベクトル化
input_ids = torch.tensor(encoded_texts["input_ids"])
outputs = model(input_ids)
att_mask = torch.tensor(encoded_texts["attention_mask"])
att_mask = att_mask.unsqueeze(-1)
sentencevec = (outputs[0] * att_mask).sum(dim=1) / att_mask.sum(dim=1)

# コサイン類似度
cs_array = cosine_similarity(sentencevec.detach().cpu().numpy(), sentencevec.detach().cpu().numpy())
print(cs_array)

# 平均による文ベクトル  https://qiita.com/anyai_corp/items/1d66feea6102c28dd077

[[1.0000004  0.95681155 0.84899944 0.8168843 ]
 [0.95681155 0.99999976 0.8351835  0.7938445 ]
 [0.84899944 0.8351835  1.         0.9225537 ]
 [0.8168843  0.7938445  0.9225537  1.        ]]


## 85. データセットの準備

In [36]:
!wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip -P data/
!unzip -o data/SST-2.zip -d data/
!rm data/SST-2.zip

--2025-04-24 15:27:26--  https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.161.213.24, 3.161.213.42, 3.161.213.84, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.161.213.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7439277 (7.1M) [application/zip]
Saving to: ‘data/SST-2.zip’


2025-04-24 15:27:27 (13.7 MB/s) - ‘data/SST-2.zip’ saved [7439277/7439277]

Archive:  data/SST-2.zip
   creating: data/SST-2/
  inflating: data/SST-2/dev.tsv      
   creating: data/SST-2/original/
  inflating: data/SST-2/original/README.txt  
  inflating: data/SST-2/original/SOStr.txt  
  inflating: data/SST-2/original/STree.txt  
  inflating: data/SST-2/original/datasetSentences.txt  
  inflating: data/SST-2/original/datasetSplit.txt  
  inflating: data/SST-2/original/dictionary.txt  
  inflating: data/SST-2/original/original_rt_snippets.txt  
  inflating: data/SST-2/original/sentiment_labels.txt

In [37]:
import csv
import collections
import torch

def make_dict(file_name):
  dictionary = []
  with open(file_name, 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader)
    for row in reader:
      tokens = tokenizer.tokenize(row[0])
      dictionary.append({'text': row[0], 'label': row[1], 'tokens': tokens})
  return dictionary

train_dict =  make_dict('./data/SST-2/train.tsv')
dev_dict = make_dict('./data/SST-2/dev.tsv')

print('学習データのリスト：')
for i in range(5):
  print(train_dict[i])
print('')

print('検証データのリスト：')
for i in range(5):
  print(dev_dict[i])

学習データのリスト：
{'text': 'hide new secretions from the parental units ', 'label': '0', 'tokens': ['hide', 'new', 'secret', '##ions', 'from', 'the', 'parental', 'units']}
{'text': 'contains no wit , only labored gags ', 'label': '0', 'tokens': ['contains', 'no', 'wit', ',', 'only', 'labor', '##ed', 'gag', '##s']}
{'text': 'that loves its characters and communicates something rather beautiful about human nature ', 'label': '1', 'tokens': ['that', 'loves', 'its', 'characters', 'and', 'communicate', '##s', 'something', 'rather', 'beautiful', 'about', 'human', 'nature']}
{'text': 'remains utterly satisfied to remain the same throughout ', 'label': '0', 'tokens': ['remains', 'utterly', 'satisfied', 'to', 'remain', 'the', 'same', 'throughout']}
{'text': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', 'label': '0', 'tokens': ['on', 'the', 'worst', 'revenge', '-', 'of', '-', 'the', '-', 'ne', '##rds', 'cl', '##iche', '##s', 'the', 'filmmakers', 'could', 'dr', '##edge', '

## 86. ミニバッチの作成

In [38]:
import torch.nn as nn

def collate(data_dict):
  data_dict.sort(key=lambda x: len(x['tokens']), reverse=True)
  tokens_list = [data['tokens'] for data in data_dict]
  label_list = [data['label'] for data in data_dict]
  padded_tokens = nn.utils.rnn.pad_sequence(tokens_list, batch_first=True, padding_value=0)
  padded_data = {
    'tokens': padded_tokens,
    'label': torch.stack(label_list)
  }
  return padded_data

padded_data = collate(train_dict[0:4])
print(padded_data)

TypeError: expected Tensor as element 0 in argument 0, but got list

## 87. ファインチューニング

## 88. 極性分析

## 89. アーキテクチャの変更