https://github.com/rinnakk/japanese-pretrained-models

In [1]:
import torch
from transformers import T5Tokenizer, RobertaForMaskedLM

# load tokenizer
tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-roberta-base")
tokenizer.do_lower_case = True  # due to some bug of tokenizer config loading

# load model
model = RobertaForMaskedLM.from_pretrained("rinna/japanese-roberta-base")
model = model.eval()

# original text
text = "4年に1度オリンピックは開かれる。"

# prepend [CLS]
text = "[CLS]" + text

# tokenize
tokens = tokenizer.tokenize(text)
print(tokens)  # output: ['[CLS]', '▁4', '年に', '1', '度', 'オリンピック', 'は', '開かれる', '。']']

# mask a token
masked_idx = 5
tokens[masked_idx] = tokenizer.mask_token
print(tokens)  # output: ['[CLS]', '▁4', '年に', '1', '度', '[MASK]', 'は', '開かれる', '。']

# convert to ids
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)  # output: [4, 1602, 44, 24, 368, 6, 11, 21583, 8]

# convert to tensor
token_tensor = torch.tensor([token_ids])

# get the top 10 predictions of the masked token
with torch.no_grad():
    outputs = model(token_tensor)
    predictions = outputs[0][0, masked_idx].topk(10)

for i, index_t in enumerate(predictions.indices):
    index = index_t.item()
    token = tokenizer.convert_ids_to_tokens([index])[0]
    print(i, token)

Downloading: 100%|██████████| 806k/806k [00:00<00:00, 15.2MB/s]
Downloading: 100%|██████████| 153/153 [00:00<00:00, 37.4kB/s]
Downloading: 100%|██████████| 259/259 [00:00<00:00, 263kB/s]
Downloading: 100%|██████████| 663/663 [00:00<00:00, 332kB/s]
Downloading: 100%|██████████| 443M/443M [00:10<00:00, 43.4MB/s]


['[CLS]', '▁4', '年に', '1', '度', 'オリンピック', 'は', '開かれる', '。']
['[CLS]', '▁4', '年に', '1', '度', '[MASK]', 'は', '開かれる', '。']
[4, 1602, 44, 24, 368, 6, 11, 21583, 8]
0 ワールドカップ
1 フェスティバル
2 オリンピック
3 サミット
4 東京オリンピック
5 総会
6 全国大会
7 イベント
8 世界選手権
9 パーティー


In [11]:
from transformers import T5Tokenizer, AutoModelForCausalLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-medium")
tokenizer.do_lower_case = True  # due to some bug of tokenizer config loading

model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium")

In [12]:
inputs=tokenizer("こんにちは，世界！")
print(inputs)


{'input_ids': [9, 30442, 11, 83, 301, 543, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [13]:
outputs = model.generate(input_ids=torch.tensor([inputs["input_ids"]]))
text = tokenizer.decode(
    outputs[0].tolist(),
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


こんにちは,世界!ニッポンの達人です。 今回は,「日本は
