In [15]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, BertTokenizer, BertForMaskedLM, AlbertTokenizer
import torch

In [16]:
tokenizer = RobertaTokenizer.from_pretrained("/data/ganleilei/bert/roberta-large", add_prefix_space=True)
bert_tokenizer = BertTokenizer.from_pretrained("/data/ganleilei/bert/bert-base-uncased")
model = RobertaForMaskedLM.from_pretrained("/data/ganleilei/bert/roberta-large/")
bert_model = BertForMaskedLM.from_pretrained("/data/ganleilei/bert/bert-base-uncased")

Some weights of the model checkpoint at /data/ganleilei/bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
sent = "anarchism is a [MASK] philosophy that advocates self-governed societies based on voluntary institutions." 
sent_list = sent.strip().split()
print("len:", len(sent_list))
inputs = tokenizer(sent, return_tensors="pt")
bert_inputs = bert_tokenizer(sent, return_tensors="pt")
print("inputs:", inputs['input_ids'].size())
input_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
bert_input_tokens = bert_tokenizer.convert_ids_to_tokens(bert_inputs['input_ids'][0])
print("input tokens:", input_tokens)
print("bert input tokens:", bert_input_tokens)
input_str = tokenizer.convert_tokens_to_string(input_tokens)
print("input str:", input_str)

len: 13
inputs: torch.Size([1, 22])
input tokens: ['<s>', 'Ġanarchism', 'Ġis', 'Ġa', 'Ġ[', 'MAS', 'K', ']', 'Ġphilosophy', 'Ġthat', 'Ġadvocates', 'Ġself', '-', 'govern', 'ed', 'Ġsocieties', 'Ġbased', 'Ġon', 'Ġvoluntary', 'Ġinstitutions', '.', '</s>']
bert input tokens: ['[CLS]', 'ana', '##rch', '##ism', 'is', 'a', '[MASK]', 'philosophy', 'that', 'advocates', 'self', '-', 'governed', 'societies', 'based', 'on', 'voluntary', 'institutions', '.', '[SEP]']
input str: <s> anarchism is a [MASK] philosophy that advocates self-governed societies based on voluntary institutions.</s>


In [31]:
with torch.no_grad():
    logits = bert_model(**bert_inputs).logits

values, predictions= torch.topk(logits, k=10, dim=-1, largest=True)

In [32]:
pred_tokens = bert_tokenizer.convert_ids_to_tokens(predictions[0, 6, :])
print(pred_tokens)

['political', 'social', 'marxist', 'philosophical', 'feminist', 'moral', 'libertarian', 'socialist', 'sociological', 'liberal']


In [8]:
albert_tokenizer = AlbertTokenizer.from_pretrained("/data/ganleilei/bert/albert/albert-xxlarge-v2/")
roberta_tokenizer = RobertaTokenizer.from_pretrained("/data/ganleilei/bert/roberta-large", add_prefix_space=True)
fout = open("data/vocab/wiki.roberta.word.bpe.txt", "w+")
for line in open("data/vocab/vocab.wiki.word.txt"):
    word = line.strip().split()[0]
    fout.write(word+"\t" +" ".join(roberta_tokenizer._tokenize(word)) + '\n')
    whitespace_word = " " + word
    fout.write("###"+word+"\t" +" ".join(roberta_tokenizer._tokenize(whitespace_word)).lower() + '\n')
    # fout.write(word+"\t" + " ".join(albert_tokenizer._tokenize(word))+'\n')

fout.close()

In [10]:
bpes = albert_tokenizer(sent)
input_tokens = albert_tokenizer.convert_ids_to_tokens(bpes['input_ids'])
print(input_tokens)
print(albert_tokenizer._tokenize("state"))

['[CLS]', '▁an', 'arch', 'ism', '▁an', 'arch', 'ism', '▁is', '▁a', '▁political', '▁philosophy', '▁that', '▁advocates', '▁self', '-', 'go', 'vern', 'ed', '▁societies', '▁based', '▁on', '▁voluntary', '▁institutions', '.', '[SEP]']
['▁state']


In [11]:
print(type(roberta_tokenizer))
bpes = roberta_tokenizer("anarchism anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions .")
print(roberta_tokenizer.convert_ids_to_tokens(bpes["input_ids"]))

<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>
['<s>', 'Ġanarchism', 'Ġanarchism', 'Ġis', 'Ġa', 'Ġpolitical', 'Ġphilosophy', 'Ġthat', 'Ġadvocates', 'Ġself', '-', 'govern', 'ed', 'Ġsocieties', 'Ġbased', 'Ġon', 'Ġvoluntary', 'Ġinstitutions', 'Ġ.', '</s>']
