In [None]:
from transformers import AutoTokenizer

# Mixed Vietnamese/English sample sentence that every tokenizer below is run on.
sentence = "Lập trình viên Python. Chuyên gia Machine Learning."

# English BERT checkpoints: lower-cased and case-preserving variants.
bert_uncased_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google-bert/bert-base-uncased"
)
bert_cased_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google-bert/bert-base-cased"
)

# NOTE: despite the variable name, this loads the "xlm-roberta-large" checkpoint
# (XLM-RoBERTa), not an English-only RoBERTa checkpoint.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="xlm-roberta-large"
)

# NOTE(review): the PhoBERT model card asks for word-segmented Vietnamese input;
# `sentence` is passed raw here - confirm that this is intentional for the demo.
phobert_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="vinai/phobert-base-v2"
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Print the subword tokens each tokenizer produces for the same sentence,
# one header line followed by the token list, in a fixed order.
for _header, _tok in (
    ("==> Tokens by BERT uncased:", bert_uncased_tokenizer),
    ("==> Tokens by BERT cased:", bert_cased_tokenizer),
    ("==> Tokens by RoBERTa:", roberta_tokenizer),  # xlm-roberta-large checkpoint
    ("==> Tokens by PhoBERT:", phobert_tokenizer),
):
    print(_header)
    print(_tok.tokenize(sentence))

==> Tokens by BERT uncased:
['lap', 'tri', '##nh', 'vie', '##n', 'python', '.', 'chu', '##yen', 'gia', 'machine', 'learning', '.']
==> Tokens by BERT cased:
['L', '##ậ', '##p', 't', '##r', '##ì', '##nh', 'v', '##i', '##ê', '##n', 'Python', '.', 'Chu', '##y', '##ê', '##n', 'g', '##ia', 'Machine', 'Learning', '.']
==> Tokens by RoBERTa:
['▁Lập', '▁trình', '▁viên', '▁Python', '.', '▁Chuyên', '▁gia', '▁Machine', '▁Learning', '.']
==> Tokens by PhoBERT:
['Lập', 'trình', 'viên', 'Py@@', 'th@@', 'on.', 'Chuyên', 'gia', 'Mach@@', 'ine', 'Le@@', 'arn@@', 'ing@@', '.']


In [4]:
# Round trip with the uncased BERT tokenizer: sentence -> token IDs -> text.
encoded_input = bert_uncased_tokenizer(sentence, return_tensors="pt")

# "input_ids" is indexed with [0] to pick out the single encoded sequence.
bert_uncased_ids = encoded_input["input_ids"][0]

# decode() is called without skipping special tokens, so they appear in the text.
decoded_input = bert_uncased_tokenizer.decode(bert_uncased_ids)

print("\nEncoded IDs (BERT uncased):", bert_uncased_ids.tolist())
print("Decoded sentence (BERT uncased):", decoded_input)


Encoded IDs (BERT uncased): [101, 5001, 13012, 25311, 20098, 2078, 18750, 1012, 14684, 20684, 27699, 3698, 4083, 1012, 102]
Decoded sentence (BERT uncased): [CLS] lap trinh vien python. chuyen gia machine learning. [SEP]


In [5]:
# Round trip with the cased BERT tokenizer: sentence -> token IDs -> text.
encoded_input = bert_cased_tokenizer(sentence, return_tensors="pt")

# "input_ids" is indexed with [0] to pick out the single encoded sequence.
bert_cased_ids = encoded_input["input_ids"][0]

# decode() is called without skipping special tokens, so they appear in the text.
decoded_input = bert_cased_tokenizer.decode(bert_cased_ids)

print("\nEncoded IDs (BERT cased):", bert_cased_ids.tolist())
print("Decoded sentence (BERT cased):", decoded_input)


Encoded IDs (BERT cased): [101, 149, 28645, 1643, 189, 1197, 21409, 15624, 191, 1182, 24559, 1179, 23334, 119, 17144, 1183, 24559, 1179, 176, 1465, 7792, 9681, 119, 102]
Decoded sentence (BERT cased): [CLS] Lập trình viên Python. Chuyên gia Machine Learning. [SEP]


In [6]:
# Round trip with the PhoBERT tokenizer: sentence -> token IDs -> text.
encoded_input = phobert_tokenizer(sentence, return_tensors="pt")

# "input_ids" is indexed with [0] to pick out the single encoded sequence.
phobert_ids = encoded_input["input_ids"][0]

# decode() is called without skipping special tokens, so they appear in the text.
decoded_input = phobert_tokenizer.decode(phobert_ids)

print("\nEncoded IDs (PhoBERT):", phobert_ids.tolist())
print("Decoded sentence (PhoBERT):", decoded_input)


Encoded IDs (PhoBERT): [0, 6081, 1893, 1430, 10085, 1981, 34412, 15806, 3931, 27341, 3403, 2923, 50941, 5936, 5, 2]
Decoded sentence (PhoBERT): <s> Lập trình viên Python. Chuyên gia Machine Learning. </s>


In [7]:
# Round trip with `roberta_tokenizer` (the "xlm-roberta-large" checkpoint loaded
# in the first cell): sentence -> token IDs -> text.
encoded_input = roberta_tokenizer(sentence, return_tensors="pt")

# "input_ids" is indexed with [0] to pick out the single encoded sequence.
roberta_ids = encoded_input["input_ids"][0]

# decode() is called without skipping special tokens, so they appear in the text.
decoded_input = roberta_tokenizer.decode(roberta_ids)

print("\nEncoded IDs (RoBERTa):", roberta_ids.tolist())
print("Decoded sentence (RoBERTa):", decoded_input)


Encoded IDs (RoBERTa): [0, 189041, 5009, 4603, 145581, 5, 115818, 3529, 68311, 114344, 5, 2]
Decoded sentence (RoBERTa): <s> Lập trình viên Python. Chuyên gia Machine Learning.</s>
