In [1]:
import time

## HuggingFace

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

# Benchmark the reference HuggingFace pipeline step by step: model load,
# tokenizer load, encoding of one sentence and a single forward pass.
# Intervals are measured with time.perf_counter(), a monotonic high-resolution
# clock meant for timing; time.time() is wall-clock time and can jump if the
# system clock is adjusted while a step is running.
start = time.perf_counter()
ori_model = AutoModel.from_pretrained('vinai/phobert-base')
end = time.perf_counter()
print(f'>>> Loaded model in {end - start} seconds')

start = time.perf_counter()
ori_tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
end = time.perf_counter()
print(f'>>> Loaded tokenizer in {end - start} seconds')

start = time.perf_counter()
token_ids = ori_tokenizer.encode('Tôi là sinh_viên trường Đại_học Tôn_Đức_Thắng')
end = time.perf_counter()
print('Tokens: ', token_ids)
print(f'>>> Encoded in {end - start} seconds')

# Wrap the id list in an outer list so the tensor gets a leading batch
# dimension of size 1; every position is a real token, so the mask is all ones.
token_ids = torch.tensor([token_ids], dtype=torch.int64)
attention_mask = torch.ones_like(token_ids, dtype=torch.int64)

# Only the forward pass is timed; no_grad() skips autograd bookkeeping.
start = time.perf_counter()
with torch.no_grad():
    output = ori_model(token_ids, attention_mask=attention_mask)
end = time.perf_counter()
last_hidden_state = output.last_hidden_state
pooler_output = output.pooler_output
print(f'Last hidden state shape: {last_hidden_state.shape}')
print(f'Pooler ouput shape: {pooler_output.shape}')
print(f'>>> Extracted last hidden state in {end - start} seconds')

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


>>> Loaded model in 5.507025957107544 seconds


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


>>> Loaded tokenizer in 11.773649215698242 seconds
Tokens:  [0, 218, 8, 649, 212, 850, 10742, 2]
>>> Encoded in 0.0007321834564208984 seconds
Last hidden state shape: torch.Size([1, 8, 768])
Pooler ouput shape: torch.Size([1, 768])
>>> Extracted last hidden state in 0.14553403854370117 seconds


In [12]:
from torch.nn.utils.rnn import pad_sequence
# Run the HuggingFace model on a padded batch of three sentences of different
# lengths. The sentences are listed once and encoded in a loop instead of
# repeating the encode/append pair for each of them.
sentences = [
    'Tôi là sinh_viên trường Đại_học Tôn_Đức_Thắng',
    'Tôi là thành_viên Câu_lạc_bộ học_thuật ICON',
    'Năm nay là năm thứ 4',
]
encoded_sentences = []
for sentence in sentences:
    token_ids = ori_tokenizer.encode(sentence)
    encoded_sentences.append(torch.tensor(token_ids, dtype=torch.int64))

# Right-pad every sequence with the tokenizer's pad id up to the longest one.
batch_inputs = pad_sequence(encoded_sentences, batch_first=True, padding_value=ori_tokenizer.pad_token_id)
print(batch_inputs)

# 1 for real tokens, 0 for padding positions.
attention_mask = batch_inputs.ne(ori_tokenizer.pad_token_id).to(torch.int64)
print('Mask: ', attention_mask)

# time.perf_counter() is the monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
with torch.no_grad():
    output = ori_model(batch_inputs, attention_mask=attention_mask)
end = time.perf_counter()
last_hidden_state = output.last_hidden_state
pooler_output = output.pooler_output
print(f'Last hidden state shape: {last_hidden_state.shape}')
print(f'Pooler ouput shape: {pooler_output.shape}')
print(f'>>> Extracted last hidden state in {end - start} seconds')

tensor([[    0,   218,     8,   649,   212,   850, 10742,     2,     1],
        [    0,   218,     8,   496,  6945, 14347, 10038, 15111,     2],
        [    0,   434,   136,     8,    29,   129,   163,     2,     1]])
Mask:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])
Last hidden state shape: torch.Size([3, 9, 768])
Pooler ouput shape: torch.Size([3, 768])
>>> Extracted last hidden state in 0.789665937423706 seconds


In [4]:
# Snapshot the HuggingFace outputs as NumPy arrays before the ONNX cells rebind
# `last_hidden_state` / `pooler_output` to their own results.
# detach().cpu() keeps the conversion working even if the tensors track
# gradients or live on an accelerator; a bare .numpy() raises in both cases.
last_hidden_state1 = last_hidden_state.detach().cpu().numpy()
pooler_output1 = pooler_output.detach().cpu().numpy()

## ONNX runtime and Custom Tokenizer

In [5]:
import numpy as np
import onnxruntime as ort
from tokenizer import Tokenizer

# Benchmark the ONNX Runtime pipeline with the custom tokenizer: session load,
# tokenizer load, encoding of one sentence and a single inference call.
# time.perf_counter() is the monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
ort_session = ort.InferenceSession('phobert_base.onnx')
end = time.perf_counter()
print(f'>>> Loaded ONNX model in {end - start} seconds')

start = time.perf_counter()
my_tokenizer = Tokenizer('vocab_files/vocab.txt', 'vocab_files/bpe.codes')
end = time.perf_counter()
print(f'>>> Loaded tokenizer in {end - start} seconds')

start = time.perf_counter()
token_ids = my_tokenizer.encode('Tôi là sinh_viên trường Đại_học Tôn_Đức_Thắng')
end = time.perf_counter()
print('Tokens: ', token_ids)
print(f'>>> Encoded in {end - start} seconds')

# Duplicate the sentence to form a batch of two identical rows.
# The dtype is forced to int64 so the ids match the attention mask below and
# the batched ONNX cell instead of depending on whatever integer dtype the
# tokenizer returns. NOTE(review): presumably the exported graph declares
# int64 inputs - confirm against the export script.
token_ids = np.array([token_ids, token_ids], dtype=np.int64)
attention_mask = np.ones_like(token_ids, dtype=np.int64)
start = time.perf_counter()
output = ort_session.run(['last_hidden_state', 'pooler_output'], {'input_ids': token_ids, 'attention_mask': attention_mask})
end = time.perf_counter()
# Outputs come back in the order of the requested output names.
last_hidden_state = output[0]
pooler_output = output[1]
print(f'Last hidden state shape: {last_hidden_state.shape}')
print(f'Pooler ouput shape: {pooler_output.shape}')
print(f'>>> Extracted last hidden state in {end - start} seconds')

>>> Loaded ONNX model in 1.6606559753417969 seconds
>>> Loaded tokenizer in 0.3865170478820801 seconds
Tokens:  [    0   218     8   649   212   850 10742     2]
>>> Encoded in 0.0003857612609863281 seconds
Last hidden state shape: (2, 8, 768)
Pooler ouput shape: (2, 768)
>>> Extracted last hidden state in 0.08528900146484375 seconds


Loading vocabulary from vocab_files/vocab.txt ...
Read 3149446962 words (63996 unique) from vocabulary file.
Loading codes from vocab_files/bpe.codes ...
Read 64000 codes from the codes file.


In [11]:
# Run the ONNX model on a padded batch of three sentences of different lengths,
# encoded with the custom tokenizer. The sentences are listed once and encoded
# in a loop instead of repeating the encode/append pair for each of them.
sentences = [
    'Tôi là sinh_viên trường Đại_học Tôn_Đức_Thắng',
    'Tôi là thành_viên Câu_lạc_bộ học_thuật ICON',
    'Năm nay là năm thứ 4',
]
encoded_sentences = []
for sentence in sentences:
    token_ids = my_tokenizer.encode(sentence)
    encoded_sentences.append(np.array(token_ids, dtype=np.int64))

# Right-pad every sequence with the '<pad>' id up to the longest one and stack
# the rows into a single (batch, max_len) array.
max_len = max(len(ids) for ids in encoded_sentences)
pad_idx = my_tokenizer.vocab.word2idx['<pad>']
batch_inputs = np.stack([
    np.pad(ids, (0, max_len - len(ids)), 'constant', constant_values=pad_idx)
    for ids in encoded_sentences
])
print(batch_inputs)

# 1 for real tokens, 0 for padding positions.
attention_mask = (batch_inputs != pad_idx).astype(np.int64)
print('Mask: ', attention_mask)

# time.perf_counter() is the monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
output = ort_session.run(['last_hidden_state', 'pooler_output'], {'input_ids': batch_inputs, 'attention_mask': attention_mask})
end = time.perf_counter()
# Outputs come back in the order of the requested output names.
last_hidden_state = output[0]
pooler_output = output[1]
print(f'Last hidden state shape: {last_hidden_state.shape}')
print(f'Pooler ouput shape: {pooler_output.shape}')
print(f'>>> Extracted last hidden state in {end - start} seconds')

[[    0   218     8   649   212   850 10742     2     1]
 [    0   218     8   496  6945 14347 10038 15111     2]
 [    0   434   136     8    29   129   163     2     1]]
Mask:  [[1 1 1 1 1 1 1 1 0]
 [1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 0]]
Last hidden state shape: (3, 9, 768)
Pooler ouput shape: (3, 768)
>>> Extracted last hidden state in 0.1820201873779297 seconds


In [7]:
# Keep the ONNX Runtime results under their own names so they can be compared
# with the HuggingFace snapshot (`last_hidden_state1` / `pooler_output1`).
last_hidden_state2, pooler_output2 = last_hidden_state, pooler_output

In [8]:
# Compare the HuggingFace and ONNX last hidden states element by element.
# Without the shape check, arrays produced from different inputs (e.g. cells run
# out of order) could silently broadcast against each other.
assert last_hidden_state1.shape == last_hidden_state2.shape, (
    f'shape mismatch: {last_hidden_state1.shape} vs {last_hidden_state2.shape}'
)
hidden_state_diff = last_hidden_state1 - last_hidden_state2
print(hidden_state_diff.mean())
# The signed mean above lets positive and negative errors cancel out, so a
# value near zero does not prove the outputs agree; the largest absolute
# deviation is the meaningful figure.
print(f'Max abs diff: {np.abs(hidden_state_diff).max()}')

6.721952e-10


In [9]:
# Compare the HuggingFace and ONNX pooler outputs element by element.
# Without the shape check, arrays produced from different inputs (e.g. cells run
# out of order) could silently broadcast against each other.
assert pooler_output1.shape == pooler_output2.shape, (
    f'shape mismatch: {pooler_output1.shape} vs {pooler_output2.shape}'
)
pooler_diff = pooler_output1 - pooler_output2
print(pooler_diff.mean())
# The signed mean above lets positive and negative errors cancel out, so a
# value near zero does not prove the outputs agree; the largest absolute
# deviation is the meaningful figure.
print(f'Max abs diff: {np.abs(pooler_diff).max()}')

-4.10376e-09
