In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the model and its tokenizer from the same checkpoint name.
# The load warning printed below this cell lists the pooler weights
# (roberta.pooler.dense.*) as newly initialized, so `pooler_output` is not
# backed by pretrained weights.
phobert = AutoModel.from_pretrained("vinai/phobert-base-v2")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
# (In this example the syllables of a multi-syllable word are joined with "_".)
sentence = 'Chúng_tôi là những nghiên_cứu_viên .'  

# encode() returns a flat list of ids; the extra list adds the batch dimension.
input_ids = torch.tensor([tokenizer.encode(sentence)])

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # The returned object is read by attribute in later cells
    # (features.last_hidden_state, features.pooler_output).
    features = phobert(input_ids)

## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
# NOTE(review): the TensorFlow snippet names "vinai/phobert-base" while the
# code above loads "vinai/phobert-base-v2" - confirm which checkpoint is meant.

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at vinai/phobert-base-v2 were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to us

In [38]:
text = 'Đồng thời , bệnh viện tiếp tục thực hiện các biện pháp phòng chống dịch bệnh COVID - 19 theo hướng dẫn của Bộ Y tế .'

# encode() is given a list of whitespace-split words rather than one string.
# The recorded output has one id per word (27 words -> 27 ids), and "COVID"
# comes back as id 3 - presumably the unknown-token id; confirm with
# tokenizer.unk_token_id.
tokenizer.encode(text.split(), add_special_tokens=False)

[2316,
 790,
 4,
 326,
 2142,
 917,
 9170,
 2927,
 380,
 9,
 19289,
 6222,
 292,
 335,
 1626,
 326,
 3,
 31,
 1195,
 63,
 455,
 376,
 7,
 125,
 2406,
 7564,
 5]

In [12]:
phobert.config.hidden_size

768

In [13]:
input_ids.size()

torch.Size([1, 7])

In [84]:
# Round-trip check: encode the sentence, then decode the whole id sequence back
# to text. Kept as a one-element list so the displayed value has the same shape
# as before, without shadowing the builtin `id` in a comprehension.
[tokenizer.decode(tokenizer.encode(sentence))]

['<s> Chúng tôi là những nghiên cứu viên. </s>']

In [86]:
features.last_hidden_state.size()

torch.Size([1, 10, 768])

In [7]:
features.pooler_output.size()

torch.Size([1, 768])

In [12]:
from collections import Counter

cnt = Counter([1, 2, 3, 1, 2, 3, 3, 2, 1, 2])
cnt

Counter({1: 3, 2: 4, 3: 3})

In [26]:
import json

# Read the training split. The context manager closes the file handle as soon
# as parsing is done, and the explicit UTF-8 encoding keeps the Vietnamese text
# from depending on the platform's default encoding.
with open('../data/syllable/train_syllable.json', 'r', encoding='utf-8') as train_file:
    data = json.load(train_file)

In [39]:
len(data)

5027

In [40]:
# Number of entries in each record's 'words' list, in dataset order.
sentence_in_length = [len(record['words']) for record in data]

In [44]:
# Longest sentence, and the mean length rounded down (floor division).
print("Maximum length of sentence:", max(sentence_in_length))
print("Average length of sentence:", sum(sentence_in_length) // len(sentence_in_length))

Maximum length of sentence: 186
Avarage length of sentence: 33


In [45]:
# Fraction of sentences whose length does not exceed 100.
print(
    "Cover rate of sentence which has length <= 100:",
    sum(1 for n_words in sentence_in_length if n_words <= 100) / len(sentence_in_length),
)

Cover rate of sentence which has length <= 100: 0.9976128903918838


In [28]:
words = data[0]['words']
" ".join(words)

'Đồng thời , bệnh viện tiếp tục thực hiện các biện pháp phòng chống dịch bệnh COVID - 19 theo hướng dẫn của Bộ Y tế .'

In [29]:
len(words)

27

In [33]:
print(tokenizer.encode(words, add_special_tokens=False, max_length=100, padding='max_length', truncation=True))

[2316, 790, 4, 326, 2142, 917, 9170, 2927, 380, 9, 19289, 6222, 292, 335, 1626, 326, 3, 31, 1195, 63, 455, 376, 7, 125, 2406, 7564, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [79]:
def tokenize(tokenizer, words):
    """Map every word to exactly one vocabulary id.

    Each word is encoded on its own. A word that encodes to a single id keeps
    that id; a word that encodes to several ids, or to none at all (for
    example an empty string), is replaced by ``tokenizer.unk_token_id``. The
    result therefore always has the same length as ``words``.

    Args:
        tokenizer: Object providing ``encode(str) -> list[int]`` and an
            ``unk_token_id`` attribute.
        words: Iterable of word strings.

    Returns:
        list[int]: One id per input word, in input order.
    """
    results = []
    for w in words:
        # Drop the first and last ids. This assumes encode() wraps the text in
        # exactly one special id on each side (e.g. <s> ... </s>).
        pieces = tokenizer.encode(w)[1:-1]
        # Only an unambiguous single-piece encoding is kept. Checking for
        # exactly one piece also covers the empty case, which previously
        # raised IndexError on pieces[0].
        results.append(pieces[0] if len(pieces) == 1 else tokenizer.unk_token_id)
    return results

print(tokenize(tokenizer, words))

[2316, 790, 4, 326, 2142, 917, 9170, 2927, 380, 9, 19289, 6222, 292, 335, 1626, 326, 3, 31, 1195, 63, 455, 376, 7, 125, 2406, 7564, 5]


In [80]:
# Build a batch of one sequence from the per-word ids. tokenize() strips the
# first and last id of every encoding, so this input carries no <s>/</s>
# markers. Note that this rebinds `input_ids` from the first cell.
input_ids = torch.tensor([tokenize(tokenizer, words)])

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    features = phobert(input_ids)  # output object; fields are read by attribute below

# One hidden vector per input id: (batch, number of ids, hidden size).
features.last_hidden_state.size()

torch.Size([1, 27, 768])

In [75]:
input_ids

tensor([[    0,  2460,    70,     8,    21, 25925,  1098,  1430,     5,     2]])

In [26]:
for idx in input_ids[0]:
    print(tokenizer.decode(idx))

< s >
đ ồ n g
t h ờ i
,
b ệ n h
v i ệ n
t i ế p
t ụ c
t h ự c
h i ệ n
c á c
b i ệ n
p h á p
p h ò n g
c h ố n g
d ị c h
b ệ n h
c o @ @
v i @ @
d
-
1 9
t h e o
h ư ớ n g
d ẫ n
c ủ a
b ộ
y
t ế
.
< / s >


In [81]:
tokenizer.pad_token_id

1

In [57]:
from collections import Counter

cnt = Counter([1, 2, 3, 4, 3, 2, 3, 2, 1])
cnt

Counter({1: 2, 2: 3, 3: 3, 4: 1})

In [58]:
cnt.update([1, 5, 2, 4])
cnt

Counter({1: 3, 2: 4, 3: 3, 4: 2, 5: 1})

In [71]:
# Count how often each tag occurs across all records in `data`.
cnt = Counter([])

for item in data:
    try:
        # Counter.update iterates item['tags'] and adds one count per element.
        cnt.update(item['tags'])
    except Exception:
        # Best-effort: report the offending record and continue with the rest.
        print('exception')
        print(item)

# Number of distinct tags seen.
len(cnt)

21

In [67]:
cnt

Counter({'O': 129379,
         'B-ORGANIZATION': 1137,
         'I-ORGANIZATION': 4818,
         'B-SYMPTOM_AND_DISEASE': 1439,
         'I-SYMPTOM_AND_DISEASE': 2270,
         'B-LOCATION': 5398,
         'I-LOCATION': 12309,
         'B-DATE': 2549,
         'B-PATIENT_ID': 3240,
         'B-AGE': 682,
         'B-NAME': 349,
         'I-DATE': 2500,
         'B-JOB': 205,
         'I-JOB': 318,
         'B-TRANSPORTATION': 226,
         'B-GENDER': 542,
         'I-GENDER': 14,
         'I-TRANSPORTATION': 69,
         'I-NAME': 80,
         'I-AGE': 2,
         'I-PATIENT_ID': 15})

In [None]:
# Tag -> integer index table covering the 21 tags counted in `cnt`.
# NOTE(review): this literal is not assigned to a name, so evaluating the cell
# only displays it; bind it to a variable before using it elsewhere.
# The order is hand-picked ('O' first, then B-/I- pairs) and differs from
# sorted(cnt.keys()).
{
    'O': 0,
    'B-ORGANIZATION': 1,
    'I-ORGANIZATION': 2,
    'B-SYMPTOM_AND_DISEASE': 3,
    'I-SYMPTOM_AND_DISEASE': 4,
    'B-LOCATION': 5,
    'I-LOCATION': 6,
    'B-PATIENT_ID': 7,
    'I-PATIENT_ID': 8,
    'B-DATE': 9,
    'I-DATE': 10,
    'B-AGE': 11,
    'I-AGE': 12,
    'B-NAME': 13,
    'I-NAME': 14,
    'B-JOB': 15,
    'I-JOB': 16,
    'B-TRANSPORTATION': 17,
    'I-TRANSPORTATION': 18,
    'B-GENDER': 19,
    'I-GENDER': 20
}

In [68]:
sorted(cnt.keys())

['B-AGE',
 'B-DATE',
 'B-GENDER',
 'B-JOB',
 'B-LOCATION',
 'B-NAME',
 'B-ORGANIZATION',
 'B-PATIENT_ID',
 'B-SYMPTOM_AND_DISEASE',
 'B-TRANSPORTATION',
 'I-AGE',
 'I-DATE',
 'I-GENDER',
 'I-JOB',
 'I-LOCATION',
 'I-NAME',
 'I-ORGANIZATION',
 'I-PATIENT_ID',
 'I-SYMPTOM_AND_DISEASE',
 'I-TRANSPORTATION',
 'O']

In [None]:
import torch

# a = torch.randint(0, 10, size=(3, 10))
# Random scores of shape (3, 10, 5), normalised over the last axis so that
# every length-5 slice sums to 1.
b = torch.randn(3, 10, 5).softmax(dim=-1)
# Size of the flattened tensor (3 * 10 * 5 elements).
b.reshape(-1).shape

tensor([[False, False, False, False,  True, False, False, False, False, False],
        [False, False, False, False, False, False, False, False,  True, False],
        [False, False, False, False, False, False, False, False, False, False]])


tensor(2)