In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

# Single source of truth for the checkpoint id, so the model, the tokenizer and
# the optional TensorFlow variant below can never drift apart.
MODEL_NAME = "vinai/phobert-base-v2"

phobert = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
sentence = 'Chúng_tôi là những nghiên_cứu_viên .'

# Wrap the encoded ids in a list so the tensor has a leading batch axis of 1.
input_ids = torch.tensor([tokenizer.encode(sentence)])

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # The result is read by attribute further down
    # (`features.last_hidden_state`, `features.pooler_output`).
    features = phobert(input_ids)

## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# phobert = TFAutoModel.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at vinai/phobert-base-v2 were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to us

In [21]:
text = 'Đồng thời , bệnh viện tiếp tục thực hiện các biện pháp phòng chống dịch bệnh COVID - 19 theo hướng dẫn của Bộ Y tế .'

# Encode the whitespace-split pieces without special tokens, then frame the
# ids by hand with the tokenizer's CLS and SEP ids.
enc = [
    tokenizer.cls_token_id,
    *tokenizer.encode(text.split(), add_special_tokens=False),
    tokenizer.sep_token_id,
]
print(enc)

[0, 2316, 790, 4, 326, 2142, 917, 9170, 2927, 380, 9, 19289, 6222, 292, 335, 1626, 326, 3, 31, 1195, 63, 455, 376, 7, 125, 2406, 7564, 5, 2]


In [3]:
# Hidden size declared in the loaded model's config.
phobert.config.hidden_size

768

In [4]:
# Shape of the id tensor: (batch, sequence length).
input_ids.size()

torch.Size([1, 7])

In [5]:
# Round-trip check: encode the sentence and decode the ids back to text
# (kept inside a one-element list, as before).
[tokenizer.decode(tokenizer.encode(sentence))]

['<s> Chúng_tôi là những nghiên_cứu_viên. </s>']

In [6]:
# Shape of the per-position hidden states: (batch, sequence length, hidden size).
features.last_hidden_state.size()

torch.Size([1, 7, 768])

In [7]:
# Shape of the pooled output: (batch, hidden size).
# NOTE(review): the load warning lists the pooler weights as newly initialised,
# so this vector presumably comes from an untrained layer — confirm before
# using it as a sentence embedding.
features.pooler_output.size()

torch.Size([1, 768])

In [8]:
from collections import Counter

# Quick refresher: a Counter tallies how often each value occurs.
cnt = Counter()
cnt.update([1, 2, 3, 1, 2, 3, 3, 2, 1, 2])
cnt

Counter({2: 4, 1: 3, 3: 3})

In [9]:
import json

# Open the file in a context manager so the handle is closed as soon as the
# JSON is parsed, and decode explicitly as UTF-8 so reading the Vietnamese text
# does not depend on the platform's default encoding.
with open('../data/syllable/train_syllable.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [10]:
# Number of top-level entries loaded from the training JSON.
len(data)

5027

In [11]:
# Length of each record's 'words' list, in dataset order.
sentence_in_length = [len(item['words']) for item in data]

In [12]:
# Longest sentence, measured in entries of its 'words' list.
print("Maximum length of sentence:", max(sentence_in_length))
# `//` is floor division, so the reported average is rounded down to a whole number.
print("Avarage length of sentence:", sum(sentence_in_length) // len(sentence_in_length))

Maximum length of sentence: 186
Avarage length of sentence: 33


In [13]:
# Fraction of sentences that fit within 100 entries.
print(
    "Cover rate of sentence which has length <= 100:",
    sum(1 for p in sentence_in_length if p <= 100) / len(sentence_in_length),
)

Cover rate of sentence which has length <= 100: 0.9976128903918838


In [14]:
# Take the 'words' list of the first record and join it with spaces for display.
words = data[0]['words']
" ".join(words)

'Đồng thời , bệnh viện tiếp tục thực hiện các biện pháp phòng chống dịch bệnh COVID - 19 theo hướng dẫn của Bộ Y tế .'

In [15]:
# Number of entries in the first record's 'words' list.
len(words)

27

In [16]:
# Encode the word list without special tokens, padding/truncating to a fixed
# length of 100 ids.
print(tokenizer.encode(words, add_special_tokens=False, max_length=100, padding='max_length', truncation=True))

[2316, 790, 4, 326, 2142, 917, 9170, 2927, 380, 9, 19289, 6222, 292, 335, 1626, 326, 3, 31, 1195, 63, 455, 376, 7, 125, 2406, 7564, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [17]:
def tokenize(tokenizer, words):
    """Map every word to exactly one vocabulary id.

    Each word is encoded on its own and the first and last ids of the result
    are dropped (assumes ``tokenizer.encode`` wraps its output in one leading
    and one trailing special token). A word that encodes to exactly one id
    keeps that id; a word that encodes to several ids, or to none at all
    (e.g. an empty string), is mapped to ``tokenizer.unk_token_id`` so the
    output always has one id per input word.

    Args:
        tokenizer: Object exposing ``encode(str)`` and ``unk_token_id``.
        words: Iterable of word strings.

    Returns:
        list: One id per word, in input order.
    """
    results = []
    for w in words:
        pieces = tokenizer.encode(w)[1:-1]
        # Anything other than a single piece cannot be represented one-to-one;
        # this also covers an empty encoding, which used to raise IndexError.
        results.append(pieces[0] if len(pieces) == 1 else tokenizer.unk_token_id)
    return results

# Try the helper on the first record's word list.
print(tokenize(tokenizer, words))

[2316, 790, 4, 326, 2142, 917, 9170, 2927, 380, 9, 19289, 6222, 292, 335, 1626, 326, 3, 31, 1195, 63, 455, 376, 7, 125, 2406, 7564, 5]


In [18]:
# Build a batch of one from the per-word ids. The tokenize() helper strips the
# first and last id of every encoding, so no special tokens frame this input.
input_ids = torch.tensor(tokenize(tokenizer, words)).unsqueeze(0)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    features = phobert(input_ids)

features.last_hidden_state.size()

torch.Size([1, 27, 768])

In [None]:
# Display the current id tensor.
input_ids

: 

In [None]:
# Decode the ids of the first row one at a time, printing one decoded string per line.
for idx in input_ids[0]:
    print(tokenizer.decode(idx))

: 

In [None]:
# Id the tokenizer uses for its padding token.
tokenizer.pad_token_id

: 

In [None]:
from collections import Counter

# Start a fresh tally from a small sample list.
cnt = Counter()
cnt.update([1, 2, 3, 4, 3, 2, 3, 2, 1])
cnt

: 

In [None]:
# Counter.update adds to the existing counts rather than replacing them, so
# re-running this cell keeps incrementing `cnt`.
cnt.update([1, 5, 2, 4])
cnt

: 

In [None]:
# Tally how often each tag occurs across all records.
cnt = Counter()

for item in data:
    try:
        cnt.update(item['tags'])
    except Exception:
        # Best effort: report the offending record and carry on counting.
        print('exception', item, sep='\n')

len(cnt)

: 

In [None]:
# Display the current contents of `cnt`.
cnt

: 

In [1]:
# Tag label -> integer id. 'O' is 0; every other entity type takes two
# consecutive ids, the 'B-' label first and its 'I-' label right after.
# Bound to a name so later cells can reuse the mapping instead of only
# displaying it.
tag2id = {
    'O': 0,
    'B-ORGANIZATION': 1,
    'I-ORGANIZATION': 2,
    'B-SYMPTOM_AND_DISEASE': 3,
    'I-SYMPTOM_AND_DISEASE': 4,
    'B-LOCATION': 5,
    'I-LOCATION': 6,
    'B-PATIENT_ID': 7,
    'I-PATIENT_ID': 8,
    'B-DATE': 9,
    'I-DATE': 10,
    'B-AGE': 11,
    'I-AGE': 12,
    'B-NAME': 13,
    'I-NAME': 14,
    'B-JOB': 15,
    'I-JOB': 16,
    'B-TRANSPORTATION': 17,
    'I-TRANSPORTATION': 18,
    'B-GENDER': 19,
    'I-GENDER': 20
}
tag2id

{'O': 0,
 'B-ORGANIZATION': 1,
 'I-ORGANIZATION': 2,
 'B-SYMPTOM_AND_DISEASE': 3,
 'I-SYMPTOM_AND_DISEASE': 4,
 'B-LOCATION': 5,
 'I-LOCATION': 6,
 'B-PATIENT_ID': 7,
 'I-PATIENT_ID': 8,
 'B-DATE': 9,
 'I-DATE': 10,
 'B-AGE': 11,
 'I-AGE': 12,
 'B-NAME': 13,
 'I-NAME': 14,
 'B-JOB': 15,
 'I-JOB': 16,
 'B-TRANSPORTATION': 17,
 'I-TRANSPORTATION': 18,
 'B-GENDER': 19,
 'I-GENDER': 20}

In [2]:
# Distinct keys of `cnt` in sorted order. `cnt` is not created in this cell, so
# the cell that builds it must have been run in the current kernel first.
sorted(cnt)

NameError: name 'cnt' is not defined

In [None]:
import torch

# a = torch.randint(0, 10, size=(3, 10))
# Random values of shape (3, 10, 5), normalised with softmax over the last axis.
b = torch.randn(3, 10, 5).softmax(dim=-1)
# Flatten to one dimension and show the resulting size.
b.reshape(-1).shape

: 