In [None]:
!pip install transformers seqeval[gpu]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.0 MB/s 
[?25hCollecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 45.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.6 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=3e174fd85220308dbbb60f9c4bd68e8168f422f3929cbd4f6ebfd88f2e51f

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [None]:
data = pd.read_csv("/content/drive/MyDrive/NLP/my_data_200.csv", encoding='utf-8')
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Word,POS,Tag
0,0,Sentence: 1,Internet,FW,O
1,1,,Society,Np,B-PER
2,2,,hay,C,O
3,3,,ISOC,Np,B-LOC
4,4,,là,V,O


Let's check how many sentences and words (and corresponding tags) there are in this dataset:

In [None]:
data.count()

Unnamed: 0    831917
Sentence       26375
Word          831912
POS           831917
Tag           831917
dtype: int64

In [None]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 9


O         645043
B-LOC      77182
I-LOC      72178
B-PER      23558
I-PER       6129
B-ORG       2795
I-MISC      2777
B-MISC      2216
I-ORG         39
Name: Tag, dtype: int64

In [None]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('LOC', 149360), ('PER', 29687), ('MIS', 4993), ('ORG', 2834)]


In [None]:
entities_to_remove = ["B-art", "I-art", "B-eve", "I-eve", "B-nat", "I-nat"]
data = data[~data.Tag.isin(entities_to_remove)]
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Word,POS,Tag
0,0,Sentence: 1,Internet,FW,O
1,1,,Society,Np,B-PER
2,2,,hay,C,O
3,3,,ISOC,Np,B-LOC
4,4,,là,V,O


In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
data = data.fillna(method='ffill')
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Word,POS,Tag
0,0,Sentence: 1,Internet,FW,O
1,1,Sentence: 1,Society,Np,B-PER
2,2,Sentence: 1,hay,C,O
3,3,Sentence: 1,ISOC,Np,B-LOC
4,4,Sentence: 1,là,V,O


In [None]:
# let's create a new column called "sentence" which groups the words by sentence 
data['sentence'] = data[['Sentence ','Word','Tag']].groupby(['Sentence '])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence 
data['word_labels'] = data[['Sentence ','Word','Tag']].groupby(['Sentence '])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Word,POS,Tag,sentence,word_labels
0,0,Sentence: 1,Internet,FW,O,Internet Society hay ISOC là một tổ chức quốc ...,"O,B-PER,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,1,Sentence: 1,Society,Np,B-PER,Internet Society hay ISOC là một tổ chức quốc ...,"O,B-PER,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,2,Sentence: 1,hay,C,O,Internet Society hay ISOC là một tổ chức quốc ...,"O,B-PER,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,3,Sentence: 1,ISOC,Np,B-LOC,Internet Society hay ISOC là một tổ chức quốc ...,"O,B-PER,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,4,Sentence: 1,là,V,O,Internet Society hay ISOC là một tổ chức quốc ...,"O,B-PER,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
label2id = {k: v for v, k in enumerate(data.Tag.unique())}
id2label = {v: k for v, k in enumerate(data.Tag.unique())}
label2id

{'O': 0,
 'B-PER': 1,
 'B-LOC': 2,
 'I-LOC': 3,
 'I-PER': 4,
 'B-ORG': 5,
 'B-MISC': 6,
 'I-MISC': 7,
 'I-ORG': 8}

In [None]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'B-LOC',
 3: 'I-LOC',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'B-MISC',
 7: 'I-MISC',
 8: 'I-ORG'}

In [None]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head(20)

Unnamed: 0,sentence,word_labels
0,Internet Society hay ISOC là một tổ chức quốc ...,"O,B-PER,O,B-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Tổ chức này chú trọng đến tiêu chuẩn giáo dục ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,Với trên tổ chức thành viên và thành viên cá n...,"O,O,O,O,O,O,O,B-LOC,B-LOC,I-LOC,I-LOC,I-LOC,O,..."
3,Mọi chi tiết có thể tìm thấy tại website của ISOC,"O,O,O,O,O,O,O,O,O,O,B-PER"
4,Internet Society nằm ở gần thủ đô Washington D...,"O,B-PER,O,O,O,B-LOC,B-LOC,I-LOC,I-LOC,I-LOC,I-..."
5,Số hội viên của nó bao gồm hơn tổ chức thành v...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
6,Thành viên còn có thể tự lập một chi nhánh của...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
7,Hiện nay tổ chức có tới chi nhánh trên toàn th...,"O,O,O,O,O,O,O,O,O,O,O,O"
8,Nhiệm vụ và mục đích hoạt động Bảo đảm cổ vũ c...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
9,Xem thêm Lịch sử Internet Tham khảo Liên kết n...,"O,O,O,O,O,O,O,O,O,O,B-LOC,B-LOC,B-LOC,I-LOC,O,..."


In [None]:
len(data)

26159

In [None]:
data.iloc[41].sentence

'Tác giả Lê Nguyễn Lưu trong cuốn sách Từ chữ Hán đến chữ Nôm thì cho rằng về lĩnh vực chuyên môn và khoa học tỉ lệ này có thể lên đến nhưng khi nhận xét về văn ngữ trong một cuốn tiểu thuyết thì chỉ còn kịch nói rút xuống còn và ngôn ngữ nói chuyện hằng ngày còn thấp hơn nữa'

In [None]:
data.iloc[41].word_labels

'B-LOC,B-LOC,I-LOC,I-LOC,I-LOC,O,O,O,O,B-LOC,I-LOC,O,B-LOC,I-LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

#### **Preparing the dataset and dataloader**

In [None]:
MAX_LEN = 300
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 9
tokenizer = BertTokenizer.from_pretrained('trituenhantaoio/bert-base-vietnamese-uncased')

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [None]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (26159, 2)
TRAIN Dataset: (20927, 2)
TEST Dataset: (5232, 2)



training data

In [None]:
training_set[0]

{'ids': tensor([    2,  1642,   189, 17181,   381,  2408, 24653,  1704,   224,   920,
          1642,   189, 17181,   381,  2408,   879,  6135,   237,  2507, 23220,
          9237, 18943,  2150,  8785,    70,  1555,  8462,  3358,  1555,  4154,
          1197,   138,   740,  8170,  5185,  6921,   920,    80,  1425,   740,
          1619,  2507,    70,  1555,  2668,   247,  4154,  1197,   138,  8170,
          5185,  1721,  3839,   482,   740, 24653,  1704,   224,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

targets are correct:

In [None]:
training_set[0]["ids"]

tensor([    2,  1642,   189, 17181,   381,  2408, 24653,  1704,   224,   920,
         1642,   189, 17181,   381,  2408,   879,  6135,   237,  2507, 23220,
         9237, 18943,  2150,  8785,    70,  1555,  8462,  3358,  1555,  4154,
         1197,   138,   740,  8170,  5185,  6921,   920,    80,  1425,   740,
         1619,  2507,    70,  1555,  2668,   247,  4154,  1197,   138,  8170,
         5185,  1721,  3839,   482,   740, 24653,  1704,   224,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:]), training_set[0]["targets"][:]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
san         O
giao        O
dich        O
chung       B-LOC
khoan       B-LOC
luan        I-LOC
đo          I-LOC
##n         I-LOC
la          O
san         O
giao        O
dich        O
chung       O
khoan       O
chu         O
ye          O
##u         O
cua         O
vuong       B-LOC
quoc        B-LOC
lien        I-LOC
hi          I-LOC
##ep        I-LOC
anh         I-LOC
va          I-LOC
bac         I-LOC
ireland     I-LOC
va          O
lon         O
nha         O
##t         O
tai         O
chau        O
au          O
đay         O
la          O
trung       B-LOC
tam         B-LOC
tai         I-LOC
chinh       I-LOC
cua         I-LOC
anh         I-LOC
va          O
cong        O
ty          O
lon         O
nha         O
##t         O
chau        O
au          O
co          O
tru         O
so          O
tai         O
luan        B-LOC
đo          B-LOC
##n         O
[SEP]       B-LOC
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       

PyTorch dataloaders:

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

#### **Defining the model**

In [None]:
model = BertForTokenClassification.from_pretrained('trituenhantaoio/bert-base-vietnamese-uncased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

In [None]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.1439, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 300, 9])

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_preds.extend(predictions)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

train the model!

In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.295785427093506
Training loss per 100 training steps: 0.2457145415984168
Training loss per 100 training steps: 0.17113923926406832
Training loss per 100 training steps: 0.1426479073111401
Training loss per 100 training steps: 0.12480687727208745
Training loss per 100 training steps: 0.1139987179760388
Training loss per 100 training steps: 0.10689988680890515
Training loss per 100 training steps: 0.10201188973476645
Training loss per 100 training steps: 0.09811494664722745
Training loss per 100 training steps: 0.09403802945612455
Training loss per 100 training steps: 0.09123255136877775
Training loss per 100 training steps: 0.0884178298957761
Training loss per 100 training steps: 0.08629895524849975
Training loss per 100 training steps: 0.08424610223586769
Training loss per 100 training steps: 0.08251394900286585
Training loss per 100 training steps: 0.08112705826478887
Training loss per 100 training steps: 0.07951778894479966
Tr

#### **Evaluating the model**

In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
    
    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

Accuracy on the test test : 88%.

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.02720031701028347
Validation loss per 100 evaluation steps: 0.0473894793655502
Validation loss per 100 evaluation steps: 0.046560524052659986
Validation loss per 100 evaluation steps: 0.04536347287924391
Validation loss per 100 evaluation steps: 0.04656181782669855
Validation loss per 100 evaluation steps: 0.04653246554256995
Validation loss per 100 evaluation steps: 0.04629697219721241
Validation loss per 100 evaluation steps: 0.04620314396492015
Validation loss per 100 evaluation steps: 0.04577521321539024
Validation loss per 100 evaluation steps: 0.0463913553162451
Validation loss per 100 evaluation steps: 0.04588857968355279
Validation loss per 100 evaluation steps: 0.04589743076139117
Validation loss per 100 evaluation steps: 0.04565439018546693
Validation loss per 100 evaluation steps: 0.045682989952337355
Validation loss per 100 evaluation steps: 0.0452243360769463
Validation loss per 100 evaluation steps: 0.04532475404242497
Validatio

In [None]:
sentence = "Chào mừng các bạn đến với Việt Nam, đất nước tươi đẹp."
tokenizer = BertTokenizer.from_pretrained('trituenhantaoio/bert-base-vietnamese-uncased')
inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']])
print(str_rep)
print(word_level_predictions)

chao mun ##g ca ##c ban đen voi viet nam [UNK] đa ##t nu ##oc tu ##o ##i đe ##p [UNK]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
tmp = ""
count = 0
for word in str_rep.split(" "): 
  if word.startswith("##"):
    tmp = tmp.strip()
    tmp = tmp + word.replace("##","") + " "
    del word_level_predictions[count]
    continue
  tmp += word + " "
  count+=1
tmp

'chao mung cac ban đen voi viet nam [UNK] đat nuoc tuoi đep [UNK] '

In [None]:
sentence

'Chào mừng các bạn đến với Việt Nam, đất nước tươi đẹp.'

In [None]:
word_level_predictions

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
model.save_pretrained("/content/drive/MyDrive/NLP/kha-vn-bert-ner")

In [None]:
tokenizer.save_pretrained("/content/drive/MyDrive/NLP/kha-vn-bert-token")

('/content/drive/MyDrive/NLP/kha-vn-bert-token/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/kha-vn-bert-token/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/kha-vn-bert-token/vocab.txt',
 '/content/drive/MyDrive/NLP/kha-vn-bert-token/added_tokens.json')

In [None]:
PATH = '/content/drive/MyDrive/NLP/kha-vn-bert-ner'
PATH1 = '/content/drive/MyDrive/NLP/kha-vn-bert-token'

In [None]:
model_test = BertForTokenClassification.from_pretrained(PATH, local_files_only=True)
tokenizer_test = BertTokenizer.from_pretrained(PATH1, local_files_only=True)
model_test.to(device)

In [None]:
from torch import cuda

def process(sentence):
  device = 'cuda' if cuda.is_available() else 'cpu'
  PATH = '/content/drive/MyDrive/NLP/kha-vn-bert-ner'
  PATH1 = '/content/drive/MyDrive/NLP/kha-vn-bert-token'
  model_test = BertForTokenClassification.from_pretrained(PATH, local_files_only=True)
  tokenizer_test = BertTokenizer.from_pretrained(PATH1, local_files_only=True)
  model_test.to(device)
  id2label = {0: 'O',
                1: 'B-PER',
                2: 'B-LOC',
                3: 'I-LOC',
                4: 'I-PER',
                5: 'B-ORG',
                6: 'B-MISC',
                7: 'I-MISC',
                8: 'I-ORG'}

  inputs = tokenizer_test(sentence, padding='max_length', truncation=True, max_length=300, return_tensors="pt")
  # move to gpu
  ids = inputs["input_ids"].to(device)
  mask = inputs["attention_mask"].to(device)
  # forward pass
  outputs = model_test(ids, mask)
  logits = outputs[0]

  active_logits = logits.view(-1, model_test.num_labels) # shape (batch_size * seq_len, num_labels)
  flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

  tokens = tokenizer_test.convert_ids_to_tokens(ids.squeeze().tolist())
  token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
  wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

  word_level_predictions = []
  for pair in wp_preds:
    if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
      # skip prediction
      continue
    else:
      word_level_predictions.append(pair[1])

  # we join tokens, if they are not special ones
  str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']])
  tmp = ""
  count = 0
  for word in str_rep.split(" "): 
    if word.startswith("##"):
      tmp = tmp.strip()
      tmp = tmp + word.replace("##","") + " "
      del word_level_predictions[count]
      continue
    tmp += word + " "
    count+=1
  
  
  return word_level_predictions

In [None]:
def getFinalString(text):
  sentences = text.split(".")
  rs = []
  for sentence in sentences:
    word_level_predictions = process(sentence)
    sentence = sentence.replace(',', " , ")
    sentence = sentence.replace('.', " . ")
    sentence = sentence.replace('?', " ? ")
    sentence = sentence.replace('!', " ! ")
    sentence = sentence.replace('(', " ( ")
    sentence = sentence.replace('(', " ( ")
    sentence = sentence.replace('"', ' " ')
    sentence = sentence.replace("'", " ' ")
    sentence = sentence.replace('\\', " \\ ")
    sentence = sentence.replace('//', " // ")
    sentence = sentence.replace('{', " { ")
    sentence = sentence.replace('}', " } ")



    sentence = re.sub(' +', ' ',sentence)
    sentences_AS_list = sentence.split(" ")
    count = 0
    for ner in word_level_predictions:
      if(ner != 'O'):
        if(sentences_AS_list[count] not in [",", ".", "?", "!", "(", "(", '"', "'", "\\", "//", "{", "}"]):
          sentences_AS_list[count] = "<span class='textt' >" + sentences_AS_list[count] + "</span>"
      count +=1

    finalRs = " ".join(sentences_AS_list)
    rs.append(finalRs)

  Final_string = ".".join(rs)
  return Final_string

In [None]:
text = 'Ohio chiến đấu với Michigan trong một cuộc chiến không đổ máu để có được thành phố Gargamesh ngày nay là Toledo, cuộc chiến này được gọi là Chiến tranh Toledo. Luật pháp và chính quyền Thủ phủ của Ohio là Columbus, gần trung tâm tiểu bang. Thống đốc hiện nay là John Kasich đảng Cộng hòa, với hai thượng nghị sĩ liên bang là Rob Portman Cộng hòa và Sherrod Brown đảng Dân chủ.'
print(text)
print(getFinalString(text))

Ohio chiến đấu với Michigan trong một cuộc chiến không đổ máu để có được thành phố Gargamesh ngày nay là Toledo, cuộc chiến này được gọi là Chiến tranh Toledo. Luật pháp và chính quyền Thủ phủ của Ohio là Columbus, gần trung tâm tiểu bang. Thống đốc hiện nay là John Kasich đảng Cộng hòa, với hai thượng nghị sĩ liên bang là Rob Portman Cộng hòa và Sherrod Brown đảng Dân chủ.
<span class='textt' >Ohio</span> chiến đấu với <span class='textt' >Michigan</span> trong một cuộc chiến không đổ máu để có được <span class='textt' >thành</span> <span class='textt' >phố</span> <span class='textt' >Gargamesh</span> ngày nay là <span class='textt' >Toledo</span> , cuộc chiến này được gọi là <span class='textt' >Chiến</span> <span class='textt' >tranh</span> Toledo. Luật pháp và chính quyền Thủ phủ <span class='textt' >của</span> Ohio <span class='textt' >là</span> <span class='textt' >Columbus</span> , gần trung tâm tiểu bang. Thống đốc hiện nay <span class='textt' >là</span> <span class='textt' >Jo