In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.optim as optim
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import  AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report

In [2]:
!gdown 1MGOpGIl9-bBU1zbYJcD7OCE_2t3eSUsX
!gdown 1e2s-OOu18iC2dvCA6_GDfiD85PChx3R3

# !gdown

Downloading...
From: https://drive.google.com/uc?id=1MGOpGIl9-bBU1zbYJcD7OCE_2t3eSUsX
To: /content/train_word.json
100% 2.32M/2.32M [00:00<00:00, 208MB/s]
Downloading...
From: https://drive.google.com/uc?id=1e2s-OOu18iC2dvCA6_GDfiD85PChx3R3
To: /content/test_word.json
100% 1.54M/1.54M [00:00<00:00, 174MB/s]


In [3]:
train_word = pd.read_json(r"/content/train_word.json",  encoding='utf-8', lines = True)
test_word = pd.read_json(r"/content/test_word.json",  encoding='utf-8', lines = True)
# dev_word = pd.read_json(r"C:\Users\VIET HOANG - VTS\Downloads\PhoNER_COVID19-main\PhoNER_COVID19-main\data\word\dev_word.json",  encoding='utf-8', lines = True)

In [11]:
import numpy as np
import torch
from torch.utils.data import Dataset
from keras.preprocessing.sequence import pad_sequences

class VNerDataset(Dataset):
    def __init__(self, data_json, tokenizer , max_len=50, pad_token='<PAD>', unk_token='<UNK>', o_tag='O'):
        self.data = data_json
        self.max_len = max_len
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.o_tag = o_tag

        # Khởi tạo từ điển từ và nhãn
        self.vocab2idx = self.build_vocab2idx()
        self.tag2idx = self.build_tag2idx()
        self.tokenizer = tokenizer
    def build_vocab(self):
        data = pd.read_json(r"/content/train_word.json",  encoding='utf-8', lines = True)
        vocab = set(word for sentence in data['words'] for word in sentence)
        data2 = pd.read_json(r"/content/test_word.json",  encoding='utf-8', lines = True)
        vocab2 = set(word for sentence in data2['words'] for word in sentence)
        vocab = vocab.union(vocab2)

        return vocab

    def build_tag(self):
        tags = set(tag for tags in self.data['tags'] for tag in tags)
        return tags

    def build_tag2idx(self):
        tag2idx = {tag: i for i, tag in enumerate(self.build_tag())}
        return tag2idx

    def build_vocab2idx(self):
        vocab2idx = {vocab: i for i, vocab in enumerate(self.build_vocab())}
        vocab2idx[self.pad_token] = len(vocab2idx)
        vocab2idx[self.unk_token] = len(vocab2idx) + 1
        return vocab2idx

    def encode_data(self):
        X, y = [], []
        for words, tags in zip(self.data['words'], self.data['tags']):
            word_ids = [self.vocab2idx.get(word, self.vocab2idx[self.unk_token]) for word in words]
            tag_ids = [self.tag2idx[tag] for tag in tags]

            # Padding cho mỗi câu
            word_ids = pad_sequences([word_ids], maxlen=self.max_len, padding='post', value=self.vocab2idx[self.pad_token])[0]
            tag_ids = pad_sequences([tag_ids], maxlen=self.max_len, padding='post', value=self.tag2idx[self.o_tag])[0]

            X.append(word_ids)
            y.append(tag_ids)


        return torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words, tags = self.data['words'][idx], self.data['tags'][idx]
        # word_ids = [self.vocab2idx.get(word, self.vocab2idx[self.unk_token]) for word in words]
        tag_ids = [self.tag2idx[tag] for tag in tags]

        # word_ids = pad_sequences([word_ids], maxlen=self.max_len, padding='post', value=self.vocab2idx[self.pad_token])[0]
        tag_ids = pad_sequences([tag_ids], maxlen=self.max_len, padding='post', value=self.tag2idx[self.o_tag])[0]

        encode = self.tokenizer(words, is_split_into_words=True, padding='max_length', max_length=self.max_len, truncation=True)
        encode['labels'] = tag_ids

        return {key: torch.tensor(val) for key, val in encode.items()}

In [4]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Tải PhoBERT và tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=20)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
train_dataset = VNerDataset(train_word, tokenizer, max_len=50)
test_dataset = VNerDataset(test_word, tokenizer, max_len=50)
# dev_dataset = VNerDataset(dev_word, max_len=50)


In [13]:
from transformers import DataCollatorForTokenClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)


trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3797,0.513236


TrainOutput(global_step=158, training_loss=0.5944475358045553, metrics={'train_runtime': 73.5317, 'train_samples_per_second': 68.365, 'train_steps_per_second': 2.149, 'total_flos': 128296148178000.0, 'train_loss': 0.5944475358045553, 'epoch': 1.0})

In [36]:
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import torch
def validate_model(model, test_loader, tag2idx, device):
    model.eval()
    true_labels, pred_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device).long()
            attention_mask = batch['attention_mask'].to(device).long()
            labels = batch['labels'].to(device).long()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=2).cpu().numpy()

            true_labels.extend(labels.cpu().numpy())
            pred_labels.extend(predicted_labels)

    true_labels = np.array(true_labels).flatten()
    pred_labels = np.array(pred_labels).flatten()

    print(classification_report(true_labels, pred_labels, target_names=list(tag2idx.keys())))

tag2idx = train_dataset.tag2idx

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

validate_model(model, test_loader, tag2idx, device)


100%|██████████| 94/94 [00:12<00:00,  7.62it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                       precision    recall  f1-score   support

B-SYMPTOM_AND_DISEASE       0.00      0.00      0.00      1128
           B-LOCATION       0.64      0.13      0.21      4354
       B-ORGANIZATION       0.00      0.00      0.00       715
               B-NAME       0.00      0.00      0.00       309
     I-TRANSPORTATION       0.00      0.00      0.00        68
       I-ORGANIZATION       0.83      0.03      0.06      1898
                B-JOB       0.00      0.00      0.00       169
                I-JOB       0.00      0.00      0.00       114
                    O       0.89      0.99      0.94    128574
               I-DATE       0.71      0.67      0.69      1639
                B-AGE       0.00      0.00      0.00       551
     B-TRANSPORTATION       0.00      0.00      0.00       190
         B-PATIENT_ID       0.00      0.00      0.00      1930
         I-PATIENT_ID       0.00      0.00      0.00        27
I-SYMPTOM_AND_DISEASE       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
