In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.optim as optim
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
!gdown 1MGOpGIl9-bBU1zbYJcD7OCE_2t3eSUsX
!gdown 1e2s-OOu18iC2dvCA6_GDfiD85PChx3R3
# !gdown

Downloading...
From: https://drive.google.com/uc?id=1MGOpGIl9-bBU1zbYJcD7OCE_2t3eSUsX
To: /content/train_word.json
100% 2.32M/2.32M [00:00<00:00, 134MB/s]
Downloading...
From: https://drive.google.com/uc?id=1e2s-OOu18iC2dvCA6_GDfiD85PChx3R3
To: /content/test_word.json
100% 1.54M/1.54M [00:00<00:00, 50.0MB/s]


In [3]:
train_word = pd.read_json(r"/content/train_word.json",  encoding='utf-8', lines = True)
test_word = pd.read_json(r"/content/test_word.json",  encoding='utf-8', lines = True)
# dev_word = pd.read_json(r"C:\Users\VIET HOANG - VTS\Downloads\PhoNER_COVID19-main\PhoNER_COVID19-main\data\word\dev_word.json",  encoding='utf-8', lines = True)

In [None]:
# thêm tập test vô word2idx

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset
from keras.preprocessing.sequence import pad_sequences

class VNerDataset(Dataset):
    def __init__(self, data_json, max_len=50, pad_token='<PAD>', unk_token='<UNK>', o_tag='O'):
        self.data = data_json
        self.max_len = max_len
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.o_tag = o_tag

        # Khởi tạo từ điển từ và nhãn
        self.vocab2idx = self.build_vocab2idx()
        self.tag2idx = self.build_tag2idx()

    def build_vocab(self):
        data = pd.read_json(r"/content/train_word.json",  encoding='utf-8', lines = True)
        vocab = set(word for sentence in data['words'] for word in sentence)
        data2 = pd.read_json(r"/content/test_word.json",  encoding='utf-8', lines = True)
        vocab2 = set(word for sentence in data2['words'] for word in sentence)
        vocab = vocab.union(vocab2)

        return vocab

    def build_tag(self):
        tags = set(tag for tags in self.data['tags'] for tag in tags)
        return tags

    def build_tag2idx(self):
        tag2idx = {tag: i for i, tag in enumerate(self.build_tag())}
        return tag2idx

    def build_vocab2idx(self):
        vocab2idx = {vocab: i for i, vocab in enumerate(self.build_vocab())}
        vocab2idx[self.pad_token] = len(vocab2idx)
        vocab2idx[self.unk_token] = len(vocab2idx) + 1
        return vocab2idx

    def encode_data(self):
        X, y = [], []
        for words, tags in zip(self.data['words'], self.data['tags']):
            word_ids = [self.vocab2idx.get(word, self.vocab2idx[self.unk_token]) for word in words]
            tag_ids = [self.tag2idx[tag] for tag in tags]

            # Padding cho mỗi câu
            word_ids = pad_sequences([word_ids], maxlen=self.max_len, padding='post', value=self.vocab2idx[self.pad_token])[0]
            tag_ids = pad_sequences([tag_ids], maxlen=self.max_len, padding='post', value=self.tag2idx[self.o_tag])[0]

            X.append(word_ids)
            y.append(tag_ids)


        return torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        words, tags = self.data['words'][idx], self.data['tags'][idx]
        word_ids = [self.vocab2idx.get(word, self.vocab2idx[self.unk_token]) for word in words]
        tag_ids = [self.tag2idx[tag] for tag in tags]

        word_ids = pad_sequences([word_ids], maxlen=self.max_len, padding='post', value=self.vocab2idx[self.pad_token])[0]
        tag_ids = pad_sequences([tag_ids], maxlen=self.max_len, padding='post', value=self.tag2idx[self.o_tag])[0]
        
        return torch.tensor(word_ids, dtype=torch.long), torch.tensor(tag_ids, dtype=torch.long)

In [None]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, num_layers=1, pad_idx=0):
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.bilstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers,
                              bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        logits = self.fc(lstm_out)
        return logits

In [6]:

train_dataset = VNerDataset(train_word, max_len=50)
test_dataset = VNerDataset(test_word, max_len=50)
# dev_dataset = VNerDataset(dev_word, max_len=50)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)

In [7]:
num_words = len(train_dataset.vocab2idx)
num_tags = len(train_dataset.tag2idx)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
print(num_words, num_tags)

7254 20


In [None]:
model1 = BiLSTM(vocab_size = num_words, embedding_dim= 50, hidden_size = 100, num_classes = num_tags).to(device)
# Initialize optimizer and loss function
optimizer = optim.Adam(model1.parameters(), lr=3e-4)  # Learning rate set to 0.0003
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(10):
    print(f"Epoch {epoch + 1}:")
    model1.train()
    for batch_item in tqdm(train_loader, desc="training"):
        word, label = batch_item
        word, label = word.to(device), label.to(device)
        
        # Forward pass
        outputs = model1(word)
        outputs = outputs.view(-1, outputs.size(-1))  # (batch_size * sequence_length, num_classes)
        labels = label.view(-1)  # (batch_size * sequence_length)

        # Tính toán loss
        loss = loss_fn(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


Epoch 1:


training: 100%|██████████| 158/158 [00:11<00:00, 13.60it/s]


Epoch 2:


training: 100%|██████████| 158/158 [00:10<00:00, 15.34it/s]


Epoch 3:


training: 100%|██████████| 158/158 [00:10<00:00, 14.70it/s]


Epoch 4:


training: 100%|██████████| 158/158 [00:11<00:00, 13.41it/s]


Epoch 5:


training: 100%|██████████| 158/158 [00:12<00:00, 13.03it/s]


Epoch 6:


training: 100%|██████████| 158/158 [00:12<00:00, 12.87it/s]


Epoch 7:


training: 100%|██████████| 158/158 [00:11<00:00, 13.24it/s]


Epoch 8:


training: 100%|██████████| 158/158 [00:11<00:00, 14.19it/s]


Epoch 9:


training: 100%|██████████| 158/158 [00:11<00:00, 13.26it/s]


Epoch 10:


training: 100%|██████████| 158/158 [00:13<00:00, 11.77it/s]


In [11]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
from sklearn.metrics import classification_report, f1_score, accuracy_score
import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

model1.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_item in test_loader:
        x_batch, y_batch = batch_item
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        outputs = model1(y_batch)

        _, predicted = torch.max(outputs, dim=2)

        mask = (y_batch != 0)
        filtered_preds = predicted[mask].cpu().numpy()
        filtered_labels = y_batch[mask].cpu().numpy()

        all_predictions.extend(filtered_preds)
        all_labels.extend(filtered_labels)

print("Classification Report:")
print(classification_report(all_labels, all_predictions, zero_division=1))

f1_micro = f1_score(all_labels, all_predictions, average='micro')
f1_macro = f1_score(all_labels, all_predictions, average='macro')
accuracy = accuracy_score(all_labels, all_predictions)

print(f"Micro F1-score: {f1_micro}")
print(f"Macro F1-score: {f1_macro}")
print(f"Overall Accuracy: {accuracy}")

Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.00      0.00       551
           2       0.00      0.00      0.00       114
           3       1.00      0.00      0.00      1128
           4       0.58      0.01      0.02      4832
           5       0.00      0.00      0.00      1930
           6       1.00      0.00      0.00       309
           7       0.00      0.00      0.00      1474
           8       0.00      0.00      0.00       715
           9       0.00      0.00      0.00      1898
          10       1.00      0.00      0.00        68
          11       1.00      0.00      0.00       427
          12       1.00      0.00      0.00        13
          13       1.00      0.00      0.00        27
          14       1.00      0.00      0.00      4354
          15       1.00      0.00      0.00       190
          16       0.86      0.80      0.83    128574
          17       0.00      0.00      0.00      1639
    

In [13]:
torch.save(model1.state_dict(), 'nlp2_model.pth')
print("mo hinh da duoc luu!")

mo hinh da duoc luu!
