### 1. Task 1

Read data:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def read_conllu(path):
    sentences = []
    with open(path, encoding='utf-8') as f:
        tokens = []
        for line in f:
            line = line.rstrip('\n')
            if not line:
                if tokens:
                    sentences.append(tokens)
                tokens = []
                continue
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if len(cols) != 10:
                raise ValueError("Expected 10 columns, got %d: %r" % (len(cols), line))
            token = (cols[1], cols[3])
            tokens.append(token)
    return sentences

In [3]:
train_path = '/content/drive/MyDrive/UD_English-EWT/en_ewt-ud-train.conllu'
dev_path = '/content/drive/MyDrive/UD_English-EWT/en_ewt-ud-dev.conllu'
test_path = '/content/drive/MyDrive/UD_English-EWT/en_ewt-ud-test.conllu'

train_sentences = read_conllu(train_path)
dev_sentences = read_conllu(dev_path)
test_sentences = read_conllu(test_path)

print(f'Train samples: {len(train_sentences)}')
print(f'Dev samples: {len(dev_sentences)}')
print(f'Test sentences: {len(test_sentences)}')

Train samples: 12544
Dev samples: 2001
Test sentences: 2077


Build vocab:

In [4]:
pad_token_id = 0
unk_token_id = 1

# Build vocabularies from train_sentences (each token is (form, upos))
words = set()
tags = set()
for sent in train_sentences:
    for form, upos in sent:
        words.add(form)
        tags.add(upos)

# word2idx with special <UNK> and <PAD>
word2idx = {'<UNK>': unk_token_id, '<PAD>': pad_token_id}
word2idx.update({word: idx for idx, word in zip(range(2, len(words)+2), words)})

# tag_to_ix mapping
tag2idx = {tag: i for i, tag in enumerate(sorted(tags))}

print(f"word2idx size (including <UNK>): {len(word2idx)}")
print(f"tag2idx size: {len(tag2idx)}")

word2idx size (including <UNK>): 20202
tag2idx size: 18


### Task 2

Định nghĩa dataset:

In [5]:
from torch.utils.data import Dataset
import torch

class POSDataset(Dataset):
    def __init__(self, sentences, word2idx, tag2idx):
        super().__init__()
        self.sentences = sentences
        self.word2idx = word2idx
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tokens, tags = zip(*sentence)
        # lấy ra tokens ids cho các câu, xử lí out of vocabulary
        sentence_indices = torch.tensor([self.word2idx[token] if token in word2idx.keys() else word2idx['<UNK>'] for token in tokens ], dtype=torch.long)
        tag_indices = torch.tensor([self.tag2idx[tag] for tag in tags], dtype=torch.long)
        return sentence_indices, tag_indices

Định nghĩa collate_fn có nhiệm vụ xử lí padding và khởi tạo data loader:

In [6]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn_with_padding(batch):
    input_ids, tag_ids = zip(*batch)
    padded_sequences = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    padded_tags = pad_sequence(tag_ids, batch_first=True, padding_value=-100) # -100 để bỏ qua khi tính loss

    return padded_sequences, padded_tags

train_ds = POSDataset(train_sentences, word2idx, tag2idx)
dev_ds = POSDataset(dev_sentences, word2idx, tag2idx)
test_ds = POSDataset(test_sentences, word2idx, tag2idx)

batch_size = 32
num_workers = 2
train_loader = DataLoader(
    train_ds, batch_size=batch_size, collate_fn=collate_fn_with_padding,
    shuffle=True, generator=torch.Generator().manual_seed(42),
    num_workers=num_workers
)
dev_loader = DataLoader(
    dev_ds, batch_size=batch_size,
    collate_fn=collate_fn_with_padding, shuffle=False, num_workers=num_workers
)
test_loader = DataLoader(
    test_ds, batch_size=batch_size, collate_fn=collate_fn_with_padding,
    shuffle=False, num_workers=num_workers
)

### Task 3

Định nghĩa mô hình:

In [7]:
import torch.nn as nn

class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, dropout_p=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_size ,batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_seqs):
        emb = self.dropout(self.embedding(input_seqs))
        output, _ = self.rnn(emb) # (batch, seq len, hidden)
        logits = self.fc(output) # fc tự động áp dụng lên dim cuối cùng
        return logits # (batch, seq len, num_classes)


### Task 4 + 5

Khởi tạo mô hình, optimizer và loss function:

In [11]:
import torch.optim as optim
import torch.nn as nn
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

vocab_size = len(word2idx)
embedding_dim = 128
hidden_size = 256
num_classes = len(tag2idx)
model = SimpleRNNForTokenClassification(vocab_size, embedding_dim, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

Huấn luyện mô hình:

In [12]:
import torch
from tqdm import tqdm

def train_one_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    pbar = tqdm(data_loader, desc='Training', total=len(data_loader))

    for input_ids, tag_ids in pbar:
        input_ids = input_ids.to(device)
        tag_ids = tag_ids.to(device)

        optimizer.zero_grad()

        logits = model(input_ids).view(-1, model.num_classes)
        targets = tag_ids.view(-1)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    epoch_loss = total_loss / len(data_loader)
    return epoch_loss

@torch.no_grad()
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    pbar = tqdm(data_loader, desc='Evaluating', total=len(data_loader))
    total_correct = 0
    total_token = 0

    for input_ids, tag_ids in pbar:
        input_ids = input_ids.to(device)
        tag_ids = tag_ids.to(device)

        logits = model(input_ids).view(-1, model.num_classes)
        targets = tag_ids.view(-1)
        loss = criterion(logits, targets)

        total_loss += loss.item()

        # Tính accuracy
        preds = torch.argmax(logits, dim=-1)
        total_correct += (preds == targets).sum().item()
        total_token += targets.numel()

    validation_loss = total_loss / len(data_loader)
    accuracy = total_correct / total_token

    return validation_loss, accuracy

In [13]:
epochs = 20
best_accuracy = 0.0
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}:")
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, accuracy = evaluate(model, dev_loader, criterion, device)
    print(f"Train loss: {train_loss:.5f} | Val loss: {val_loss:.5f} | Accuracy: {accuracy:.5f}")

    # lưu lại mô hình tốt nhất dựa trên validation accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Best model saved with accuracy: {best_accuracy:.5f}")

    print("==="*20)


Epoch 1/20:


Training: 100%|██████████| 392/392 [00:34<00:00, 11.24it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.85it/s]


Train loss: 1.18092 | Val loss: 0.83198 | Accuracy: 0.24197
Best model saved with accuracy: 0.24197

Epoch 2/20:


Training: 100%|██████████| 392/392 [00:28<00:00, 13.82it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 68.37it/s]


Train loss: 0.76717 | Val loss: 0.65679 | Accuracy: 0.25941
Best model saved with accuracy: 0.25941

Epoch 3/20:


Training: 100%|██████████| 392/392 [00:30<00:00, 12.77it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 69.25it/s]


Train loss: 0.61563 | Val loss: 0.56186 | Accuracy: 0.26985
Best model saved with accuracy: 0.26985

Epoch 4/20:


Training: 100%|██████████| 392/392 [00:30<00:00, 12.73it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 69.07it/s]


Train loss: 0.51906 | Val loss: 0.51339 | Accuracy: 0.27720
Best model saved with accuracy: 0.27720

Epoch 5/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.28it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 69.68it/s]


Train loss: 0.45093 | Val loss: 0.46793 | Accuracy: 0.28074
Best model saved with accuracy: 0.28074

Epoch 6/20:


Training: 100%|██████████| 392/392 [00:33<00:00, 11.61it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 68.99it/s]


Train loss: 0.39659 | Val loss: 0.44228 | Accuracy: 0.28380
Best model saved with accuracy: 0.28380

Epoch 7/20:


Training: 100%|██████████| 392/392 [00:30<00:00, 12.81it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 51.15it/s]


Train loss: 0.35357 | Val loss: 0.42693 | Accuracy: 0.28567
Best model saved with accuracy: 0.28567

Epoch 8/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.41it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 69.26it/s]


Train loss: 0.31975 | Val loss: 0.44298 | Accuracy: 0.28465

Epoch 9/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.43it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 70.51it/s]


Train loss: 0.29117 | Val loss: 0.42311 | Accuracy: 0.28632
Best model saved with accuracy: 0.28632

Epoch 10/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.54it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 45.98it/s]


Train loss: 0.26824 | Val loss: 0.41789 | Accuracy: 0.28764
Best model saved with accuracy: 0.28764

Epoch 11/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.55it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 68.75it/s]


Train loss: 0.24767 | Val loss: 0.42011 | Accuracy: 0.28849
Best model saved with accuracy: 0.28849

Epoch 12/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.51it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 68.34it/s]


Train loss: 0.22841 | Val loss: 0.41916 | Accuracy: 0.28824

Epoch 13/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.59it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 47.71it/s]


Train loss: 0.21384 | Val loss: 0.41224 | Accuracy: 0.28938
Best model saved with accuracy: 0.28938

Epoch 14/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.64it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 68.12it/s]


Train loss: 0.19975 | Val loss: 0.40497 | Accuracy: 0.29035
Best model saved with accuracy: 0.29035

Epoch 15/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.63it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 71.00it/s]


Train loss: 0.18936 | Val loss: 0.41242 | Accuracy: 0.29063
Best model saved with accuracy: 0.29063

Epoch 16/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.63it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 47.61it/s]


Train loss: 0.17768 | Val loss: 0.40574 | Accuracy: 0.29104
Best model saved with accuracy: 0.29104

Epoch 17/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.50it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 67.66it/s]


Train loss: 0.16800 | Val loss: 0.43500 | Accuracy: 0.28989

Epoch 18/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.61it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 59.77it/s]


Train loss: 0.16030 | Val loss: 0.40575 | Accuracy: 0.29276
Best model saved with accuracy: 0.29276

Epoch 19/20:


Training: 100%|██████████| 392/392 [00:32<00:00, 11.98it/s]
Evaluating: 100%|██████████| 63/63 [00:01<00:00, 52.26it/s]


Train loss: 0.15341 | Val loss: 0.42207 | Accuracy: 0.29214

Epoch 20/20:


Training: 100%|██████████| 392/392 [00:31<00:00, 12.60it/s]
Evaluating: 100%|██████████| 63/63 [00:00<00:00, 66.84it/s]


Train loss: 0.14441 | Val loss: 0.40536 | Accuracy: 0.29291
Best model saved with accuracy: 0.29291


Đánh giá trên tập test:

In [18]:
test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
print('\n')
print(test_accuracy)

Evaluating: 100%|██████████| 65/65 [00:01<00:00, 48.18it/s]



0.2779067339695172





Hàm dự đoán:

In [14]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [15]:
from nltk.tokenize import word_tokenize

def predict(model, tokenizer, sentence, word2idx, idx2word, idx2tag, device):
    model.eval()
    tokens = tokenizer(sentence)
    token_ids = [word2idx[token] if token in word2idx.keys() else word2idx['<UNK>'] for token in tokens]
    input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    logits = model(input_ids)
    preds = torch.argmax(logits, dim=-1).squeeze(0)
    predicted_tags = [idx2tag[tag_id] for tag_id in preds.tolist()]
    return list(zip(tokens, predicted_tags))

sample_sentences = [
    "This is Sparta!!!!!",
    "This movie is really interesting",
    "I rate this movie ten out of ten"
]

idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

for sent in sample_sentences:
    output = predict(model, word_tokenize, sent, word2idx, idx2word, idx2tag, device)
    print(output)

[('This', 'DET'), ('is', 'AUX'), ('Sparta', 'VERB'), ('!', 'PUNCT'), ('!', 'PUNCT'), ('!', 'PUNCT'), ('!', 'PUNCT'), ('!', 'PUNCT')]
[('This', 'DET'), ('movie', 'NOUN'), ('is', 'AUX'), ('really', 'ADV'), ('interesting', 'ADJ')]
[('I', 'PRON'), ('rate', 'VERB'), ('this', 'DET'), ('movie', 'NOUN'), ('ten', 'NUM'), ('out', 'ADV'), ('of', 'ADP'), ('ten', 'NUM')]


### Kết quả:
- Dev accuracy: 29.291%
- Test accuracy: 27.79%