In [12]:
#Task 1
import torch
import torch.nn as nn

def load_conllu(file_path):
    sentences = []
    current_sentence = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
                continue
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) > 3:
                word = parts[1]
                tag = parts[3]
                current_sentence.append((word, tag))
        if current_sentence:
            sentences.append(current_sentence)
    return sentences

train_data = load_conllu("/content/en_ewt-ud-train.conllu")
dev_data = load_conllu("/content/en_ewt-ud-dev.conllu")

word_to_ix = {"<UNK>": 0}
tag_to_ix = {}

for sentence in train_data:
    for word, tag in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

print("Vocabulary size:", len(word_to_ix))
print("Tag set size:", len(tag_to_ix))
print("Example pairs:", train_data[0][:10])

Vocabulary size: 20201
Tag set size: 18
Example pairs: [('Al', 'PROPN'), ('-', 'PUNCT'), ('Zaman', 'PROPN'), (':', 'PUNCT'), ('American', 'ADJ'), ('forces', 'NOUN'), ('killed', 'VERB'), ('Shaikh', 'PROPN'), ('Abdullah', 'PROPN'), ('al', 'PROPN')]


In [13]:
#Task 2
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class POSDataset(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        words = [self.word_to_ix.get(w, self.word_to_ix["<UNK>"]) for w, _ in sentence]
        tags = [self.tag_to_ix[t] for _, t in sentence]
        return torch.tensor(words, dtype=torch.long), torch.tensor(tags, dtype=torch.long)

def collate_fn(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True)
    tags_padded = pad_sequence(tags, batch_first=True)
    return sentences_padded, tags_padded

train_dataset = POSDataset(train_data, word_to_ix, tag_to_ix)
dev_dataset = POSDataset(dev_data, word_to_ix, tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

sample_sent, sample_tag = train_dataset[0]
print("Example sentence indices:", sample_sent[:10])
print("Example tag indices:", sample_tag[:10])


Example sentence indices: tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
Example tag indices: tensor([0, 1, 0, 1, 2, 3, 4, 0, 0, 0])


In [7]:
#Task 3
class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim=128, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tag_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        logits = self.fc(output)
        return logits

model = SimpleRNNForTokenClassification(len(word_to_ix), len(tag_to_ix))
x_example, y_example = next(iter(train_loader))
logits_example = model(x_example)

print("Input batch shape:", x_example.shape)
print("Logits shape:", logits_example.shape)


Input batch shape: torch.Size([32, 48])
Logits shape: torch.Size([32, 48, 18])


In [8]:
#Task 4
model = SimpleRNNForTokenClassification(len(word_to_ix), len(tag_to_ix))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

epochs = 3
for epoch in range(epochs):
    total_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        logits = model(batch_x)
        loss = loss_fn(logits.view(-1, logits.size(-1)), batch_y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("Epoch", epoch + 1, "Loss:", total_loss / len(train_loader))

x_sample, y_sample = next(iter(dev_loader))
logits_sample = model(x_sample)
pred_sample = torch.argmax(logits_sample, dim=-1)

print("Sample input indices:", x_sample[0][:10])
print("Sample true tags:", y_sample[0][:10])
print("Sample predicted tags:", pred_sample[0][:10])


Epoch 1 Loss: 0.9703491297455467
Epoch 2 Loss: 0.4999715414430414
Epoch 3 Loss: 0.3698372770176858
Sample input indices: tensor([1559,   13, 9360, 3111,   74,  757,    4,    0,    0,    0])
Sample true tags: tensor([6, 5, 0, 4, 5, 3, 1, 0, 0, 0])
Sample predicted tags: tensor([6, 5, 3, 3, 5, 3, 1, 4, 4, 4])


In [9]:
#Task 5
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            logits = model(batch_x)
            preds = torch.argmax(logits, dim=-1)
            mask = batch_y != 0
            correct += (preds[mask] == batch_y[mask]).sum().item()
            total += mask.sum().item()
    return correct / total

dev_acc = evaluate(model, dev_loader)
print("Dev accuracy:", dev_acc)

x_sample, y_sample = next(iter(dev_loader))
logits_sample = model(x_sample)
pred_sample = torch.argmax(logits_sample, dim=-1)

print("Example true tags:", y_sample[0][:10])
print("Example predicted tags:", pred_sample[0][:10])


Dev accuracy: 0.8633087460484721
Example true tags: tensor([6, 5, 0, 4, 5, 3, 1, 0, 0, 0])
Example predicted tags: tensor([6, 5, 3, 3, 5, 3, 1, 4, 4, 4])


In [14]:
best_dev_acc = 0.0
train_acc_list = []
dev_acc_list = []

model = SimpleRNNForTokenClassification(len(word_to_ix), len(tag_to_ix))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_correct = 0
    total_tokens = 0

    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        logits = model(batch_x)

        loss = loss_fn(logits.view(-1, logits.size(-1)), batch_y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=-1)
        mask = batch_y != 0
        total_correct += (preds[mask] == batch_y[mask]).sum().item()
        total_tokens += mask.sum().item()

    train_acc = total_correct / total_tokens
    train_acc_list.append(train_acc)

    dev_acc = evaluate(model, dev_loader)
    dev_acc_list.append(dev_acc)

    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        torch.save(model.state_dict(), "best_model.pt")

    print(f"Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f} | Train Acc={train_acc:.4f} | Dev Acc={dev_acc:.4f}")


Epoch 1: Loss=0.9911 | Train Acc=0.7048 | Dev Acc=0.8074
Epoch 2: Loss=0.5086 | Train Acc=0.8412 | Dev Acc=0.8508
Epoch 3: Loss=0.3749 | Train Acc=0.8818 | Dev Acc=0.8724


In [15]:
model.load_state_dict(torch.load("best_model.pt"))
final_dev_acc = evaluate(model, dev_loader)
print("Final Dev Accuracy:", final_dev_acc)


Final Dev Accuracy: 0.8724130663856692


In [18]:
def predict_sentence(sentence):
    model.eval()
    words = sentence.split()
    indices = [word_to_ix.get(w, word_to_ix["<UNK>"]) for w in words]
    x = torch.tensor(indices).unsqueeze(0)

    with torch.no_grad():
        logits = model(x)
        preds = torch.argmax(logits, dim=-1).squeeze(0)

    ix2tag = {v: k for k, v in tag_to_ix.items()}
    result = [(w, ix2tag[p.item()]) for w, p in zip(words, preds)]
    return result
print(predict_sentence("i love NLP"))


[('i', 'PRON'), ('love', 'VERB'), ('NLP', 'VERB')]


BÁO CÁO KẾT QUẢ:

• Vocabulary size: 20201
• Tag set size: 18

Mô hình RNN được huấn luyện 3 epoch.
Sau mỗi epoch,hàm loss, độ chính xác trên tập train và dev như sau:

Epoch 1: Loss=0.9911 | Train Acc=0.7048 | Dev Acc=0.8074
Epoch 2: Loss=0.5086 | Train Acc=0.8412 | Dev Acc=0.8508
Epoch 3: Loss=0.3749 | Train Acc=0.8818 | Dev Acc=0.8724

Dựa trên độ chính xác trên tập dev, mô hình có điểm dev cao nhất được lưu lại.
Sau khi load mô hình tốt nhất:

Final Dev Accuracy: 0.8724130663856692

Hàm predict_sentence(sentence) được viết để dự đoán nhãn UPOS cho câu mới.


KẾT QUẢ THỰC HIỆN :
- Độ chính xác trên tập dev : 0.8724...
- Ví dụ dự đoán câu mới :
  + Câu "i love NLP"
  + Dự đoán : [('i', 'PRON'), ('love', 'VERB'), ('NLP', 'VERB')]
  + Câu "This is a test sentence"
  + Dự đoán : [('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('test', 'NOUN'), ('sentence', 'VERB')]