In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import os

zip_path = "/content/drive/MyDrive/ai-1904-dpl-302-m-topic-sentiment-classification.zip"  

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Kiểm tra các file đã giải nén
os.listdir(extract_path)


['test.csv', 'train data.json']

Đọc file train_data.json

In [None]:
import json
import pandas as pd

# Đọc train data
with open("/content/data/train data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Tách text, sentiment, topic
texts, sentiments, topics = [], [], []
for item in train_data:
    text = item["data"]["text"]
    sent, topic_list = None, []

    for ann in item["annotations"]:
        if ann["from_name"] == "sentiment":
            sent = ann["value"]["choices"][0]
        elif ann["from_name"] == "topic":
            topic_list.extend(ann["value"]["choices"])

    if sent:
        texts.append(text)
        sentiments.append(sent)
        topics.append(topic_list)

print(f"Số mẫu huấn luyện: {len(texts)}")

# Đọc test
test_df = pd.read_csv("/content/data/test.csv")
test_texts = test_df["text"].tolist()


Số mẫu huấn luyện: 1684


 2. Khởi tạo PhoBERT + Tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

3. Xử lý label

In [None]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Cảm xúc: Tiêu cực, Trung tính, Tích cực
le_sent = LabelEncoder()
sent_labels = le_sent.fit_transform(sentiments)

# Chủ đề: multi-label
mlb_topic = MultiLabelBinarizer()
topic_labels = mlb_topic.fit_transform(topics)
topic_names = mlb_topic.classes_


4. Dataset dùng chung

In [None]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, is_multilabel=False, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.is_multilabel = is_multilabel
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.float if self.is_multilabel else torch.long)
        }


 5. Model Sentiment và Topic

In [None]:
import torch.nn as nn
from transformers import AutoModel

class PhoBERT_Classifier(nn.Module):
    def __init__(self, num_labels, is_multilabel=False):
        super().__init__()
        self.phobert = AutoModel.from_pretrained("vinai/phobert-base")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.phobert.config.hidden_size, num_labels)
        self.is_multilabel = is_multilabel

    def forward(self, input_ids, attention_mask):
        output = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = output.last_hidden_state[:, 0, :]
        cls_token = self.dropout(cls_token)
        logits = self.classifier(cls_token)
        return logits


6. Huấn luyện mô hình sentiment

In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
import copy
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Dataset và mô hình
sent_dataset = TextDataset(texts, sent_labels, tokenizer)
sent_loader = DataLoader(sent_dataset, batch_size=16, shuffle=True)
model_sent = PhoBERT_Classifier(num_labels=3).to("cuda")

# Huấn luyện
optimizer = torch.optim.AdamW(model_sent.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()
best_loss, patience = float("inf"), 0

for epoch in range(20):
    model_sent.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in sent_loader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["label"].to("cuda")

        optimizer.zero_grad()
        logits = model_sent(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Dự đoán class và gom nhãn thực tế để tính metrics
        preds = logits.argmax(dim=1).detach().cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    avg = total_loss / len(sent_loader)
    # Tính các chỉ số metrics
    acc = accuracy_score(all_labels, all_preds)
    f1_macro = f1_score(all_labels, all_preds, average="macro")
    f1_micro = f1_score(all_labels, all_preds, average="micro")
    print(f"[Sentiment Epoch {epoch+1}] Loss: {avg:.4f} | Acc: {acc:.4f} | F1-macro: {f1_macro:.4f} | F1-micro: {f1_micro:.4f}")

    if avg < best_loss:
        best_loss = avg
        best_sent = copy.deepcopy(model_sent.state_dict())
        patience = 0
    else:
        patience += 1
        if patience >= 3: break

model_sent.load_state_dict(best_sent)
print(classification_report(all_labels, all_preds, digits=4))


[Sentiment Epoch 1] Loss: 0.9179 | Acc: 0.5552 | F1-macro: 0.3169 | F1-micro: 0.5552
[Sentiment Epoch 2] Loss: 0.7392 | Acc: 0.6390 | F1-macro: 0.5836 | F1-micro: 0.6390
[Sentiment Epoch 3] Loss: 0.5984 | Acc: 0.7328 | F1-macro: 0.6990 | F1-micro: 0.7328
[Sentiment Epoch 4] Loss: 0.4347 | Acc: 0.8385 | F1-macro: 0.8219 | F1-micro: 0.8385
[Sentiment Epoch 5] Loss: 0.3020 | Acc: 0.8842 | F1-macro: 0.8755 | F1-micro: 0.8842
[Sentiment Epoch 6] Loss: 0.1908 | Acc: 0.9382 | F1-macro: 0.9342 | F1-micro: 0.9382
[Sentiment Epoch 7] Loss: 0.1727 | Acc: 0.9424 | F1-macro: 0.9362 | F1-micro: 0.9424
[Sentiment Epoch 8] Loss: 0.1003 | Acc: 0.9691 | F1-macro: 0.9656 | F1-micro: 0.9691
[Sentiment Epoch 9] Loss: 0.0695 | Acc: 0.9786 | F1-macro: 0.9775 | F1-micro: 0.9786
[Sentiment Epoch 10] Loss: 0.0629 | Acc: 0.9804 | F1-macro: 0.9781 | F1-micro: 0.9804
[Sentiment Epoch 11] Loss: 0.0451 | Acc: 0.9893 | F1-macro: 0.9886 | F1-micro: 0.9893
[Sentiment Epoch 12] Loss: 0.0599 | Acc: 0.9816 | F1-macro: 0.9

7. Huấn luyện mô hình topic (multi-label)

In [None]:
topic_dataset = TextDataset(texts, topic_labels, tokenizer, is_multilabel=True)
topic_loader = DataLoader(topic_dataset, batch_size=16, shuffle=True)
model_topic = PhoBERT_Classifier(num_labels=len(topic_names), is_multilabel=True).to("cuda")

# Huấn luyện
optimizer = torch.optim.AdamW(model_topic.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()
best_loss, patience = float("inf"), 0

for epoch in range(20):
    model_topic.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in topic_loader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["label"].to("cuda")

        optimizer.zero_grad()
        logits = model_topic(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Dự đoán xác suất, sau đó chuyển thành nhị phân
        probs = torch.sigmoid(logits).detach().cpu().numpy()
        preds = (probs >= 0.5).astype(int)
        all_preds.append(preds)
        all_labels.append(labels.detach().cpu().numpy())

    avg = total_loss / len(topic_loader)
    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_preds)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
    print(f"[Topic Epoch {epoch+1}] Loss: {avg:.4f} | Acc: {acc:.4f} | F1-macro: {f1_macro:.4f} | F1-micro: {f1_micro:.4f}")

    if avg < best_loss:
        best_loss = avg
        best_topic = copy.deepcopy(model_topic.state_dict())
        patience = 0
    else:
        patience += 1
        if patience >= 3:
            break

model_topic.load_state_dict(best_topic)
print(classification_report(y_true, y_pred, target_names=topic_names, digits=4))

[Topic Epoch 1] Loss: 0.3734 | Acc: 0.0024 | F1-macro: 0.0232 | F1-micro: 0.0271
[Topic Epoch 2] Loss: 0.2904 | Acc: 0.0018 | F1-macro: 0.0048 | F1-micro: 0.0007
[Topic Epoch 3] Loss: 0.2666 | Acc: 0.0018 | F1-macro: 0.0000 | F1-micro: 0.0000
[Topic Epoch 4] Loss: 0.2507 | Acc: 0.0018 | F1-macro: 0.0000 | F1-micro: 0.0000
[Topic Epoch 5] Loss: 0.2322 | Acc: 0.0160 | F1-macro: 0.0206 | F1-micro: 0.0392
[Topic Epoch 6] Loss: 0.2102 | Acc: 0.0819 | F1-macro: 0.0952 | F1-micro: 0.2097
[Topic Epoch 7] Loss: 0.1916 | Acc: 0.1574 | F1-macro: 0.1738 | F1-micro: 0.3628
[Topic Epoch 8] Loss: 0.1750 | Acc: 0.2316 | F1-macro: 0.2374 | F1-micro: 0.4755
[Topic Epoch 9] Loss: 0.1606 | Acc: 0.2892 | F1-macro: 0.3001 | F1-micro: 0.5614
[Topic Epoch 10] Loss: 0.1477 | Acc: 0.3314 | F1-macro: 0.3636 | F1-micro: 0.6210
[Topic Epoch 11] Loss: 0.1358 | Acc: 0.3961 | F1-macro: 0.4355 | F1-micro: 0.6783
[Topic Epoch 12] Loss: 0.1238 | Acc: 0.4477 | F1-macro: 0.4780 | F1-micro: 0.7263
[Topic Epoch 13] Loss: 0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 8. Dự đoán test + Xuất file CSV

In [None]:
model_sent.eval()
model_topic.eval()

# Sentiment
sent_preds = []
with torch.no_grad():
    for i in range(0, len(test_texts), 16):
        enc = tokenizer(test_texts[i:i+16], padding=True, truncation=True, max_length=256, return_tensors="pt")
        input_ids = enc["input_ids"].to("cuda")
        attention_mask = enc["attention_mask"].to("cuda")
        logits = model_sent(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        sent_preds.extend(preds)

pred_sentiments = le_sent.inverse_transform(sent_preds)

# Topic
topic_preds = []
with torch.no_grad():
    for i in range(0, len(test_texts), 16):
        enc = tokenizer(test_texts[i:i+16], padding=True, truncation=True, max_length=256, return_tensors="pt")
        input_ids = enc["input_ids"].to("cuda")
        attention_mask = enc["attention_mask"].to("cuda")
        logits = model_topic(input_ids, attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        labels = (probs > 0.4).astype(int)
        for row in labels:
            selected_topics = [topic_names[i] for i in range(len(topic_names)) if row[i] == 1]
            if selected_topics:
              topics_str = ";".join(selected_topics)
            else:
              topics_str = "unknown"
            topic_preds.append(topics_str)


# Gộp và xuất
submission = pd.DataFrame({
    "id": test_df["id"],
    "sentiment": pred_sentiments,
    "topic": topic_preds
})
submission.to_csv("submission.csv", index=False, encoding="utf-8-sig")

from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>