# 1. Import và cài đặt

In [1]:
!pip install transformers -q
!pip install vncorenlp -q
!pip install pyyaml -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/2.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.6/2.6 MB[0m [31m52.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone


In [2]:
# Mount Google Drive at /content/drive. The config, dataset, checkpoint and log
# paths used later in this notebook all point below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import time
from argparse import Namespace

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import yaml
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

In [4]:
def load_config(config_path: str) -> Namespace:
    """Load a YAML file and expose its contents through attribute access.

    Mappings are converted recursively into ``argparse.Namespace`` objects
    (including mappings nested inside lists), lists stay lists, and scalar
    values are returned unchanged, so a value can be read as
    ``cfg.training.epochs`` instead of ``cfg["training"]["epochs"]``.

    Args:
        config_path: Path of the YAML configuration file.

    Returns:
        The converted configuration. An empty file yields an empty
        ``Namespace`` rather than ``None``.

    Raises:
        OSError: If the file cannot be opened.
        TypeError: If a mapping contains a non-string key, because keys are
            passed to ``Namespace`` as keyword arguments.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        cfg_dict = yaml.safe_load(f)

    # An empty YAML document parses to None; normalise it so the declared
    # return type holds.
    if cfg_dict is None:
        cfg_dict = {}

    def dict_to_namespace(d):
        """Recursively turn dicts into Namespaces, walking through lists."""
        if isinstance(d, dict):
            return Namespace(**{k: dict_to_namespace(v) for k, v in d.items()})
        elif isinstance(d, list):
            return [dict_to_namespace(x) for x in d]
        else:
            return d

    return dict_to_namespace(cfg_dict)



# 2. Data

In [5]:
class PhoBERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of integer class labels, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` whose result
            provides ``'input_ids'`` and ``'attention_mask'`` tensors
            (presumably a Hugging Face tokenizer).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors plus the label.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            axis removed) and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis when max_len == 1 and hand the DataLoader a 0-d
        # tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


# 3. PhoBERT

In [6]:
import torch.nn as nn
from transformers import AutoModel

class PhoBERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a single linear layer.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes (width of the linear head).
        dropout_rate: Dropout probability applied before the linear head.
        freeze_phobert: When true, gradients are disabled for every encoder
            parameter so that only the linear head is trained.
    """

    def __init__(self, model_name="vinai/phobert-base", num_labels=2, dropout_rate=0.3, freeze_phobert=False):
        super().__init__()

        encoder = AutoModel.from_pretrained(model_name)
        if freeze_phobert:
            # Disable gradients on all encoder weights in one call.
            encoder.requires_grad_(False)
            print("=> PhoBERT đã được freeze. Chỉ huấn luyện classifier.")
        self.phobert = encoder

        self.dropout = nn.Dropout(dropout_rate)
        # The head's input width follows the encoder's own hidden size.
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return the classification logits for a batch of token ids."""
        encoded = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 (the [CLS]-style first token) as
        # the representation of the whole sequence.
        first_token = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token))
        return logits


# 4. Trainer

In [7]:
def train_model(model, train_loader, val_loader, config):
    """Train ``model`` with AdamW and cross-entropy, early-stopping on validation accuracy.

    After every epoch the model is scored with ``evaluate_model`` on
    ``val_loader``, a row of metrics is appended to the log, and the log is
    rewritten to ``config.training.log_path``. The state dict is saved to
    ``config.training.checkpoint_path`` whenever validation accuracy improves.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        val_loader: Iterable of batches, forwarded to ``evaluate_model``.
        config: Namespace providing ``training.epochs``,
            ``training.checkpoint_path``, ``training.log_path`` and
            ``training.early_stopping.patience``. ``training.learning_rate``
            is optional and defaults to ``2e-5``.

    Returns:
        None. Results are written to the checkpoint and log files.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # The learning rate used to be hard-coded; the old value is the default so
    # existing configs behave exactly as before.
    learning_rate = getattr(config.training, "learning_rate", 2e-5)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    # Create the output folders up front so a missing directory does not
    # surface only after the first (expensive) epoch has finished.
    for out_path in (config.training.checkpoint_path, config.training.log_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    best_val_acc = 0
    patience_counter = 0
    logs = []

    for epoch in range(config.training.epochs):
        start_time = time.time()

        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch in tqdm(train_loader, desc=f"[Epoch {epoch+1}] Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        train_acc = accuracy_score(all_labels, all_preds)
        # evaluate_model switches the model to eval mode; model.train() at the
        # top of the loop switches it back for the next epoch.
        val_acc, val_f1, val_prec, val_rec, val_auc = evaluate_model(model, val_loader)

        elapsed_time = time.time() - start_time
        logs.append({
            'epoch': epoch + 1,
            'time': round(elapsed_time, 2),
            'train_acc': round(train_acc, 4),
            'val_acc': round(val_acc, 4),
            'f1': round(val_f1, 4),
            'precision': round(val_prec, 4),
            'recall': round(val_rec, 4),
            'auc': round(val_auc, 4),
        })

        print(f"[Epoch {epoch+1}] Time: {elapsed_time:.2f}s - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f} - F1: {val_f1:.4f} - Precision: {val_prec:.4f} - Recall: {val_rec:.4f} - AUC: {val_auc:.4f}")

        # Write the log BEFORE the early-stopping check: the `break` below used
        # to skip this step, so the final epoch never reached the CSV.
        pd.DataFrame(logs).to_csv(config.training.log_path, index=False)

        # Keep the checkpoint with the best validation accuracy.
        # NOTE(review): accuracy plateaus when the model predicts only the
        # majority class; consider selecting on macro-F1 or AUC instead.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), config.training.checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= config.training.early_stopping.patience:
                print("=> Early stopping.")
                break

# 5. Evaluator

In [8]:
import time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report

def evaluate_model(model, data_loader):
    """Score ``model`` on ``data_loader`` and print a classification report.

    The model is put into eval mode (and left there) and run without gradient
    tracking on whatever device its first parameter lives on.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        Tuple ``(accuracy, f1, precision, recall, auc)``. F1, precision and
        recall are macro-averaged. AUC is computed from the softmax
        probability of class index 1 and is ``0.0`` when scikit-learn rejects
        the inputs with a ``ValueError``.
    """
    model.eval()
    device = next(model.parameters()).device
    all_preds, all_labels, all_probs = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            # Column 1 is used as the positive-class score — assumes a binary
            # task with label 1 as the positive class.
            probs = torch.softmax(outputs, dim=1)[:, 1]
            preds = torch.argmax(outputs, dim=1)

            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    rec = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    try:
        auc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Raised when ROC AUC is undefined for these inputs (e.g. only one
        # class in the labels). Catch just that case: a bare `except` would
        # also hide real bugs and swallow KeyboardInterrupt.
        auc = 0.0

    print(classification_report(all_labels, all_preds, digits=4, zero_division=0))
    return acc, f1, prec, rec, auc

# 6. Chạy

In [9]:
%%writefile "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/config_phobert_pretrain1.yaml"

# Experiment configuration for the PhoBERT classifier.
# NOTE(review): the loading cell further down opens `config_phobert.yaml`, not
# the file written here — confirm which file is meant to drive the run.
# NOTE(review): `mode` is not read by any visible cell; `#train` presumably
# records the alternative value.
mode: pretrain #train
seed: 42

# CSV files; the notebook reads the `post_message` and `label` columns.
paths:
  train: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/train.csv"
  test: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/test.csv"
  val: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/val.csv"
  #train: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/warmup.csv"

preprocessing:
  phobert:
    # Passed to PhoBERTDataset as max_len (pad/truncate length in tokens).
    max_sequence_length: 256

models:
  phobert:
    model_name: "vinai/phobert-base"
    # Not read by the visible code: PhoBERTClassifier sizes its linear head
    # from the loaded encoder's own config.hidden_size.
    hidden_size: 768
    dropout_rate: 0.3
    num_labels: 2

training:
  batch_size: 32
  epochs: 10
  # Where train_model saves the state dict of the best epoch.
  checkpoint_path: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/PhoBert/checkpoints/phobert_pretrain1.pt"
  # Per-epoch metrics CSV written by train_model.
  log_path: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/PhoBert/logs/phobert_training_pretrain1.csv"
  early_stopping:
    # Number of consecutive epochs without a validation-accuracy improvement
    # after which training stops.
    patience: 3
  # Not read by train_model, which picks "cuda" when available and "cpu" otherwise.
  device: auto


Writing /content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/config_phobert_train1.yaml


In [11]:
# Load the configuration.
# NOTE(review): this opens `config_phobert.yaml`, while the %%writefile cell
# above produces `config_phobert_pretrain1.yaml` — confirm which file is
# intended before trusting the results below.
config = load_config("/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/config_phobert.yaml")

# Apply the configured seed (if any) so that the classifier initialisation and
# the DataLoader shuffling are repeatable between runs.
seed = getattr(config, "seed", None)
if seed is not None:
    np.random.seed(seed)
    torch.manual_seed(seed)

# Load data
train_df = pd.read_csv(config.paths.train)
val_df = pd.read_csv(config.paths.val)

# Take the tokenizer from the same configured checkpoint as the model so the
# two cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(config.models.phobert.model_name)

max_len = config.preprocessing.phobert.max_sequence_length
train_dataset = PhoBERTDataset(train_df['post_message'].tolist(), train_df['label'].tolist(), tokenizer, max_len)
val_dataset = PhoBERTDataset(val_df['post_message'].tolist(), val_df['label'].tolist(), tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=config.training.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.training.batch_size)

# Train model: the encoder is frozen, so only the linear head is optimised.
model = PhoBERTClassifier(
    model_name=config.models.phobert.model_name,
    num_labels=config.models.phobert.num_labels,
    # Honour the configured dropout; fall back to the class default if absent.
    dropout_rate=getattr(config.models.phobert, "dropout_rate", 0.3),
    freeze_phobert=True,
)
train_model(model, train_loader, val_loader, config)


=> PhoBERT đã được freeze. Chỉ huấn luyện classifier.


[Epoch 1] Training: 100%|██████████| 274/274 [02:10<00:00,  2.10it/s]


              precision    recall  f1-score   support

           0     0.8313    1.0000    0.9079       404
           1     0.0000    0.0000    0.0000        82

    accuracy                         0.8313       486
   macro avg     0.4156    0.5000    0.4539       486
weighted avg     0.6910    0.8313    0.7547       486

[Epoch 1] Time: 137.04s - Train Acc: 0.7426 - Val Acc: 0.8313 - F1: 0.4539 - Precision: 0.4156 - Recall: 0.5000 - AUC: 0.4505


[Epoch 2] Training: 100%|██████████| 274/274 [02:05<00:00,  2.19it/s]


              precision    recall  f1-score   support

           0     0.8313    1.0000    0.9079       404
           1     0.0000    0.0000    0.0000        82

    accuracy                         0.8313       486
   macro avg     0.4156    0.5000    0.4539       486
weighted avg     0.6910    0.8313    0.7547       486

[Epoch 2] Time: 131.60s - Train Acc: 0.8307 - Val Acc: 0.8313 - F1: 0.4539 - Precision: 0.4156 - Recall: 0.5000 - AUC: 0.5936


[Epoch 3] Training: 100%|██████████| 274/274 [02:03<00:00,  2.22it/s]


              precision    recall  f1-score   support

           0     0.8313    1.0000    0.9079       404
           1     0.0000    0.0000    0.0000        82

    accuracy                         0.8313       486
   macro avg     0.4156    0.5000    0.4539       486
weighted avg     0.6910    0.8313    0.7547       486

[Epoch 3] Time: 129.70s - Train Acc: 0.8317 - Val Acc: 0.8313 - F1: 0.4539 - Precision: 0.4156 - Recall: 0.5000 - AUC: 0.7020


[Epoch 4] Training: 100%|██████████| 274/274 [02:03<00:00,  2.23it/s]


              precision    recall  f1-score   support

           0     0.8313    1.0000    0.9079       404
           1     0.0000    0.0000    0.0000        82

    accuracy                         0.8313       486
   macro avg     0.4156    0.5000    0.4539       486
weighted avg     0.6910    0.8313    0.7547       486

[Epoch 4] Time: 129.54s - Train Acc: 0.8317 - Val Acc: 0.8313 - F1: 0.4539 - Precision: 0.4156 - Recall: 0.5000 - AUC: 0.7626
=> Early stopping.


In [12]:
# Reload the tokenizer and build a fresh (un-frozen) classifier to hold the weights
tokenizer = AutoTokenizer.from_pretrained(config.models.phobert.model_name)
model = PhoBERTClassifier(
    model_name=config.models.phobert.model_name,
    num_labels=config.models.phobert.num_labels
)

# Load the best checkpoint saved during training
model.load_state_dict(torch.load(config.training.checkpoint_path, map_location='cpu'))

# Move the model to the GPU when one is available: evaluate_model runs on the
# device of the model's parameters, so without this step the whole test pass
# runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare the test data
test_df = pd.read_csv(config.paths.test)
test_dataset = PhoBERTDataset(
    texts=test_df['post_message'].tolist(),
    labels=test_df['label'].tolist(),
    tokenizer=tokenizer,
    max_len=config.preprocessing.phobert.max_sequence_length
)
test_loader = DataLoader(test_dataset, batch_size=config.training.batch_size)

# Run the evaluation
acc, f1, precision, recall, auc = evaluate_model(model, test_loader)

print(f"\n✅ Test Accuracy: {acc:.4f}")
print(f"✅ Test F1 (macro): {f1:.4f}")
print(f"✅ Test Precision: {precision:.4f}")
print(f"✅ Test Recall: {recall:.4f}")
print(f"✅ Test AUC: {auc:.4f}")


              precision    recall  f1-score   support

           0     0.8313    1.0000    0.9079       404
           1     0.0000    0.0000    0.0000        82

    accuracy                         0.8313       486
   macro avg     0.4156    0.5000    0.4539       486
weighted avg     0.6910    0.8313    0.7547       486


✅ Test Accuracy: 0.8313
✅ Test F1 (macro): 0.4539
✅ Test Precision: 0.4156
✅ Test Recall: 0.5000
✅ Test AUC: 0.5215
