# 1. Import và cài đặt

In [1]:
# Install the third-party packages for this notebook; -q is pip's quiet flag.
!pip install transformers -q
# NOTE(review): vncorenlp is installed here but is not imported by any cell shown in this notebook — confirm it is still needed.
!pip install vncorenlp -q
# pyyaml provides the `yaml` module used by load_config().
!pip install pyyaml -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fastBPE (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Requested omegaconf<2.1 from https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl (from fairseq) has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    PyYAML (>=5.1.*)
            ~~~~~~^
Please use pip<24.1 if you need to use this version.[0m[33m
Requested omegaconf<2.1 from https://files.pythonhosted.org/packages/e5/f6/043b6d255dd6fbf2025110cea35b87f4c5100a181681d8eab496269f0d5b/omegaconf-2.0.5-py3-none-any.whl (f

In [2]:
# Mount Google Drive at /content/drive; the config, dataset, checkpoint and
# log paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import yaml
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from argparse import Namespace
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [4]:
def load_config(config_path: str) -> Namespace:
    """Load a YAML file into nested ``argparse.Namespace`` objects.

    Mappings become ``Namespace`` instances (each key turns into an
    attribute), lists are converted element by element, and any other value
    is returned unchanged.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed configuration. An empty file yields an empty ``Namespace``
        so the declared return type always holds.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(config_path, 'r', encoding='utf-8') as f:
        cfg_dict = yaml.safe_load(f)

    # An empty YAML document is parsed as None; hand back an empty namespace
    # instead so attribute access fails with a clear AttributeError later.
    if cfg_dict is None:
        return Namespace()

    def dict_to_namespace(d):
        # Recurse through dicts and lists; scalars pass through untouched.
        if isinstance(d, dict):
            return Namespace(**{k: dict_to_namespace(v) for k, v in d.items()})
        elif isinstance(d, list):
            return [dict_to_namespace(x) for x in d]
        else:
            return d

    return dict_to_namespace(cfg_dict)



# 2. Data

In [5]:
class PhoBERTDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with ``padding='max_length'``,
            ``truncation=True``, ``max_length`` and ``return_tensors='pt'``.
        max_len: Length every sequence is padded / truncated to.

    NOTE(review): the PhoBERT model card asks for word-segmented input and no
    segmentation step appears in this notebook — confirm the texts are already
    segmented.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for sample ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis when max_len == 1 and break batch collation.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


# 3. PhoBERT

In [6]:
import torch.nn as nn
from transformers import AutoModel

class PhoBERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a single linear layer.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_labels: Size of the classifier output.
        dropout_rate: Dropout probability applied before the linear layer.
        freeze_phobert: When true, every encoder parameter has
            ``requires_grad`` switched off, leaving only the head trainable.
    """

    def __init__(self, model_name="vinai/phobert-base", num_labels=2, dropout_rate=0.3, freeze_phobert=False):
        super().__init__()

        encoder = AutoModel.from_pretrained(model_name)
        self.phobert = encoder

        if freeze_phobert:
            for weight in encoder.parameters():
                weight.requires_grad = False
            print("=> PhoBERT đã được freeze. Chỉ huấn luyện classifier.")

        hidden_dim = encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return the linear layer's output for the first token of each sequence."""
        encoded = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token))
        return logits


# 4. Trainer

In [7]:
def train_model(model, train_loader, val_loader, config):
    """Fine-tune ``model`` with AdamW and cross-entropy, validating every epoch.

    After each epoch the model is scored with ``evaluate_model`` on
    ``val_loader``; the state dict is saved whenever validation accuracy
    improves, and training stops once accuracy has failed to improve for
    ``config.training.early_stopping.patience`` consecutive epochs. The
    per-epoch metrics are rewritten to ``config.training.log_path`` after
    every epoch, including the epoch that triggers early stopping.

    This function relies on the module-level names ``time`` and
    ``evaluate_model``, which this notebook defines in a later cell; that cell
    must have run before ``train_model`` is called.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is fed to ``nn.CrossEntropyLoss``.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        val_loader: Iterable with the same batch layout, used for validation.
        config: Object exposing ``training.epochs``,
            ``training.checkpoint_path``, ``training.log_path`` and
            ``training.early_stopping.patience``. An optional
            ``training.learning_rate`` overrides the default of 2e-5.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Optional override; configs without the key keep the previous 2e-5.
    learning_rate = getattr(config.training, "learning_rate", 2e-5)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    # Create the output folders up front so a missing directory cannot abort
    # the run after the first (expensive) epoch.
    for out_path in (config.training.checkpoint_path, config.training.log_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    best_val_acc = 0
    patience_counter = 0
    logs = []

    for epoch in range(config.training.epochs):
        start_time = time.time()

        # evaluate_model() switches to eval mode, so re-enable training mode each epoch.
        model.train()
        total_loss = 0.0
        num_batches = 0
        all_preds, all_labels = [], []

        for batch in tqdm(train_loader, desc=f"[Epoch {epoch+1}] Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        train_acc = accuracy_score(all_labels, all_preds)
        train_loss = total_loss / num_batches if num_batches else 0.0
        val_acc, val_f1, val_prec, val_rec, val_auc = evaluate_model(model, val_loader)

        elapsed_time = time.time() - start_time
        logs.append({
            'epoch': epoch + 1,
            'time': round(elapsed_time, 2),
            'train_acc': round(train_acc, 4),
            'val_acc': round(val_acc, 4),
            'f1': round(val_f1, 4),
            'precision': round(val_prec, 4),
            'recall': round(val_rec, 4),
            'auc': round(val_auc, 4),
            # Mean batch loss; appended last so existing columns keep their order.
            'train_loss': round(train_loss, 4),
        })

        print(f"[Epoch {epoch+1}] Time: {elapsed_time:.2f}s - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f} - F1: {val_f1:.4f} - Precision: {val_prec:.4f} - Recall: {val_rec:.4f} - AUC: {val_auc:.4f}")

        # Save the best model (strict improvement only).
        improved = val_acc > best_val_acc
        if improved:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), config.training.checkpoint_path)
        else:
            patience_counter += 1

        # Write the log BEFORE the early-stopping check so the final epoch is
        # not missing from the CSV when training stops early.
        pd.DataFrame(logs).to_csv(config.training.log_path, index=False)

        if not improved and patience_counter >= config.training.early_stopping.patience:
            print("=> Early stopping.")
            break

# 5. Evaluator

In [8]:
import time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report

def evaluate_model(model, data_loader):
    """Score ``model`` on ``data_loader`` and print a classification report.

    The model is switched to eval mode and is left in that mode on return.
    Batches are moved to the device that holds the model's parameters.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is passed through softmax / argmax over dimension 1.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        Tuple ``(accuracy, macro_f1, macro_precision, macro_recall, auc)``.
        The AUC is computed from the softmax probability of class index 1, so
        it is only meaningful for binary classification; it is reported as
        ``0.0`` when ``roc_auc_score`` rejects the inputs with ``ValueError``.
    """
    model.eval()
    device = next(model.parameters()).device
    all_preds, all_labels, all_probs = [], [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            # Probability assigned to class index 1, used as the ROC score.
            probs = torch.softmax(outputs, dim=1)[:, 1]
            preds = torch.argmax(outputs, dim=1)

            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    rec = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    try:
        auc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Only the "AUC is undefined for these labels" case is swallowed
        # (e.g. a single class in the labels, depending on the scikit-learn
        # version); interrupts and genuine bugs now propagate.
        auc = 0.0

    print(classification_report(all_labels, all_preds, digits=4, zero_division=0))
    return acc, f1, prec, rec, auc

# 6. Chạy

In [9]:
%%writefile "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/config_phobert_train1.yaml"

# Experiment configuration written to Google Drive by this cell.
# NOTE(review): make sure the cell that calls load_config() opens this exact file name.

# NOTE(review): `mode` is not read by any cell shown in this notebook.
mode: train #pretrain #train
seed: 42

# CSV splits; the notebook reads the `post_message` and `label` columns from them.
paths:
  train: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/train.csv"
  test: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/test.csv"
  val: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/val.csv"
  #train: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/reintel_dataset/warmup.csv"

preprocessing:
  phobert:
    # Passed to PhoBERTDataset as max_len (the padding / truncation length).
    max_sequence_length: 256

models:
  phobert:
    model_name: "vinai/phobert-base"
    # NOTE(review): hidden_size is not read by the cells shown here; PhoBERTClassifier
    # takes the hidden size from the loaded model's own config.
    hidden_size: 768
    dropout_rate: 0.3
    num_labels: 2

training:
  # Batch size for the train, validation and test DataLoaders.
  batch_size: 32
  # Upper bound on epochs; early stopping may end training sooner.
  epochs: 10
  # Where train_model saves the state dict each time validation accuracy improves.
  checkpoint_path: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/PhoBert/checkpoints/phobert_train1.pt"
  # Per-epoch metrics CSV written by train_model.
  log_path: "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/PhoBert/logs/phobert_training_train1.csv"
  early_stopping:
    # Consecutive epochs without a validation-accuracy improvement before stopping.
    patience: 3
  # NOTE(review): `device` is not read by the cells shown here; train_model uses CUDA whenever it is available.
  device: auto


Overwriting /content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/config_phobert.yaml


In [10]:
# Load the configuration written by the %%writefile cell above. The path must
# match that cell's target (config_phobert_train1.yaml); pointing at another
# file would silently train with a stale or missing configuration.
CONFIG_PATH = "/content/drive/MyDrive/1.PROJECTS/[TeamPe][BIT][BGRA2025]/NLP/Code/config_phobert_train1.yaml"
config = load_config(CONFIG_PATH)

# Seed the RNGs so shuffling, dropout and classifier initialisation are repeatable.
torch.manual_seed(config.seed)
np.random.seed(config.seed)

# Load data
train_df = pd.read_csv(config.paths.train)
val_df = pd.read_csv(config.paths.val)

# Use the model name from the config so tokenizer and encoder always match.
tokenizer = AutoTokenizer.from_pretrained(config.models.phobert.model_name)

train_dataset = PhoBERTDataset(train_df['post_message'].tolist(), train_df['label'].tolist(), tokenizer, config.preprocessing.phobert.max_sequence_length)
val_dataset = PhoBERTDataset(val_df['post_message'].tolist(), val_df['label'].tolist(), tokenizer, config.preprocessing.phobert.max_sequence_length)

train_loader = DataLoader(train_dataset, batch_size=config.training.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.training.batch_size)

# Train model (dropout now comes from the config instead of the class default)
model = PhoBERTClassifier(
    model_name=config.models.phobert.model_name,
    num_labels=config.models.phobert.num_labels,
    dropout_rate=config.models.phobert.dropout_rate,
)
train_model(model, train_loader, val_loader, config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

[Epoch 1] Training:   0%|          | 0/274 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

[Epoch 1] Training: 100%|██████████| 274/274 [06:22<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9583    0.9678    0.9631       404
           1     0.8333    0.7927    0.8125        82

    accuracy                         0.9383       486
   macro avg     0.8958    0.8803    0.8878       486
weighted avg     0.9372    0.9383    0.9377       486

[Epoch 1] Time: 390.02s - Train Acc: 0.8848 - Val Acc: 0.9383 - F1: 0.8878 - Precision: 0.8958 - Recall: 0.8803 - AUC: 0.9736


[Epoch 2] Training: 100%|██████████| 274/274 [06:25<00:00,  1.41s/it]


              precision    recall  f1-score   support

           0     0.9410    0.9876    0.9638       404
           1     0.9194    0.6951    0.7917        82

    accuracy                         0.9383       486
   macro avg     0.9302    0.8414    0.8777       486
weighted avg     0.9374    0.9383    0.9347       486

[Epoch 2] Time: 392.41s - Train Acc: 0.9441 - Val Acc: 0.9383 - F1: 0.8777 - Precision: 0.9302 - Recall: 0.8414 - AUC: 0.9777


[Epoch 3] Training: 100%|██████████| 274/274 [06:23<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9659    0.9802    0.9730       404
           1     0.8947    0.8293    0.8608        82

    accuracy                         0.9547       486
   macro avg     0.9303    0.9047    0.9169       486
weighted avg     0.9539    0.9547    0.9540       486

[Epoch 3] Time: 390.80s - Train Acc: 0.9765 - Val Acc: 0.9547 - F1: 0.9169 - Precision: 0.9303 - Recall: 0.9047 - AUC: 0.9826


[Epoch 4] Training: 100%|██████████| 274/274 [06:24<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9571    0.9950    0.9757       404
           1     0.9697    0.7805    0.8649        82

    accuracy                         0.9588       486
   macro avg     0.9634    0.8878    0.9203       486
weighted avg     0.9593    0.9588    0.9570       486

[Epoch 4] Time: 391.39s - Train Acc: 0.9865 - Val Acc: 0.9588 - F1: 0.9203 - Precision: 0.9634 - Recall: 0.8878 - AUC: 0.9855


[Epoch 5] Training: 100%|██████████| 274/274 [06:24<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9569    0.9901    0.9732       404
           1     0.9412    0.7805    0.8533        82

    accuracy                         0.9547       486
   macro avg     0.9491    0.8853    0.9133       486
weighted avg     0.9543    0.9547    0.9530       486

[Epoch 5] Time: 392.16s - Train Acc: 0.9915 - Val Acc: 0.9547 - F1: 0.9133 - Precision: 0.9491 - Recall: 0.8853 - AUC: 0.9789


[Epoch 6] Training: 100%|██████████| 274/274 [06:23<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9660    0.9851    0.9755       404
           1     0.9189    0.8293    0.8718        82

    accuracy                         0.9588       486
   macro avg     0.9425    0.9072    0.9236       486
weighted avg     0.9581    0.9588    0.9580       486

[Epoch 6] Time: 390.39s - Train Acc: 0.9936 - Val Acc: 0.9588 - F1: 0.9236 - Precision: 0.9425 - Recall: 0.9072 - AUC: 0.9762


[Epoch 7] Training: 100%|██████████| 274/274 [06:23<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9639    0.9926    0.9780       404
           1     0.9571    0.8171    0.8816        82

    accuracy                         0.9630       486
   macro avg     0.9605    0.9048    0.9298       486
weighted avg     0.9628    0.9630    0.9618       486

[Epoch 7] Time: 390.45s - Train Acc: 0.9943 - Val Acc: 0.9630 - F1: 0.9298 - Precision: 0.9605 - Recall: 0.9048 - AUC: 0.9851


[Epoch 8] Training: 100%|██████████| 274/274 [06:24<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9617    0.9950    0.9781       404
           1     0.9706    0.8049    0.8800        82

    accuracy                         0.9630       486
   macro avg     0.9662    0.9000    0.9291       486
weighted avg     0.9632    0.9630    0.9615       486

[Epoch 8] Time: 391.90s - Train Acc: 0.9959 - Val Acc: 0.9630 - F1: 0.9291 - Precision: 0.9662 - Recall: 0.9000 - AUC: 0.9857


[Epoch 9] Training: 100%|██████████| 274/274 [06:23<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9709    0.9926    0.9816       404
           1     0.9589    0.8537    0.9032        82

    accuracy                         0.9691       486
   macro avg     0.9649    0.9231    0.9424       486
weighted avg     0.9689    0.9691    0.9684       486

[Epoch 9] Time: 390.65s - Train Acc: 0.9913 - Val Acc: 0.9691 - F1: 0.9424 - Precision: 0.9649 - Recall: 0.9231 - AUC: 0.9904


[Epoch 10] Training: 100%|██████████| 274/274 [06:24<00:00,  1.40s/it]


              precision    recall  f1-score   support

           0     0.9757    0.9926    0.9840       404
           1     0.9600    0.8780    0.9172        82

    accuracy                         0.9733       486
   macro avg     0.9678    0.9353    0.9506       486
weighted avg     0.9730    0.9733    0.9728       486

[Epoch 10] Time: 391.26s - Train Acc: 0.9991 - Val Acc: 0.9733 - F1: 0.9506 - Precision: 0.9678 - Recall: 0.9353 - AUC: 0.9903


In [11]:
# Reload the tokenizer and rebuild the model architecture
tokenizer = AutoTokenizer.from_pretrained(config.models.phobert.model_name)
model = PhoBERTClassifier(
    model_name=config.models.phobert.model_name,
    num_labels=config.models.phobert.num_labels
)

# Load the best checkpoint saved during training
model.load_state_dict(torch.load(config.training.checkpoint_path, map_location='cpu'))

# evaluate_model() runs on whichever device holds the model's parameters, so
# move the freshly built model to the GPU when one is available instead of
# evaluating on the CPU.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Prepare the test data
test_df = pd.read_csv(config.paths.test)
test_dataset = PhoBERTDataset(
    texts=test_df['post_message'].tolist(),
    labels=test_df['label'].tolist(),
    tokenizer=tokenizer,
    max_len=config.preprocessing.phobert.max_sequence_length
)
test_loader = DataLoader(test_dataset, batch_size=config.training.batch_size)

# Run the evaluation
acc, f1, precision, recall, auc = evaluate_model(model, test_loader)

print(f"\n✅ Test Accuracy: {acc:.4f}")
print(f"✅ Test F1 (macro): {f1:.4f}")
print(f"✅ Test Precision: {precision:.4f}")
print(f"✅ Test Recall: {recall:.4f}")
print(f"✅ Test AUC: {auc:.4f}")


              precision    recall  f1-score   support

           0     0.9617    0.9950    0.9781       404
           1     0.9706    0.8049    0.8800        82

    accuracy                         0.9630       486
   macro avg     0.9662    0.9000    0.9291       486
weighted avg     0.9632    0.9630    0.9615       486


✅ Test Accuracy: 0.9630
✅ Test F1 (macro): 0.9291
✅ Test Precision: 0.9662
✅ Test Recall: 0.9000
✅ Test AUC: 0.9797
