In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from google.colab import drive

In [2]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel

# Đăng nhập vào Hugging Face Hub
token = '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'
login(token=token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [5]:
# Tạo TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=4000)
tfidf_vectorizer.fit(train_texts)  # Fit trên tập huấn luyện

# Chuyển đổi dữ liệu thành TF-IDF vectors
train_tfidf_vectors = tfidf_vectorizer.transform(train_texts).toarray()
val_tfidf_vectors = tfidf_vectorizer.transform(val_texts).toarray()
test_tfidf_vectors = tfidf_vectorizer.transform(test_texts).toarray()

In [6]:
# Sử dụng mô hình từ Hugging Face
model_name = 'FPTAI/vibert-base-cased'

# Tokenizer ViSOBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/581M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at FPTAI/vibert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

In [8]:
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, tfidf_vectors):
        self.encodings = encodings
        self.labels = labels
        self.tfidf_vectors = tfidf_vectors

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        tfidf_vector = torch.tensor(self.tfidf_vectors[idx], dtype=torch.float32)
        return {'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': item['labels'], 'tfidf': tfidf_vector}

    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = CombinedDataset(train_encodings, train_labels.tolist(), train_tfidf_vectors)
val_dataset = CombinedDataset(val_encodings, val_labels.tolist(), val_tfidf_vectors)
test_dataset = CombinedDataset(test_encodings, test_labels.tolist(), test_tfidf_vectors)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [10]:
class CombinedModel(torch.nn.Module):
    def __init__(self, visobert_model, tfidf_dim):
        super(CombinedModel, self).__init__()
        self.visobert_model = visobert_model
        self.tfidf_dim = tfidf_dim
        self.fc = torch.nn.Linear(self.visobert_model.config.hidden_size + self.tfidf_dim, 2)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, tfidf=None, labels=None):
        # Lấy outputs từ ViSoBERT
        outputs = self.visobert_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # Sử dụng hidden states từ layer cuối cùng
        hidden_states = outputs.hidden_states[-1]
        pooled_output = hidden_states[:, 0, :]  # Sử dụng hidden state của token [CLS]

        # Kết hợp TF-IDF với các đặc trưng từ ViSoBERT
        combined_features = torch.cat((pooled_output, tfidf), dim=1)
        logits = self.fc(combined_features)

        # Tính toán mất mát nếu nhãn được cung cấp
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

In [11]:
# Tạo mô hình
model = CombinedModel(model, tfidf_dim=train_tfidf_vectors.shape[1])

In [12]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [13]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

In [14]:
for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")



Step,Training Loss,Validation Loss
50,0.6718,0.653386
100,0.5003,0.488238
150,0.4587,0.420141
200,0.5291,0.519824
250,0.4223,0.37273
300,0.4775,0.441387
350,0.2916,0.611349
400,0.3172,0.424184


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.3727295398712158




Step,Training Loss,Validation Loss
50,0.2141,0.427025
100,0.1073,0.433619
150,0.1787,0.537502
200,0.4014,0.799667


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.4270254075527191




Step,Training Loss,Validation Loss
50,0.1865,0.453776
100,0.1077,0.419592
150,0.1799,0.512431
200,0.4335,0.666477
250,0.3361,0.574767


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.41959190368652344




Step,Training Loss,Validation Loss
50,0.1078,0.451053
100,0.0581,0.47836
150,0.2404,0.435218
200,0.3405,0.791213
250,0.4042,0.456273
300,0.2412,1.018623


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.43521830439567566




Step,Training Loss,Validation Loss
50,0.0603,0.652419
100,0.0039,0.554747
150,0.0805,0.663153
200,0.3019,0.588334
250,0.3864,0.635067


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.5547473430633545




Step,Training Loss,Validation Loss
50,0.0177,0.555254
100,0.0026,0.621025
150,0.0883,0.8053
200,0.3048,0.561321


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.5552540421485901




Step,Training Loss,Validation Loss
50,0.0038,0.589069
100,0.0023,0.638664
150,0.0832,0.734747
200,0.4463,0.762985


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.5890693068504333




Step,Training Loss,Validation Loss
50,0.0026,0.631744
100,0.0021,0.659322
150,0.0848,0.828981
200,0.3706,0.689593


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.6317442059516907




Step,Training Loss,Validation Loss
50,0.0022,0.667935
100,0.0019,0.695389
150,0.1597,0.660886
200,0.4631,0.998052
250,0.5648,0.606015
300,0.71,0.709329
350,0.6428,0.639654
400,0.2751,0.470719
450,0.3253,0.44842
500,0.3742,0.528965


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.4484202265739441




Step,Training Loss,Validation Loss
50,0.0536,0.475534
100,0.022,0.515697
150,0.0167,0.541789
200,0.0757,0.600845


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.47553446888923645




Step,Training Loss,Validation Loss
50,0.0313,0.501565
100,0.0153,0.553506
150,0.0064,0.540574
200,0.1237,0.646938


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5015654563903809




Step,Training Loss,Validation Loss
50,0.02,0.544215
100,0.0107,0.615195
150,0.0054,0.507439
200,0.0762,0.596697
250,0.2142,0.550503
300,0.0914,0.708434


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.5074388980865479




Step,Training Loss,Validation Loss
50,0.0043,0.630797
100,0.0034,0.571267
150,0.0025,0.701543
200,0.2997,0.620611
250,0.2216,0.596804


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.5712670683860779




Step,Training Loss,Validation Loss
50,0.003,0.741444
100,0.0025,0.662137
150,0.0035,0.765302
200,0.2766,0.659266
250,0.2371,0.59143
300,0.2006,0.915295
350,0.057,0.690484
400,0.0491,0.976851


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.5914303064346313




Step,Training Loss,Validation Loss
50,0.0025,0.626665
100,0.0022,0.734377
150,0.0012,0.743479
200,0.1608,0.704853


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.6266652345657349




Step,Training Loss,Validation Loss
50,0.0015,0.709431
100,0.0013,0.734173
150,0.001,0.854766
200,0.0837,0.851201


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.7094314694404602




Step,Training Loss,Validation Loss
50,0.0014,0.723558
100,0.0012,0.721784
150,0.0009,0.746887
200,0.0832,1.056437
250,0.0903,0.642135
300,0.137,0.685638
350,0.0017,0.657685
400,0.0567,0.63783
450,0.3224,0.414592
500,0.1486,0.826231


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.414591908454895




Step,Training Loss,Validation Loss
50,0.0426,0.472845
100,0.0045,0.594476
150,0.0015,0.602225
200,0.066,0.616905


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.472845196723938




Step,Training Loss,Validation Loss
50,0.0062,0.5201
100,0.0038,0.611182
150,0.0015,0.611919
200,0.0677,0.621999


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.5200998783111572




Step,Training Loss,Validation Loss
50,0.0038,0.577516
100,0.0024,0.635918
150,0.0011,0.652841
200,0.0737,0.652643


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5775158405303955




Step,Training Loss,Validation Loss
50,0.0024,0.643255
100,0.0014,0.704113
150,0.0007,0.719386
200,0.076,0.670326


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.6432545185089111




Step,Training Loss,Validation Loss
50,0.0015,0.718026
100,0.0009,0.754914
150,0.0006,0.756195
200,0.0843,0.727756


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.7180261611938477




Step,Training Loss,Validation Loss
50,0.0009,0.817131
100,0.0007,0.812349
150,0.0005,0.785622
200,0.081,0.713546
250,0.001,0.724855
300,0.0005,0.950827
350,0.0003,0.713467
400,0.0002,1.126456
450,0.0002,0.764922
500,0.0607,1.111217


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.713467001914978




Step,Training Loss,Validation Loss
50,0.0002,0.773098
100,0.0002,1.003129
150,0.0002,0.995062
200,0.0001,0.959963


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.7730975151062012




Step,Training Loss,Validation Loss
50,0.0002,0.815898
100,0.0002,0.999428
150,0.0002,0.983027
200,0.0001,0.942408


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.8158980011940002




Step,Training Loss,Validation Loss
50,0.0002,0.861315
100,0.0002,1.013134
150,0.0002,0.996518
200,0.0001,0.932855


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.8613148927688599




Step,Training Loss,Validation Loss
50,0.0002,0.901893
100,0.0002,0.993107
150,0.0002,0.995094
200,0.0001,0.926933


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.9018927812576294
Best params: {'learning_rate': 5e-05, 'weight_decay': 0.01, 'num_train_epochs': 3}, Best eval loss: 0.3727295398712158


In [15]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [28]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0002,0.63302
100,0.0751,0.621338
150,0.0001,0.814485
200,0.061,0.685687
250,0.0001,0.792916


TrainOutput(global_step=250, training_loss=0.024228889744728803, metrics={'train_runtime': 170.4052, 'train_samples_per_second': 35.228, 'train_steps_per_second': 4.419, 'total_flos': 0.0, 'train_loss': 0.024228889744728803, 'epoch': 0.9960159362549801})

In [29]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [31]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.916168
Precision: 0.927711
Recall: 0.905882
F1 Score: 0.916667
AUC: 0.972358


In [32]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=1)
recall2 = recall_score(val_labels, pred_labels2, pos_label=1)
f12 = f1_score(val_labels, pred_labels2, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.916168
Precision: 0.904762
Recall: 0.926829
F1 Score: 0.915663
AUC: 0.972358
