In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd

from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader

from google.colab import files

from imblearn.over_sampling import RandomOverSampler

from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from transformers import (AutoTokenizer, AutoModel,
                          AutoModelForSequenceClassification, Trainer,
                          TrainingArguments, EarlyStoppingCallback, AdamW)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [4]:
# Tạo TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=4000)
tfidf_vectorizer.fit(train_texts)  # Fit trên tập huấn luyện

# Chuyển đổi dữ liệu thành TF-IDF vectors
train_tfidf_vectors = tfidf_vectorizer.transform(train_texts).toarray()
val_tfidf_vectors = tfidf_vectorizer.transform(val_texts).toarray()
test_tfidf_vectors = tfidf_vectorizer.transform(test_texts).toarray()

In [5]:
# Tạo tokenizer và model từ PhoBERT
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModel.from_pretrained("vinai/phobert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [6]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

In [7]:
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, tfidf_vectors):
        self.encodings = encodings
        self.labels = labels
        self.tfidf_vectors = tfidf_vectors

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        tfidf_vector = torch.tensor(self.tfidf_vectors[idx], dtype=torch.float32)
        return {'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': item['labels'], 'tfidf': tfidf_vector}

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = CombinedDataset(train_encodings, train_labels.tolist(), train_tfidf_vectors)
val_dataset = CombinedDataset(val_encodings, val_labels.tolist(), val_tfidf_vectors)
test_dataset = CombinedDataset(test_encodings, test_labels.tolist(), test_tfidf_vectors)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [9]:
class CombinedModel(torch.nn.Module):
    def __init__(self, phobert_model, tfidf_dim):
        super(CombinedModel, self).__init__()
        self.phobert_model = phobert_model
        self.tfidf_dim = tfidf_dim
        self.fc = torch.nn.Linear(self.phobert_model.config.hidden_size + self.tfidf_dim, 2)
        self.loss_fn = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, tfidf=None, labels=None):
        # Lấy outputs từ PhoBERT
        outputs = self.phobert_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        pooled_output = hidden_states[:, 0, :]  # Sử dụng hidden state của token [CLS]

        # Kết hợp TF-IDF với các đặc trưng từ PhoBERT
        combined_features = torch.cat((pooled_output, tfidf), dim=1)
        logits = self.fc(combined_features)

        # Tính toán mất mát nếu nhãn được cung cấp
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

In [10]:
# Tạo mô hình
model = CombinedModel(phobert_model, tfidf_dim=train_tfidf_vectors.shape[1])

In [11]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [12]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

In [13]:
for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

Step,Training Loss,Validation Loss
50,0.6621,0.656606
100,0.5785,0.558509
150,0.4123,0.466176
200,0.5097,0.499199
250,0.3711,0.34451
300,0.3597,0.494841
350,0.1988,0.445512
400,0.3717,0.311431
450,0.2451,0.320305
500,0.3516,0.325018


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.31143131852149963


Step,Training Loss,Validation Loss
50,0.2403,0.322921
100,0.2176,0.349706
150,0.1863,0.368319
200,0.1438,0.364252


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.32292139530181885


Step,Training Loss,Validation Loss
50,0.2327,0.344495
100,0.2132,0.344123
150,0.1989,0.373789
200,0.1103,0.356163
250,0.1658,0.393378


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.3441234529018402


Step,Training Loss,Validation Loss
50,0.1641,0.366378
100,0.1497,0.386673
150,0.2296,0.385072
200,0.0401,0.341887
250,0.1086,0.38499
300,0.1681,0.377194
350,0.1806,0.452755


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.34188684821128845


Step,Training Loss,Validation Loss
50,0.1172,0.32449
100,0.0659,0.339104
150,0.0676,0.342395
200,0.0706,0.361985


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.32449033856391907


Step,Training Loss,Validation Loss
50,0.1173,0.328412
100,0.066,0.333355
150,0.0694,0.345748
200,0.0632,0.347462


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.32841214537620544


Step,Training Loss,Validation Loss
50,0.1166,0.324425
100,0.0653,0.342269
150,0.0888,0.355544
200,0.0196,0.3763


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.3244253695011139


Step,Training Loss,Validation Loss
50,0.1162,0.358974
100,0.0646,0.350226
150,0.0875,0.353228
200,0.058,0.388622
250,0.2493,0.554716


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.3502263128757477


Step,Training Loss,Validation Loss
50,0.0937,0.345128
100,0.0607,0.345708
150,0.1301,0.419143
200,0.0121,0.412619


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.3451283574104309


Step,Training Loss,Validation Loss
50,0.0661,0.353186
100,0.0592,0.344453
150,0.1233,0.374698
200,0.118,0.411591
250,0.0297,0.337705
300,0.1231,0.470477
350,0.0731,0.49182
400,0.1508,0.345908


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.33770522475242615


Step,Training Loss,Validation Loss
50,0.0454,0.354379
100,0.0088,0.349896
150,0.0685,0.425847
200,0.1599,0.498383
250,0.0856,0.429453


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.3498956561088562


Step,Training Loss,Validation Loss
50,0.0083,0.343638
100,0.0077,0.352845
150,0.0693,0.525033
200,0.0633,0.392832


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.34363752603530884


Step,Training Loss,Validation Loss
50,0.0079,0.350704
100,0.0074,0.361304
150,0.0696,0.447098
200,0.2648,0.380417


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.35070350766181946


Step,Training Loss,Validation Loss
50,0.0076,0.356458
100,0.0071,0.362831
150,0.0699,0.601781
200,0.2738,0.477042


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.3564583361148834


Step,Training Loss,Validation Loss
50,0.0073,0.366437
100,0.0069,0.365618
150,0.0701,0.505905
200,0.0591,0.415488
250,0.0861,0.41042


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.36561810970306396


Step,Training Loss,Validation Loss
50,0.0065,0.376531
100,0.0062,0.376682
150,0.0706,0.711501
200,0.1994,0.448204


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.37653109431266785


Step,Training Loss,Validation Loss
50,0.0063,0.377536
100,0.0059,0.42511
150,0.1979,0.366782
200,0.1978,0.431373
250,0.0087,0.438672
300,0.1392,0.557838


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.36678194999694824


Step,Training Loss,Validation Loss
50,0.0053,0.391538
100,0.005,0.428506
150,0.0717,0.37784
200,0.0191,0.447041
250,0.1227,0.454214
300,0.196,0.540011


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.37783971428871155


Step,Training Loss,Validation Loss
50,0.0044,0.42616
100,0.0042,0.410943
150,0.0039,0.42214
200,0.1329,0.493493
250,0.0715,0.403987
300,0.129,0.426623
350,0.0563,0.373054
400,0.0679,0.360557
450,0.0779,0.394877
500,0.1902,0.389988


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.3605573773384094


Step,Training Loss,Validation Loss
50,0.0032,0.397839
100,0.0031,0.361308
150,0.0029,0.363161
200,0.0026,0.407673
250,0.0765,0.408294


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.3613077700138092


Step,Training Loss,Validation Loss
50,0.003,0.374335
100,0.0028,0.451194
150,0.0025,0.409507
200,0.0022,0.420851


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.37433528900146484


Step,Training Loss,Validation Loss
50,0.0029,0.391865
100,0.0027,0.455251
150,0.0025,0.411239
200,0.0021,0.43043


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.3918647766113281


Step,Training Loss,Validation Loss
50,0.0028,0.399541
100,0.0027,0.460318
150,0.0024,0.425379
200,0.0021,0.435754


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.39954084157943726


Step,Training Loss,Validation Loss
50,0.0027,0.407851
100,0.0026,0.476749
150,0.0023,0.441256
200,0.002,0.442168


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.40785080194473267


Step,Training Loss,Validation Loss
50,0.0027,0.415864
100,0.0025,0.478724
150,0.0023,0.44816
200,0.002,0.447368


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.4158640205860138


Step,Training Loss,Validation Loss
50,0.0026,0.42992
100,0.0025,0.482505
150,0.0022,0.456573
200,0.0019,0.444228


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.4299204349517822


Step,Training Loss,Validation Loss
50,0.0025,0.439495
100,0.0024,0.491282
150,0.0022,0.460125
200,0.0019,0.445452


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.43949466943740845
Best params: {'learning_rate': 5e-05, 'weight_decay': 0.01, 'num_train_epochs': 3}, Best eval loss: 0.31143131852149963


In [52]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [78]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0013,0.405076
100,0.0012,0.451599
150,0.0658,0.380872
200,0.1462,0.390066
250,0.0857,0.419602
300,0.2511,0.588869
350,0.0612,0.563959


TrainOutput(global_step=350, training_loss=0.05151999912623848, metrics={'train_runtime': 271.2146, 'train_samples_per_second': 22.134, 'train_steps_per_second': 2.776, 'total_flos': 0.0, 'train_loss': 0.05151999912623848, 'epoch': 1.3944223107569722})

In [79]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [80]:
print(pred_labels2[:])  # In ra dự đoán

[0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 1 0 0 1
 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0
 1 1 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 0 1 0 1 1 0 1 1 0 1 0 1 1 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1
 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 0
 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 0 0 0 1
 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 1 0 1 1 0 0 1 0 1 0 0 1 1 1 1 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 1
 1 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0
 1 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1
 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 1 1 0 1 

In [81]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.942116
Precision: 0.937984
Recall: 0.949020
F1 Score: 0.943470
AUC: 0.975387


In [82]:
# Dự đoán nhãn cho tập kiểm tra
predictions3 = trainer_with_early_stop.predict(test_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels3 = np.argmax(predictions3.predictions, axis=1)

In [83]:
print(pred_labels3[:])  # In ra dự đoán

[0 0 1 0 0 0 0 1 1 0 0]
