In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tạo tokenizer và model từ PhoBERT
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [5]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',  # Thêm padding để tất cả các văn bản có cùng chiều dài
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

In [6]:
train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

In [7]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()  # Đảm bảo nhãn có kích thước đúng
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())
test_dataset = FakeNewsDataset(test_encodings, test_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [8]:
# Tải mô hình phân loại
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [10]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

Step,Training Loss,Validation Loss
50,0.6595,0.662263
100,0.5573,0.532667
150,0.366,0.396071
200,0.4425,0.416503
250,0.5611,0.420262
300,0.4645,0.477894


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.3960709273815155


Step,Training Loss,Validation Loss
50,0.2566,0.370737
100,0.2405,0.381049
150,0.3231,0.538474
200,0.3631,0.38999


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.370737224817276


Step,Training Loss,Validation Loss
50,0.22,0.370464
100,0.236,0.39499
150,0.2956,0.703691
200,0.428,0.472206


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.3704642951488495


Step,Training Loss,Validation Loss
50,0.2126,0.395335
100,0.251,0.409363
150,0.2885,0.630064
200,0.4056,0.457664


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.39533519744873047


Step,Training Loss,Validation Loss
50,0.1995,0.443059
100,0.259,0.465636
150,0.293,0.651208
200,0.3167,0.359949
250,0.4508,0.346177
300,0.2444,0.548681
350,0.231,0.427684
400,0.4694,1.213251


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.3461766242980957


Step,Training Loss,Validation Loss
50,0.116,0.398404
100,0.1761,0.431406
150,0.135,0.521758
200,0.1199,0.665392


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.39840367436408997


Step,Training Loss,Validation Loss
50,0.1027,0.415938
100,0.1849,0.446268
150,0.1413,0.525593
200,0.1159,0.472647


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.4159376323223114


Step,Training Loss,Validation Loss
50,0.0829,0.43336
100,0.1602,0.435191
150,0.1419,0.52098
200,0.1497,0.530046


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.43336012959480286


Step,Training Loss,Validation Loss
50,0.0787,0.454131
100,0.1281,0.424692
150,0.1419,0.529734
200,0.1268,0.484061
250,0.2861,0.614706


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.4246922433376312


Step,Training Loss,Validation Loss
50,0.0091,0.450449
100,0.0764,0.41575
150,0.1402,0.452303
200,0.1508,0.711835
250,0.1733,0.425918


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.4157501757144928


Step,Training Loss,Validation Loss
50,0.0052,0.524031
100,0.0873,0.668035
150,0.158,0.72253
200,0.1246,0.813959


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5240312218666077


Step,Training Loss,Validation Loss
50,0.0016,0.51719
100,0.089,0.655911
150,0.158,0.614389
200,0.1164,0.685379


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.5171902179718018


Step,Training Loss,Validation Loss
50,0.0009,0.551089
100,0.0892,0.657589
150,0.1852,0.528043
200,0.1789,0.80981
250,0.1194,0.423463
300,0.251,0.481833
350,0.1949,0.45348
400,0.3889,0.679125


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.42346256971359253


Step,Training Loss,Validation Loss
50,0.0699,0.481808
100,0.0192,0.493217
150,0.1486,0.531205
200,0.1693,0.469672
250,0.1127,0.492652
300,0.0985,0.39839
350,0.1526,0.624398
400,0.2208,0.541259
450,0.2029,0.481675


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.39839044213294983


Step,Training Loss,Validation Loss
50,0.0195,0.440951
100,0.001,0.555404
150,0.0002,0.632095
200,0.0001,0.654583


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.4409513771533966


Step,Training Loss,Validation Loss
50,0.0005,0.493925
100,0.0002,0.595898
150,0.0991,0.660952
200,0.1078,0.62263


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.4939253628253937


Step,Training Loss,Validation Loss
50,0.0002,0.576895
100,0.0001,0.683883
150,0.1072,0.645987
200,0.0905,0.728337


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.5768948197364807


Step,Training Loss,Validation Loss
50,0.0001,0.65328
100,0.0001,0.774947
150,0.1053,0.771744
200,0.1266,0.544041
250,0.3397,0.750134
300,0.1002,0.633507
350,0.1737,0.547914


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.5440405607223511


Step,Training Loss,Validation Loss
50,0.0001,0.52339
100,0.0001,0.525937
150,0.0001,0.772826
200,0.0,0.610583


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.5233896374702454


Step,Training Loss,Validation Loss
50,0.0001,0.549386
100,0.0001,0.611759
150,0.0,0.642594
200,0.088,0.740983


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5493861436843872


Step,Training Loss,Validation Loss
50,0.0,0.613014
100,0.0,0.700025
150,0.0,0.812619
200,0.1264,0.963941


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.6130142211914062


Step,Training Loss,Validation Loss
50,0.0,0.678487
100,0.0,0.76863
150,0.0078,0.927469
200,0.0,0.724592


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.6784874200820923


Step,Training Loss,Validation Loss
50,0.0,0.718358
100,0.0,0.791558
150,0.0,0.863059
200,0.2821,1.7383


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.7183580994606018


Step,Training Loss,Validation Loss
50,0.0,0.750212
100,0.0,0.901245
150,0.0221,1.167588
200,0.0,0.761813


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.7502120137214661


Step,Training Loss,Validation Loss
50,0.0,0.774768
100,0.0,0.923728
150,0.0001,1.106045
200,0.0,0.770419
250,0.0,0.889498
300,0.0475,0.677496
350,0.2015,0.669621
400,0.3198,0.495189
450,0.1737,0.526448
500,0.1623,0.360566


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.3605659306049347


Step,Training Loss,Validation Loss
50,0.0012,0.385559
100,0.0004,0.452063
150,0.0016,0.57238
200,0.0001,0.524311


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.38555923104286194


Step,Training Loss,Validation Loss
50,0.0006,0.419905
100,0.0002,0.485555
150,0.0019,0.650709
200,0.0001,0.664674


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.41990482807159424
Best params: {'learning_rate': 5e-05, 'weight_decay': 0.015, 'num_train_epochs': 4}, Best eval loss: 0.3461766242980957


In [38]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [46]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0003,0.527116
100,0.0002,0.46229
150,0.0427,0.590211
200,0.114,0.67616
250,0.0003,0.681465
300,0.082,1.274371


TrainOutput(global_step=300, training_loss=0.030983072206145153, metrics={'train_runtime': 234.2479, 'train_samples_per_second': 34.169, 'train_steps_per_second': 4.286, 'total_flos': 314812377738240.0, 'train_loss': 0.030983072206145153, 'epoch': 1.1952191235059761})

In [47]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [50]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.944112
Precision: 0.952191
Recall: 0.937255
F1 Score: 0.944664
AUC: 0.980376


In [51]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=1)
recall2 = recall_score(val_labels, pred_labels2, pos_label=1)
f12 = f1_score(val_labels, pred_labels2, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.944112
Precision: 0.936000
Recall: 0.951220
F1 Score: 0.943548
AUC: 0.980376
