In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from google.colab import drive

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Tokenizer ViBERT
tokenizer = AutoTokenizer.from_pretrained("FPTAI/vibert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("FPTAI/vibert-base-cased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/581M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at FPTAI/vibert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Đọc dữ liệu
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [5]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',  # Thêm padding để tất cả các văn bản có cùng chiều dài
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

In [6]:
train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

In [7]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()  # Đảm bảo nhãn có kích thước đúng
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())
test_dataset = FakeNewsDataset(test_encodings, test_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [9]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [10]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

Step,Training Loss,Validation Loss
50,0.6622,0.670186
100,0.5632,0.532752
150,0.4588,0.444831
200,0.5401,0.546003
250,0.4725,0.414928
300,0.4533,0.440029
350,0.231,0.528007
400,0.8896,0.730787


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.4149281680583954


Step,Training Loss,Validation Loss
50,0.276,0.431516
100,0.1741,0.384109
150,0.2046,0.437764
200,0.6513,0.418191
250,0.377,0.589259


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.3841085135936737


Step,Training Loss,Validation Loss
50,0.0971,0.465731
100,0.0708,0.496844
150,0.2111,0.499245
200,0.5637,0.48967


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.4657314717769623


Step,Training Loss,Validation Loss
50,0.0776,0.525786
100,0.0734,0.513085
150,0.2562,0.564369
200,0.6161,0.516924
250,0.3473,0.524923


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.5130853652954102


Step,Training Loss,Validation Loss
50,0.042,0.59347
100,0.1155,0.66746
150,0.1936,0.637224
200,0.4765,0.809761


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.5934695601463318


Step,Training Loss,Validation Loss
50,0.0173,0.580652
100,0.121,0.680382
150,0.1734,0.574766
200,0.4485,1.121668
250,0.4005,0.669783
300,0.3662,0.646288


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.5747660398483276


Step,Training Loss,Validation Loss
50,0.0018,0.678674
100,0.0007,0.707871
150,0.5373,0.552483
200,0.693,0.516817
250,0.4227,0.449381
300,0.3066,0.623348
350,0.8113,0.516583
400,0.1452,0.501675


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.4493807554244995


Step,Training Loss,Validation Loss
50,0.0782,0.549071
100,0.0007,0.649342
150,0.0002,1.04917
200,0.3268,0.738024


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.5490707755088806


Step,Training Loss,Validation Loss
50,0.0092,0.586435
100,0.0004,0.731507
150,0.0999,1.41825
200,0.3553,0.585015
250,0.1706,0.750442
300,0.3315,0.440294
350,0.3249,0.634691
400,0.0471,0.576292
450,0.3281,0.633102


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.44029396772384644


Step,Training Loss,Validation Loss
50,0.0389,0.602677
100,0.0007,0.629358
150,0.0034,0.751193
200,0.0974,0.946666


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.6026774644851685


Step,Training Loss,Validation Loss
50,0.023,0.609059
100,0.0005,0.68106
150,0.0038,0.803066
200,0.0943,1.164178


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.6090587973594666


Step,Training Loss,Validation Loss
50,0.0003,0.649145
100,0.0029,0.732449
150,0.0005,0.898238
200,0.0854,1.108147


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.6491453051567078


Step,Training Loss,Validation Loss
50,0.0001,0.709404
100,0.0713,0.766494
150,0.0002,0.971099
200,0.0998,1.147385


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.7094041705131531


Step,Training Loss,Validation Loss
50,0.0001,0.79246
100,0.0001,0.803854
150,0.0178,0.85242
200,0.0588,1.507277


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.792460024356842


Step,Training Loss,Validation Loss
50,0.0001,0.881907
100,0.0001,0.840549
150,0.0036,0.887236
200,0.0896,1.68223
250,0.1026,0.600414
300,0.0201,0.946702
350,0.0876,0.527797
400,0.1551,0.598149
450,0.1316,0.562104
500,0.0758,0.719941


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.527796745300293


Step,Training Loss,Validation Loss
50,0.0784,0.628206
100,0.0003,0.727653
150,0.0001,0.870522
200,0.0001,0.824193


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.6282062530517578


Step,Training Loss,Validation Loss
50,0.0498,0.718078
100,0.0001,0.87345
150,0.0373,1.077389
200,0.0,1.009302


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.718077540397644


Step,Training Loss,Validation Loss
50,0.0001,0.809935
100,0.0001,0.948391
150,0.1352,1.318348
200,0.0001,1.300233


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.8099347949028015


Step,Training Loss,Validation Loss
50,0.0,0.869154
100,0.0001,1.049122
150,0.0,1.176794
200,0.0,1.264913


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.8691535592079163


Step,Training Loss,Validation Loss
50,0.0,0.910961
100,0.0001,1.10581
150,0.0,0.958421
200,0.0,1.030471


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.9109606742858887


Step,Training Loss,Validation Loss
50,0.0,0.937414
100,0.0001,1.164711
150,0.0,0.969307
200,0.0,1.025071


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.9374143481254578


Step,Training Loss,Validation Loss
50,0.0,0.959039
100,0.0,1.204997
150,0.0,0.953232
200,0.0,1.353744
250,0.1288,0.924849
300,0.0,0.91519
350,0.0,1.532037
400,0.228,0.429216
450,0.1676,0.354181
500,0.1921,0.53062


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.35418128967285156


Step,Training Loss,Validation Loss
50,0.0611,0.38752
100,0.0049,0.598861
150,0.0009,0.740633
200,0.0001,0.86166


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.3875202238559723


Step,Training Loss,Validation Loss
50,0.0045,0.45759
100,0.0052,0.626875
150,0.0003,0.786933
200,0.0001,0.915976


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.4575902819633484


Step,Training Loss,Validation Loss
50,0.0034,0.439214
100,0.0017,0.681575
150,0.0001,0.872032
200,0.0001,0.989631


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.4392140507698059


Step,Training Loss,Validation Loss
50,0.0024,0.438957
100,0.001,0.699577
150,0.0001,0.905255
200,0.0,1.012732


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.4389566481113434


Step,Training Loss,Validation Loss
50,0.0015,0.451769
100,0.0005,0.723591
150,0.0001,0.940537
200,0.0,1.006305


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.45176851749420166
Best params: {'learning_rate': 2e-05, 'weight_decay': 0.015, 'num_train_epochs': 3}, Best eval loss: 0.35418128967285156


In [11]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [12]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0007,0.467163
100,0.0002,0.733267
150,0.0001,0.991155
200,0.0,1.004897
250,0.0001,0.917759


TrainOutput(global_step=250, training_loss=0.007391573046450503, metrics={'train_runtime': 204.5663, 'train_samples_per_second': 29.345, 'train_steps_per_second': 3.681, 'total_flos': 263111055360000.0, 'train_loss': 0.007391573046450503, 'epoch': 0.9960159362549801})

In [13]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [16]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.930140
Precision: 0.923077
Recall: 0.941176
F1 Score: 0.932039
AUC: 0.970126


In [17]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=1)
recall2 = recall_score(val_labels, pred_labels2, pos_label=1)
f12 = f1_score(val_labels, pred_labels2, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.930140
Precision: 0.937759
Recall: 0.918699
F1 Score: 0.928131
AUC: 0.970126
