In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from transformers import EarlyStoppingCallback, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# Tạo tokenizer và model từ BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [5]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',  # Thêm padding để tất cả các văn bản có cùng chiều dài
        max_length=max_length
    )
    return encodings

In [6]:
train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

In [7]:
train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

In [8]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()  # Đảm bảo nhãn có kích thước đúng
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())
test_dataset = FakeNewsDataset(test_encodings, test_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [10]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [11]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

Step,Training Loss,Validation Loss
50,0.6726,0.659045
100,0.5819,0.618156
150,0.5923,0.552361
200,0.5826,0.590312
250,0.6038,0.513551
300,0.5133,0.578035
350,0.4515,0.60745
400,0.6702,0.585515


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.5135512948036194


Step,Training Loss,Validation Loss
50,0.376,0.508059
100,0.3462,0.485977
150,0.3236,0.516637
200,0.2818,0.632346
250,0.6687,0.776079


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.4859768748283386


Step,Training Loss,Validation Loss
50,0.213,0.556672
100,0.2252,0.585802
150,0.3522,0.462719
200,0.2824,0.781808
250,0.6534,0.868855
300,0.5672,0.587972


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.4627186954021454


Step,Training Loss,Validation Loss
50,0.1465,0.646841
100,0.1081,0.697635
150,0.3251,0.957538
200,0.2304,0.648717


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.6468408107757568


Step,Training Loss,Validation Loss
50,0.1157,0.649064
100,0.1133,0.705864
150,0.271,0.83256
200,0.2917,0.624303
250,0.8677,0.600478
300,0.8939,0.755103
350,0.542,0.681564
400,0.5051,0.749882


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.6004784107208252


Step,Training Loss,Validation Loss
50,0.1077,0.681293
100,0.0776,0.891501
150,0.2068,0.877106
200,0.5558,0.564247
250,0.7512,0.786917
300,0.4578,0.66801
350,0.3741,0.552907
400,0.5492,0.527023
450,0.431,0.588435
500,0.281,0.714627


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.5270230770111084


Step,Training Loss,Validation Loss
50,0.1833,0.550704
100,0.0682,0.86148
150,0.4582,0.858104
200,0.2039,0.778447


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.5507041215896606


Step,Training Loss,Validation Loss
50,0.1172,0.642745
100,0.0938,0.868753
150,0.4623,0.830526
200,0.1849,0.7294


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.6427454352378845


Step,Training Loss,Validation Loss
50,0.0887,0.748875
100,0.1096,0.926343
150,0.4469,0.839635
200,0.1933,0.785204


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.7488747835159302


Step,Training Loss,Validation Loss
50,0.0589,0.824064
100,0.1085,0.88371
150,0.34,0.884415
200,0.1865,0.723707
250,0.5009,0.970248
300,0.1648,0.862979
350,0.1835,1.185981


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.723707377910614


Step,Training Loss,Validation Loss
50,0.0037,0.819277
100,0.005,0.908059
150,0.1426,0.856179
200,0.1255,1.037401


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.8192765712738037


Step,Training Loss,Validation Loss
50,0.0026,0.857749
100,0.0296,0.933607
150,0.1459,0.880847
200,0.15,0.996749


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.8577486276626587


Step,Training Loss,Validation Loss
50,0.002,0.89001
100,0.0029,0.969556
150,0.152,1.00623
200,0.1291,0.858271
250,0.464,1.22643
300,0.1447,0.917796
350,0.3035,0.914521


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.858270525932312


Step,Training Loss,Validation Loss
50,0.0012,0.974762
100,0.0008,1.050906
150,0.094,0.904575
200,0.076,0.810703
250,0.4037,1.442516
300,0.1508,0.940424
350,0.2309,0.877586


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.8107031583786011


Step,Training Loss,Validation Loss
50,0.0006,1.028234
100,0.0004,1.070756
150,0.0003,1.125942
200,0.0011,1.150724


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 1.0282336473464966


Step,Training Loss,Validation Loss
50,0.0004,1.049838
100,0.0003,1.121837
150,0.0002,1.195082
200,0.0011,1.222352


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 1.049837589263916


Step,Training Loss,Validation Loss
50,0.0003,1.101625
100,0.0002,1.167853
150,0.0001,1.190184
200,0.0021,1.188635


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 1.101624846458435


Step,Training Loss,Validation Loss
50,0.0002,1.191463
100,0.0001,1.227481
150,0.0001,1.271571
200,0.0944,1.236365


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 1.1914634704589844


Step,Training Loss,Validation Loss
50,0.0001,1.244935
100,0.0001,1.24523
150,0.0001,1.280832
200,0.0046,1.26145


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 1.2449352741241455


Step,Training Loss,Validation Loss
50,0.0001,1.294123
100,0.0001,1.283894
150,0.0001,1.339707
200,0.0008,1.307234
250,0.2914,1.132033
300,0.1922,1.280323
350,0.0663,0.890478
400,0.2773,0.803573
450,0.1968,0.664183
500,0.3387,0.450236


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.4502364993095398


Step,Training Loss,Validation Loss
50,0.0143,0.524979
100,0.0025,0.67104
150,0.0837,0.66197
200,0.0004,0.932345


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.5249791145324707


Step,Training Loss,Validation Loss
50,0.0056,0.587002
100,0.0016,0.708556
150,0.0843,0.713932
200,0.0002,0.949598


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.5870019197463989


Step,Training Loss,Validation Loss
50,0.0021,0.634253
100,0.0011,0.75231
150,0.0625,0.772587
200,0.0001,1.019793


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.6342529654502869


Step,Training Loss,Validation Loss
50,0.0011,0.692243
100,0.0008,0.800749
150,0.013,0.859127
200,0.0001,1.175539


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.6922428011894226


Step,Training Loss,Validation Loss
50,0.0007,0.744075
100,0.0006,0.842237
150,0.0064,0.880577
200,0.0002,1.32972


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.744075357913971


Step,Training Loss,Validation Loss
50,0.0004,0.800181
100,0.0004,0.880056
150,0.0057,0.878753
200,0.0003,1.320222


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.8001809120178223


Step,Training Loss,Validation Loss
50,0.0002,0.865362
100,0.0003,0.918365
150,0.0019,0.906304
200,0.0003,1.315626


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.8653624653816223
Best params: {'learning_rate': 2e-05, 'weight_decay': 0.01, 'num_train_epochs': 4}, Best eval loss: 0.4502364993095398


In [26]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [40]:
# Huấn luyện mô hình có dừng sớm
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0081,0.730802
100,0.0004,0.789749
150,0.1552,0.889986
200,0.0173,0.885879


TrainOutput(global_step=200, training_loss=0.05964421203476377, metrics={'train_runtime': 148.1717, 'train_samples_per_second': 54.018, 'train_steps_per_second': 6.776, 'total_flos': 210488844288000.0, 'train_loss': 0.05964421203476377, 'epoch': 0.796812749003984})

In [41]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [42]:
print(pred_labels2[:])  # In ra dự đoán

[0 0 0 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 0 0 1
 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 1 1 1 0
 1 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0
 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 1
 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 0
 1 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 0 0 1 1
 1 0 0 0 1 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 0 1 1 1 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 0 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0
 1 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 1 1 0 0 1 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 0
 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 1 0 1 

In [43]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.898204
Precision: 0.895349
Recall: 0.905882
F1 Score: 0.900585
AUC: 0.940220


In [44]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=1)
recall2 = recall_score(val_labels, pred_labels2, pos_label=1)
f12 = f1_score(val_labels, pred_labels2, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.898204
Precision: 0.901235
Recall: 0.890244
F1 Score: 0.895706
AUC: 0.940220


In [45]:
# Dự đoán nhãn cho tập kiểm tra
predictions3 = trainer_with_early_stop.predict(test_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels3 = np.argmax(predictions3.predictions, axis=1)

print(pred_labels3[:])  # In ra dự đoán

[0 0 1 0 0 0 0 1 1 0 0]
