In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']

In [5]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',  # Thêm padding để tất cả các văn bản có cùng chiều dài
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)

In [6]:
# Kiểm tra dữ liệu sau khi token hóa
print(train_encodings.keys())
print(val_encodings.keys())
print(train_encodings['input_ids'][:2])  # Hiển thị một vài mẫu tokenized
print(val_encodings['input_ids'][:2])

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[[0, 1656, 8, 1347, 8915, 336, 5963, 2546, 620, 396, 30, 1302, 9412, 56669, 11, 197, 133, 151, 3634, 848, 99, 396, 123, 292, 336, 20014, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2925, 474, 2515, 23523, 34, 275, 262, 829, 133

In [7]:
train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)

In [8]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()  # Đảm bảo nhãn có kích thước đúng
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [9]:
# Tải mô hình phân loại
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [14]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

Step,Training Loss,Validation Loss
50,0.682,0.676364
100,0.6091,0.572168
150,0.4635,0.396522
200,0.4293,0.330356
250,0.3451,0.386171
300,0.159,0.37659
350,0.1521,0.501541


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.33035632967948914


Step,Training Loss,Validation Loss
50,0.2975,0.327499
100,0.2074,0.387773
150,0.1127,0.400248
200,0.3507,0.518462


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.3274986743927002


Step,Training Loss,Validation Loss
50,0.2284,0.336311
100,0.2062,0.395445
150,0.1012,0.414484
200,0.2298,0.553081


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.3363111913204193


Step,Training Loss,Validation Loss
50,0.1605,0.363198
100,0.2153,0.423842
150,0.088,0.426553
200,0.2029,0.57343


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.363197922706604


Step,Training Loss,Validation Loss
50,0.0974,0.405003
100,0.2589,0.467051
150,0.0762,0.448319
200,0.2766,0.572566


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.4050034284591675


Step,Training Loss,Validation Loss
50,0.0444,0.462218
100,0.3042,0.492218
150,0.0683,0.467068
200,0.2755,0.592222


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.46221789717674255


Step,Training Loss,Validation Loss
50,0.0307,0.553105
100,0.3227,0.517127
150,0.0671,0.490742
200,0.3884,0.604523
250,0.2371,0.467131
300,0.1511,0.466805
350,0.2886,0.575551
400,0.1756,0.495245


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.466805100440979


Step,Training Loss,Validation Loss
50,0.0203,0.525412
100,0.0518,0.564636
150,0.0018,0.632069
200,0.032,0.728772


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.5254124999046326


Step,Training Loss,Validation Loss
50,0.0023,0.618174
100,0.0629,0.530522
150,0.001,0.558852
200,0.1771,0.691136
250,0.1111,0.709436


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.5305215716362


Step,Training Loss,Validation Loss
50,0.0011,0.587815
100,0.0039,0.537732
150,0.0004,0.60335
200,0.0139,0.682409
250,0.1512,0.496713
300,0.0014,0.628215
350,0.1045,0.854925
400,0.351,0.530547


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.49671339988708496


Step,Training Loss,Validation Loss
50,0.0014,0.520151
100,0.0004,0.617074
150,0.0661,0.655997
200,0.0002,0.574458


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5201513171195984


Step,Training Loss,Validation Loss
50,0.0005,0.568611
100,0.0003,0.767072
150,0.144,0.889948
200,0.0001,0.692796


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.5686113238334656


Step,Training Loss,Validation Loss
50,0.0003,0.633665
100,0.0003,0.800637
150,0.146,0.794892
200,0.0029,1.009892


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.6336652040481567


Step,Training Loss,Validation Loss
50,0.0002,0.698338
100,0.0002,0.841011
150,0.0003,0.743117
200,0.007,1.04769


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.6983376741409302


Step,Training Loss,Validation Loss
50,0.0001,0.786601
100,0.0064,0.778629
150,0.0584,0.822997
200,0.0734,0.914257
250,0.0377,0.842427


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.7786286473274231


Step,Training Loss,Validation Loss
50,0.0001,0.817411
100,0.0,0.939038
150,0.1273,1.094294
200,0.1179,1.070049


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.8174108862876892


Step,Training Loss,Validation Loss
50,0.0001,0.873029
100,0.0,0.976205
150,0.1265,0.962024
200,0.1218,0.846905
250,0.0079,0.68028
300,0.1423,0.696539
350,0.0775,0.653824
400,0.2272,0.627854
450,0.1078,0.523424
500,0.1009,0.694368


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.5234236717224121


Step,Training Loss,Validation Loss
50,0.0023,0.538342
100,0.0004,0.665642
150,0.0002,0.695868
200,0.0001,0.763566


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.5383416414260864


Step,Training Loss,Validation Loss
50,0.0007,0.592702
100,0.0003,0.758449
150,0.1065,0.952447
200,0.0002,0.934325


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.592702329158783


Step,Training Loss,Validation Loss
50,0.0004,0.650104
100,0.0002,0.989074
150,0.0001,0.754883
200,0.0001,0.805984


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.6501036286354065


Step,Training Loss,Validation Loss
50,0.0002,0.693994
100,0.0002,1.165316
150,0.0001,0.99899
200,0.0,1.052209


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.6939943432807922


Step,Training Loss,Validation Loss
50,0.0002,0.730542
100,0.0002,1.214168
150,0.0001,0.80419
200,0.0,0.865506


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.7305424809455872


Step,Training Loss,Validation Loss
50,0.0001,0.766631
100,0.0001,1.250507
150,0.0,1.073606
200,0.0,1.155268


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.7666313052177429


Step,Training Loss,Validation Loss
50,0.0001,0.800581
100,0.0001,1.293469
150,0.0,0.852853
200,0.0,0.887627


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.800580620765686


Step,Training Loss,Validation Loss
50,0.0001,0.828657


Step,Training Loss,Validation Loss
50,0.0001,0.828657
100,0.0001,1.267234
150,0.0,0.891334
200,0.0,0.930376


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.8286572694778442


Step,Training Loss,Validation Loss
50,0.0001,0.864683
100,0.0001,1.34084
150,0.0,1.109195
200,0.0,1.245242


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.8646829128265381


Step,Training Loss,Validation Loss
50,0.0,0.882138
100,0.0001,1.320236
150,0.0,0.917508
200,0.0,0.953401


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.8821384906768799
Best params: {'learning_rate': 5e-05, 'weight_decay': 0.01, 'num_train_epochs': 4}, Best eval loss: 0.3274986743927002


In [21]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [22]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0687,0.550317
100,0.0531,0.555108
150,0.0662,0.526029
200,0.0041,0.470895
250,0.0061,0.56765
300,0.2851,0.493436
350,0.0795,0.748695
400,0.0012,0.793703


TrainOutput(global_step=400, training_loss=0.07501803091261536, metrics={'train_runtime': 286.6165, 'train_samples_per_second': 15.686, 'train_steps_per_second': 1.968, 'total_flos': 419925244354560.0, 'train_loss': 0.07501803091261536, 'epoch': 2.8368794326241136})

In [23]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [24]:
print(pred_labels2[:])  # In ra dự đoán

[0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 1
 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1 0 1 1
 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1 1 1 0 1 1 0 0 0 1 1 0
 0 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 1 0 0 0 1
 0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 1 0
 0 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0
 1 0 1 0 1 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0 1]


In [25]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.914894
Precision: 0.928571
Recall: 0.902778
F1 Score: 0.915493
AUC: 0.966838


In [26]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=1)
recall2 = recall_score(val_labels, pred_labels2, pos_label=1)
f12 = f1_score(val_labels, pred_labels2, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.914894
Precision: 0.901408
Recall: 0.927536
F1 Score: 0.914286
AUC: 0.966838
