In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from google.colab import drive

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Tokenizer ViBERT
tokenizer = AutoTokenizer.from_pretrained("FPTAI/vibert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("FPTAI/vibert-base-cased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/581M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at FPTAI/vibert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Đọc dữ liệu
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']


In [5]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',  # Thêm padding để tất cả các văn bản có cùng chiều dài
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)

In [6]:
# Kiểm tra dữ liệu sau khi token hóa
print(train_encodings.keys())
print(val_encodings.keys())
print(train_encodings['input_ids'][:2])  # Hiển thị một vài mẫu tokenized
print(val_encodings['input_ids'][:2])

train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[[2, 606, 523, 13, 20, 37, 232, 338, 913, 1683, 338, 598, 1303, 889, 830, 1620, 23, 29, 1481, 39, 131, 33, 131, 780, 39, 544, 1208, 1018, 83, 1303, 2094, 788, 37, 232, 338, 1870, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 4056, 1767, 146, 15, 1670, 80, 937

In [7]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()  # Đảm bảo nhãn có kích thước đúng
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [9]:
# Thiết lập các tham số huấn luyện (có dừng sớm)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
)



In [10]:
# Khởi tạo Trainer (có dừng sớm)
trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.655,0.662713
100,0.6224,0.580699
150,0.4821,0.496088
200,0.3857,0.548091
250,0.3953,0.426365
300,0.2495,0.582748
350,0.3771,0.634436
400,0.4893,0.553656


TrainOutput(global_step=400, training_loss=0.47757221937179567, metrics={'train_runtime': 287.296, 'train_samples_per_second': 19.562, 'train_steps_per_second': 2.454, 'total_flos': 419925244354560.0, 'train_loss': 0.47757221937179567, 'epoch': 2.8368794326241136})

In [14]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [12]:
# In ra một số dự đoán để kiểm tra
print(pred_labels2[:10])  # In ra 10 dự đoán đầu tiên
print(pred_labels2[:])  # In ra dự đoán

[0 0 0 1 0 0 1 0 1 0]
[0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 1 1
 0 1 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1
 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 0 1 1 0
 0 0 0 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1
 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0
 0 0 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 1
 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 1 1]


In [13]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.858156
Precision: 0.871429
Recall: 0.847222
F1 Score: 0.859155
AUC: 0.916365
