In [2]:
import io
import re
import torch
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from google.colab import drive
from huggingface_hub import login

In [3]:
# Tokenizer ViSOBERT
model= AutoModel.from_pretrained('uitnlp/visobert')
tokenizer = AutoTokenizer.from_pretrained('uitnlp/visobert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/390M [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]



In [4]:
# Kết hợp Word2Vec
model_w2v = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/nlpl_74", filename="model.bin"), binary=True, unicode_errors="ignore")

# Mount Google Drive
drive.mount('/content/drive')

model.bin:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

Mounted at /content/drive


In [5]:
# Tải dữ liệu
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [6]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

In [7]:
# Tạo Dataset
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())
test_dataset = FakeNewsDataset(test_encodings, test_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [8]:
# Tải mô hình phân loại
model_classification = AutoModelForSequenceClassification.from_pretrained("uitnlp/visobert", num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Hàm huấn luyện cho mô hình kết hợp
def train_combined_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results_combined',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs_combined',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    # Khởi tạo Trainer cho mô hình kết hợp
    trainer = Trainer(
        model=model_classification,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [10]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

# Thực hiện Grid Search
for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_combined_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
50,0.6481,0.60887
100,0.53,0.429731
150,0.4268,0.413455
200,0.6604,0.472929
250,0.3019,0.358174
300,0.4412,0.458207
350,0.1673,0.642026
400,0.3726,0.351352
450,0.4377,0.410457
500,0.2034,0.258608


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.2586076557636261


Step,Training Loss,Validation Loss
50,0.0607,0.316243
100,0.0249,0.370987
150,0.1416,0.439477
200,0.022,0.437767


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.31624314188957214


Step,Training Loss,Validation Loss
50,0.0412,0.384977
100,0.0442,0.397081
150,0.1811,0.43634
200,0.0023,0.405781


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.3849770426750183


Step,Training Loss,Validation Loss
50,0.0019,0.465949
100,0.0387,0.400332
150,0.164,0.411157
200,0.0028,0.356859
250,0.0007,0.466156
300,0.0003,0.51019
350,0.0002,0.515561


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.35685908794403076


Step,Training Loss,Validation Loss
50,0.0002,0.431941
100,0.0012,0.59905
150,0.0,0.593067
200,0.0007,0.611672


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.43194079399108887


Step,Training Loss,Validation Loss
50,0.0001,0.480499
100,0.0005,0.5804
150,0.0,0.661827
200,0.0023,0.694441


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.4804988503456116


Step,Training Loss,Validation Loss
50,0.0,0.579729
100,0.0,0.777418
150,0.0,0.879403
200,0.0,0.988063


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.5797288417816162


Step,Training Loss,Validation Loss
50,0.0,0.714689
100,0.0,0.872623
150,0.0,0.756324
200,0.137,0.694936
250,0.1272,0.649842
300,0.0729,0.652788
350,0.0413,0.524563
400,0.0001,0.74896
450,0.1198,0.483136
500,0.4069,0.487511


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.3477761745452881


Step,Training Loss,Validation Loss
50,0.0005,0.411465
100,0.0002,0.512397
150,0.0001,0.637277
200,0.0,0.604769


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.4114646017551422


Step,Training Loss,Validation Loss
50,0.0003,0.460643
100,0.0001,0.504526
150,0.0001,0.59351
200,0.0,0.672664


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.4606426954269409


Step,Training Loss,Validation Loss
50,0.0002,0.524611
100,0.0001,0.519195
150,0.0,0.673676
200,0.0,0.721139
250,0.1956,0.64614


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5191954374313354


Step,Training Loss,Validation Loss
50,0.0,0.582721
100,0.0,0.681981
150,0.0,0.853226
200,0.0001,0.700709


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.5827210545539856


Step,Training Loss,Validation Loss
50,0.0,0.621567
100,0.0,0.720549
150,0.0,0.874499
200,0.0,0.785998


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.6215673089027405


Step,Training Loss,Validation Loss
50,0.0,0.657597
100,0.0,0.744783
150,0.0,0.894141
200,0.0,0.760313


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.6575970649719238


Step,Training Loss,Validation Loss
50,0.0,0.691847
100,0.0,0.7657
150,0.0,0.931461
200,0.0,0.713103


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.6918469071388245


Step,Training Loss,Validation Loss
50,0.0,0.720622
100,0.0,0.784996
150,0.0,0.942479
200,0.0001,0.851916


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.7206222414970398


Step,Training Loss,Validation Loss
50,0.0,0.745403
100,0.0,0.802914
150,0.0,0.953262
200,0.0141,0.878674


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.7454025149345398


Step,Training Loss,Validation Loss
50,0.0,0.767374
100,0.0,0.819839
150,0.0,0.966515
200,0.0192,0.943753


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.7673735022544861


Step,Training Loss,Validation Loss
50,0.0,0.780935
100,0.0,0.817554
150,0.0,0.884069
200,0.0235,0.917965


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.7809352278709412


Step,Training Loss,Validation Loss
50,0.0,0.793657
100,0.0,0.828173
150,0.0,0.879786
200,0.0307,0.924171


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.7936571836471558


Step,Training Loss,Validation Loss
50,0.0,0.805639
100,0.0,0.838295
150,0.0,0.880313
200,0.0381,0.92949


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.8056387305259705


Step,Training Loss,Validation Loss
50,0.0,0.816942
100,0.0,0.847945
150,0.0,0.884775
200,0.0446,0.934089


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.8169422745704651


Step,Training Loss,Validation Loss
50,0.0,0.827661
100,0.0,0.857165
150,0.0,0.888528
200,0.051,0.938456


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.8276609778404236


Step,Training Loss,Validation Loss
50,0.0,0.837791
100,0.0,0.866001
150,0.0,0.887888
200,0.0572,0.940752


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.8377912044525146


Step,Training Loss,Validation Loss
50,0.0,0.847425
100,0.0,0.874319
150,0.0,0.883511
200,0.0619,0.94116


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.8474252820014954


Step,Training Loss,Validation Loss
50,0.0,0.856591
100,0.0,0.88233
150,0.0,0.870409
200,0.0626,0.941718


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.8565911650657654


Step,Training Loss,Validation Loss
50,0.0,0.865344
100,0.0,0.890038
150,0.0,0.870794
200,0.0639,0.94248


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.8653441667556763
Best params: {'learning_rate': 5e-05, 'weight_decay': 0.01, 'num_train_epochs': 3}, Best eval loss: 0.2586076557636261


In [24]:
# Khởi tạo Trainer với Dừng sớm cho mô hình kết hợp với các siêu tham số tốt nhất
training_args_with_early_stop_combined = TrainingArguments(
    output_dir='./results_with_early_stop_combined',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop_combined',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop_combined = Trainer(
    model=model_classification,
    args=training_args_with_early_stop_combined,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [37]:
# Huấn luyện mô hình kết hợp với các siêu tham số tối ưu
trainer_with_early_stop_combined.train()

Step,Training Loss,Validation Loss
50,0.0,0.597734
100,0.0,0.65266
150,0.0,0.682007
200,0.0,0.705723


TrainOutput(global_step=200, training_loss=1.7527056188555433e-05, metrics={'train_runtime': 142.7468, 'train_samples_per_second': 42.053, 'train_steps_per_second': 5.275, 'total_flos': 210488844288000.0, 'train_loss': 1.7527056188555433e-05, 'epoch': 0.796812749003984})

In [38]:
# Dự đoán nhãn cho tập kiểm tra
predictions = trainer_with_early_stop_combined.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels = np.argmax(predictions.predictions, axis=1)

In [39]:
print(pred_labels[:])  # In ra dự đoán

[0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1
 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0
 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 0
 0 1 0 1 1 0 1 1 0 1 0 1 1 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1
 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 0
 1 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 1 0 0 0 1 1
 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 0 0 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0
 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1
 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 1 0 1 

In [40]:
# Tính các chỉ số
accuracy = accuracy_score(val_labels, pred_labels)
precision = precision_score(val_labels, pred_labels, pos_label=0)
recall = recall_score(val_labels, pred_labels, pos_label=0)
f1 = f1_score(val_labels, pred_labels, pos_label=0)
auc = roc_auc_score(val_labels, predictions.predictions[:, 1])

print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 Score: {f1:.6f}")
print(f'AUC: {auc:.6f}')

Accuracy: 0.932136
Precision: 0.940239
Recall: 0.925490
F1 Score: 0.932806
AUC: 0.978718


In [41]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels)
precision2 = precision_score(val_labels, pred_labels, pos_label=1)
recall2 = recall_score(val_labels, pred_labels, pos_label=1)
f12 = f1_score(val_labels, pred_labels, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.932136
Precision: 0.924000
Recall: 0.939024
F1 Score: 0.931452
AUC: 0.978718


In [42]:
# Dự đoán nhãn cho tập kiểm tra
predictions_combined_final = trainer_with_early_stop_combined.predict(test_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels_test = np.argmax(predictions_combined_final.predictions, axis=1)

# In kết quả cuối cùng
print("Dự đoán nhãn cho tập kiểm tra:", pred_labels_test)

Dự đoán nhãn cho tập kiểm tra: [0 0 1 0 0 0 0 1 1 0 0]
