In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from google.colab import drive

In [2]:
# Kết hợp Word2Vec
model_w2v = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/nlpl_74", filename="model.bin"), binary=True, unicode_errors="ignore")

# Mount Google Drive
drive.mount('/content/drive')

# Tải mô hình PhoBERT
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

Mounted at /content/drive


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [3]:
# Tải dữ liệu
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']
test_texts = test_df['content']
test_labels = test_df['label']

In [4]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

In [5]:
# Tạo Dataset
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).squeeze()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())
test_dataset = FakeNewsDataset(test_encodings, test_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [6]:
# Tải mô hình phân loại
model_classification = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Hàm huấn luyện cho mô hình kết hợp
def train_combined_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results_combined',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs_combined',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    # Khởi tạo Trainer cho mô hình kết hợp
    trainer = Trainer(
        model=model_classification,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [8]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

# Thực hiện Grid Search
for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_combined_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
50,0.649,0.646146
100,0.565,0.516002
150,0.4072,0.444807
200,0.4848,0.500291
250,0.364,0.362218
300,0.3491,0.462652
350,0.1754,0.45644
400,0.4947,0.328958
450,0.3949,0.37763
500,0.2995,0.342373


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.32895803451538086


Step,Training Loss,Validation Loss
50,0.1427,0.335088
100,0.1715,0.399333
150,0.1894,0.419747
200,0.2068,0.469256


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.33508771657943726


Step,Training Loss,Validation Loss
50,0.1337,0.375833
100,0.1791,0.380345
150,0.185,0.425032
200,0.1323,0.442481


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.37583285570144653


Step,Training Loss,Validation Loss
50,0.1005,0.387183
100,0.1882,0.380721
150,0.1765,0.417207
200,0.236,0.466476
250,0.1006,0.377302
300,0.0182,0.551999
350,0.0543,0.53282
400,0.554,0.406133


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.3773021697998047


Step,Training Loss,Validation Loss
50,0.0543,0.401479
100,0.0492,0.399214
150,0.0073,0.425099
200,0.0012,0.343373
250,0.0007,0.52017
300,0.0113,0.519139
350,0.077,0.505256


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.34337344765663147


Step,Training Loss,Validation Loss
50,0.0009,0.404656
100,0.0156,0.437463
150,0.0003,0.529794
200,0.0003,0.482427


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.4046560525894165


Step,Training Loss,Validation Loss
50,0.0006,0.410681
100,0.0004,0.381619
150,0.0002,0.478884
200,0.0002,0.517704
250,0.0613,0.746495


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.38161933422088623


Step,Training Loss,Validation Loss
50,0.0002,0.410688
100,0.0003,0.46707
150,0.0001,0.453828
200,0.0001,0.691549


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.41068753600120544


Step,Training Loss,Validation Loss
50,0.0001,0.56392
100,0.0002,0.540427
150,0.0001,0.47972
200,0.0585,0.642082
250,0.3172,1.448724
300,0.2902,0.537859


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.47971996665000916


Step,Training Loss,Validation Loss
50,0.0001,0.499046
100,0.0001,0.64403
150,0.0,0.594654
200,0.0,0.586623


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.4990459978580475


Step,Training Loss,Validation Loss
50,0.0001,0.58805
100,0.0,0.688551
150,0.0,0.625256
200,0.0,0.667571


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5880498290061951


Step,Training Loss,Validation Loss
50,0.0,0.63486
100,0.0,0.703563
150,0.0,0.755167
200,0.2838,0.723444


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.6348598003387451


Step,Training Loss,Validation Loss
50,0.0,0.671408
100,0.0,0.730769
150,0.0,0.997223
200,0.0002,0.828774


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.6714076399803162


Step,Training Loss,Validation Loss
50,0.0,0.700988
100,0.0,0.783992
150,0.0,0.728025
200,0.1607,0.70905


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.700987696647644


Step,Training Loss,Validation Loss
50,0.0,0.725938
100,0.0,0.814201
150,0.0,0.77583
200,0.0,0.800708


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.7259383201599121


Step,Training Loss,Validation Loss
50,0.0,0.749124
100,0.0,0.837482
150,0.0,0.873324
200,0.0,0.815281


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.7491241693496704


Step,Training Loss,Validation Loss
50,0.0,0.771336
100,0.0,0.853097
150,0.0,0.888813
200,0.0,0.939173


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.7713356018066406


Step,Training Loss,Validation Loss
50,0.0,0.791093
100,0.0,0.88311
150,0.0,0.94868
200,0.3851,0.749496
250,0.0513,0.710002
300,0.1318,0.680262
350,0.0983,0.710456
400,0.3174,0.669423
450,0.1467,0.548793
500,0.1703,0.324957


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.32495683431625366


Step,Training Loss,Validation Loss
50,0.0031,0.351137
100,0.0611,0.419878
150,0.0005,0.483393
200,0.0006,0.522226


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.35113683342933655


Step,Training Loss,Validation Loss
50,0.0018,0.388211
100,0.0655,0.455284
150,0.0003,0.508748
200,0.0058,0.577014


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.38821062445640564


Step,Training Loss,Validation Loss
50,0.0011,0.426285
100,0.0654,0.485054
150,0.0002,0.537461
200,0.0626,0.638225


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.42628517746925354


Step,Training Loss,Validation Loss
50,0.0007,0.463444
100,0.0669,0.510115
150,0.0002,0.568082
200,0.051,0.61969


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.4634435772895813


Step,Training Loss,Validation Loss
50,0.0004,0.497909
100,0.0661,0.533588
150,0.0001,0.591321
200,0.0054,0.607559


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.49790939688682556


Step,Training Loss,Validation Loss
50,0.0003,0.52978
100,0.0624,0.547534
150,0.0001,0.611234
200,0.0001,0.668654


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.5297799110412598


Step,Training Loss,Validation Loss
50,0.0002,0.56118
100,0.0513,0.573779
150,0.0001,0.631069
200,0.0001,0.582235


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.5611802339553833


Step,Training Loss,Validation Loss
50,0.0001,0.593469
100,0.0232,0.601944
150,0.0001,0.665587
200,0.0001,0.605153


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.5934694409370422


Step,Training Loss,Validation Loss
50,0.0001,0.632936
100,0.0055,0.643533
150,0.0,0.682517
200,0.0001,0.602159
250,0.0001,0.658235
300,0.1147,0.751349
350,0.0,0.686732


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.6021589636802673
Best params: {'learning_rate': 3e-05, 'weight_decay': 0.02, 'num_train_epochs': 5}, Best eval loss: 0.32495683431625366


In [22]:
# Khởi tạo Trainer với Dừng sớm cho mô hình kết hợp với các siêu tham số tốt nhất
training_args_with_early_stop_combined = TrainingArguments(
    output_dir='./results_with_early_stop_combined',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop_combined',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop_combined = Trainer(
    model=model_classification,
    args=training_args_with_early_stop_combined,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [61]:
# Huấn luyện mô hình kết hợp với các siêu tham số tối ưu
trainer_with_early_stop_combined.train()

Step,Training Loss,Validation Loss
50,0.0001,0.43784
100,0.0003,0.484668
150,0.0001,0.511893
200,0.0001,0.511981


TrainOutput(global_step=200, training_loss=0.008429550276050577, metrics={'train_runtime': 167.7834, 'train_samples_per_second': 59.63, 'train_steps_per_second': 7.48, 'total_flos': 210488844288000.0, 'train_loss': 0.008429550276050577, 'epoch': 0.796812749003984})

In [62]:
# Dự đoán nhãn cho tập kiểm tra
predictions = trainer_with_early_stop_combined.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels = np.argmax(predictions.predictions, axis=1)

In [63]:
print(pred_labels[:])  # In ra dự đoán

[0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 1 0 0 1
 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0
 1 1 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 0 1 0 1 1 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1
 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 0
 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 0 0 0 1
 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 1 0 1 1 0 0 1 0 1 0 0 1 1 1 1 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0
 1 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1
 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 1 0 1 

In [64]:
# Tính các chỉ số
accuracy = accuracy_score(val_labels, pred_labels)
precision = precision_score(val_labels, pred_labels, pos_label=0)
recall = recall_score(val_labels, pred_labels, pos_label=0)
f1 = f1_score(val_labels, pred_labels, pos_label=0)
auc = roc_auc_score(val_labels, predictions.predictions[:, 1])

print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 Score: {f1:.6f}")
print(f'AUC: {auc:.6f}')

Accuracy: 0.944112
Precision: 0.955823
Recall: 0.933333
F1 Score: 0.944444
AUC: 0.982704


In [65]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels)
precision2 = precision_score(val_labels, pred_labels, pos_label=1)
recall2 = recall_score(val_labels, pred_labels, pos_label=1)
f12 = f1_score(val_labels, pred_labels, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.944112
Precision: 0.932540
Recall: 0.955285
F1 Score: 0.943775
AUC: 0.982704


In [66]:
# Dự đoán nhãn cho tập kiểm tra
predictions_combined_final = trainer_with_early_stop_combined.predict(test_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels_test = np.argmax(predictions_combined_final.predictions, axis=1)

# In kết quả cuối cùng
print("Dự đoán nhãn cho tập kiểm tra:", pred_labels_test)

Dự đoán nhãn cho tập kiểm tra: [0 0 1 0 0 0 0 1 1 0 0]
