In [1]:
import io
import re
import torch
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

In [2]:
# 1. Load PhoBERT và Word2Vec
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

# Load mô hình Word2Vec từ Hugging Face Hub
word2vec_model = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/nlpl_74", filename="model.bin"), binary=True, unicode_errors="ignore")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.bin:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

In [4]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# 2. Load dữ liệu

train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/test_data.csv')

train_texts = train_df['content'].tolist()
train_labels = train_df['label'].tolist()
val_texts = val_df['content'].tolist()
val_labels = val_df['label'].tolist()
test_texts = test_df['content'].tolist()
test_labels = test_df['label'].tolist()

In [None]:
# 3. Tokenization với PhoBERT
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)
test_encodings = tokenize_and_pad(test_texts.tolist(), tokenizer, max_length=256)

train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)
test_labels = np.array(test_labels).astype(int)

In [None]:
# 4. Tạo embedding với Word2Vec
# Trích xuất embedding từ Word2Vec
def get_word2vec_embedding(text, model):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text.lower())  # Tokenize và chuyển sang chữ thường
    embeddings = []

    for word in words:
        if word in model:
            embeddings.append(model[word])

    if len(embeddings) == 0:
        return np.zeros(model.vector_size)  # Trả về vector không có thông tin nếu không có từ trong model
    return np.mean(embeddings, axis=0)  # Trung bình các embedding của các từ trong câu


In [None]:
# Kết hợp đặc trưng từ Phobert và Word2Vec
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels, texts, word2vec_model, phobert_model):
        self.encodings = encodings
        self.labels = labels
        self.texts = texts
        self.word2vec_model = word2vec_model
        self.phobert_model = phobert_model

    def __getitem__(self, idx):
        # Trích xuất các đặc trưng từ Phobert
        encoding = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        # Lấy input_ids và attention_mask 
        input_ids = encoding['input_ids'].unsqueeze(0)  
        attention_mask = encoding.get('attention_mask', torch.ones_like(input_ids)).unsqueeze(0)

        with torch.no_grad():
            outputs = self.phobert_model(input_ids=input_ids, attention_mask=attention_mask)

        phobert_embedding = outputs.last_hidden_state.mean(dim=1).squeeze()  # Mean pooling

        # Trích xuất đặc trưng từ Word2Vec
        word2vec_embedding = get_word2vec_embedding(self.texts[idx], self.word2vec_model)

        # Kết hợp PhoBERT và Word2Vec embeddings
        combined_embedding = torch.cat((phobert_embedding, torch.tensor(word2vec_embedding, dtype=torch.float32)), dim=0)

        encoding['embedding'] = combined_embedding
        encoding['labels'] = torch.tensor(self.labels[idx]).squeeze()

        return encoding



    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist(), train_texts.tolist(), word2vec_model, phobert_model)
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist(), val_texts.tolist(), word2vec_model, phobert_model)
test_dataset = FakeNewsDataset(test_encodings, test_labels.tolist(), test_texts.tolist(), word2vec_model, phobert_model)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

  self.w2v_embeddings = torch.tensor(w2v_embeddings, dtype=torch.float32)


In [None]:
# Tạo mô hình phân loại 
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

In [10]:
# Hàm huấn luyện cho mô hình kết hợp
def train_combined_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results_combined',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs_combined',
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        save_strategy="steps",
    )

    # Khởi tạo Trainer cho mô hình kết hợp
    trainer = Trainer(
        model=phobert_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [11]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

# Thực hiện Grid Search
for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_combined_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
50,0.6488,0.64944
100,0.5541,0.5066
150,0.361,0.396245
200,0.4774,0.515111
250,0.4246,0.33055
300,0.5068,0.370266
350,0.2368,0.568516
400,0.368,0.389619


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.3305502235889435


Step,Training Loss,Validation Loss
50,0.2447,0.329917
100,0.2239,0.358577
150,0.2333,0.507247
200,0.2602,0.633928


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.32991713285446167


Step,Training Loss,Validation Loss
50,0.2138,0.34891
100,0.2156,0.371411
150,0.2222,0.506338
200,0.1971,0.602999


Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.34890952706336975


Step,Training Loss,Validation Loss
50,0.207,0.389132
100,0.2164,0.379634
150,0.2538,0.501057
200,0.2738,0.683106
250,0.1603,0.433371


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.37963423132896423


Step,Training Loss,Validation Loss
50,0.16,0.419932
100,0.1065,0.521748
150,0.288,0.530758
200,0.4116,0.924236


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.41993197798728943


Step,Training Loss,Validation Loss
50,0.1358,0.48518
100,0.1178,0.525804
150,0.2828,0.59235
200,0.2036,0.697589


Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.48518046736717224


Step,Training Loss,Validation Loss
50,0.1199,0.487145
100,0.0932,0.492429
150,0.2907,0.603876
200,0.2127,0.694049


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.48714539408683777


Step,Training Loss,Validation Loss
50,0.1224,0.500627
100,0.1461,0.527552
150,0.2973,0.486776
200,0.3393,0.851609
250,0.2776,0.532681
300,0.3711,0.677942


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.4867757260799408


Step,Training Loss,Validation Loss
50,0.0692,0.524818
100,0.0991,0.486522
150,0.2321,0.474669
200,0.1847,0.329445
250,0.2829,0.46334
300,0.5233,0.474393
350,0.3461,0.646667


Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.3294447362422943


Step,Training Loss,Validation Loss
50,0.0069,0.378974
100,0.0017,0.449406
150,0.1722,0.529113
200,0.1839,0.525715


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.37897422909736633


Step,Training Loss,Validation Loss
50,0.0018,0.438783
100,0.001,0.509179
150,0.1872,0.525959
200,0.1251,0.475553


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.43878307938575745


Step,Training Loss,Validation Loss
50,0.001,0.491027
100,0.0007,0.52771
150,0.1753,0.611973
200,0.1978,0.584259


Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.4910270869731903


Step,Training Loss,Validation Loss
50,0.0008,0.516378
100,0.0007,0.540974
150,0.1847,0.521946
200,0.1449,0.452466
250,0.2768,0.516801
300,0.4054,0.365529
350,0.123,0.447237
400,0.1635,0.303181
450,0.2061,0.298154
500,0.2017,0.241953


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.2419527918100357


Step,Training Loss,Validation Loss
50,0.0349,0.270956
100,0.0044,0.338229
150,0.0008,0.433009
200,0.0841,0.469971


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.2709555923938751


Step,Training Loss,Validation Loss
50,0.0273,0.295678
100,0.0014,0.378645
150,0.0007,0.447837
200,0.0851,0.490317


Learning rate: 3e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.29567816853523254


Step,Training Loss,Validation Loss
50,0.0018,0.344348
100,0.0011,0.391809
150,0.0005,0.454845
200,0.0864,0.498349


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.3443484306335449


Step,Training Loss,Validation Loss
50,0.0012,0.385123
100,0.0043,0.443783
150,0.0009,0.414763
200,0.0965,0.486183


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.3851228654384613


Step,Training Loss,Validation Loss
50,0.0008,0.404593
100,0.0014,0.496928
150,0.0004,0.427087
200,0.103,0.538303


Learning rate: 3e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.4045927822589874


Step,Training Loss,Validation Loss
50,0.0006,0.427938
100,0.0005,0.475029
150,0.0003,0.492927
200,0.1027,0.516335


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.427937775850296


Step,Training Loss,Validation Loss
50,0.0005,0.449905
100,0.0004,0.486532
150,0.0003,0.50686
200,0.1037,0.526812


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.449905127286911


Step,Training Loss,Validation Loss
50,0.0004,0.470454
100,0.0003,0.486022
150,0.0002,0.56166
200,0.0998,0.604292


Learning rate: 2e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.470453679561615


Step,Training Loss,Validation Loss
50,0.0003,0.495096
100,0.0111,0.522785
150,0.0001,0.582924
200,0.1108,0.593123


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 0.4950956106185913


Step,Training Loss,Validation Loss
50,0.0002,0.525565
100,0.038,0.542603
150,0.0001,0.598423
200,0.1069,0.653458


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.5255650281906128


Step,Training Loss,Validation Loss
50,0.0002,0.553503
100,0.0002,0.607679
150,0.0001,0.611154
200,0.101,0.660007


Learning rate: 2e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 0.5535025596618652


Step,Training Loss,Validation Loss
50,0.0001,0.578744
100,0.0292,0.602371
150,0.0001,0.601569
200,0.111,0.6467


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 0.5787444114685059


Step,Training Loss,Validation Loss
50,0.0001,0.601098
100,0.0426,0.614747
150,0.0001,0.597877
200,0.11,0.628093
250,0.0001,0.543819
300,0.0001,0.723405
350,0.1883,0.399182
400,0.0003,0.536056
450,0.1155,0.519132
500,0.1418,0.533668


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 0.3991822302341461


Step,Training Loss,Validation Loss
50,0.0003,0.427671
100,0.0551,0.589275
150,0.0001,0.491046
200,0.0001,0.551047


Learning rate: 2e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 0.42767131328582764
Best params: {'learning_rate': 3e-05, 'weight_decay': 0.015, 'num_train_epochs': 3}, Best eval loss: 0.2419527918100357


In [13]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=phobert_model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [26]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.1042,0.416946
100,0.1313,0.484746
150,0.0004,0.401399
200,0.0863,0.375797
250,0.1486,0.477886
300,0.0643,0.576847
350,0.0056,0.488487
400,0.2646,0.413812


TrainOutput(global_step=400, training_loss=0.05901278503995854, metrics={'train_runtime': 667.9228, 'train_samples_per_second': 8.988, 'train_steps_per_second': 1.127, 'total_flos': 420056799882240.0, 'train_loss': 0.05901278503995854, 'epoch': 1.593625498007968})

In [27]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [29]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.950100
Precision: 0.925926
Recall: 0.980392
F1 Score: 0.952381
AUC: 0.985477


In [30]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=1)
recall2 = recall_score(val_labels, pred_labels2, pos_label=1)
f12 = f1_score(val_labels, pred_labels2, pos_label=1)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.950100
Precision: 0.978355
Recall: 0.918699
F1 Score: 0.947589
AUC: 0.985477
