In [1]:
import re
import io
import torch
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.nn import CrossEntropyLoss

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']

In [4]:
# Tạo TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=4000)
tfidf_vectorizer.fit(train_texts)  # Fit trên tập huấn luyện

# Chuyển đổi dữ liệu thành TF-IDF vectors
train_tfidf_vectors = tfidf_vectorizer.transform(train_texts).toarray()
val_tfidf_vectors = tfidf_vectorizer.transform(val_texts).toarray()

In [5]:
# Tạo tokenizer và model từ PhoBERT
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModel.from_pretrained("vinai/phobert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [6]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)

In [7]:
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, tfidf_vectors):
        self.encodings = encodings
        self.labels = labels
        self.tfidf_vectors = tfidf_vectors

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        tfidf_vector = torch.tensor(self.tfidf_vectors[idx], dtype=torch.float32)
        return {'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': item['labels'], 'tfidf': tfidf_vector}

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = CombinedDataset(train_encodings, train_labels.tolist(), train_tfidf_vectors)
val_dataset = CombinedDataset(val_encodings, val_labels.tolist(), val_tfidf_vectors)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [9]:
class CombinedModel(torch.nn.Module):
    def __init__(self, phobert_model, tfidf_dim):
        super(CombinedModel, self).__init__()
        self.phobert_model = phobert_model
        self.tfidf_dim = tfidf_dim
        self.fc = torch.nn.Linear(self.phobert_model.config.hidden_size + self.tfidf_dim, 2)
        self.loss_fn = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, tfidf=None, labels=None):
        # Lấy outputs từ PhoBERT
        outputs = self.phobert_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        pooled_output = hidden_states[:, 0, :]  # Sử dụng hidden state của token [CLS]

        # Kết hợp TF-IDF với các đặc trưng từ PhoBERT
        combined_features = torch.cat((pooled_output, tfidf), dim=1)
        logits = self.fc(combined_features)

        # Tính toán mất mát nếu nhãn được cung cấp
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

In [10]:
# Tạo mô hình
model = CombinedModel(phobert_model, tfidf_dim=train_tfidf_vectors.shape[1])

In [11]:
# Thiết lập các tham số huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Khởi tạo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Huấn luyện mô hình
trainer.train()

Step,Training Loss
10,0.6963
20,0.6884
30,0.7027
40,0.6861
50,0.679
60,0.664
70,0.6508
80,0.6663
90,0.6397
100,0.6236


TrainOutput(global_step=423, training_loss=0.4569994859379798, metrics={'train_runtime': 169.6775, 'train_samples_per_second': 19.873, 'train_steps_per_second': 2.493, 'total_flos': 0.0, 'train_loss': 0.4569994859379798, 'epoch': 3.0})

In [12]:
# Dự đoán nhãn cho tập kiểm tra
predictions = trainer.predict(val_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

In [13]:
# Tính các chỉ số
accuracy = accuracy_score(val_labels, pred_labels)
precision = precision_score(val_labels, pred_labels, pos_label=0)
recall = recall_score(val_labels, pred_labels, pos_label=0)
f1 = f1_score(val_labels, pred_labels, pos_label=0)
auc = roc_auc_score(val_labels, predictions.predictions[:, 1])
print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 Score: {f1:.6f}")
print(f'AUC: {auc:.6f}')

Accuracy: 0.804965
Precision: 0.887931
Recall: 0.710345
F1 Score: 0.789272
AUC: 0.888296


In [17]:
# Thiết lập các tham số huấn luyện (có dừng sớm)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.015,
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Khởi tạo Trainer (có dừng sớm)
trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [18]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.223,0.319715
100,0.1724,0.336677


Step,Training Loss,Validation Loss
50,0.223,0.319715
100,0.1724,0.336677
150,0.1281,0.352031
200,0.1798,0.479914
250,0.2617,0.423528


TrainOutput(global_step=250, training_loss=0.21305989921092988, metrics={'train_runtime': 7085.4529, 'train_samples_per_second': 0.793, 'train_steps_per_second': 0.099, 'total_flos': 0.0, 'train_loss': 0.21305989921092988, 'epoch': 1.773049645390071})

In [19]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [21]:
# In ra một số dự đoán để kiểm tra
print(pred_labels2[:10])  # In ra 10 dự đoán đầu tiên

[0 0 0 1 1 1 1 0 1 0]


In [22]:
print(pred_labels2[:])  # In ra dự đoán

[0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 1 1
 0 1 1 1 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 1 1 0 0 1 1 1 0
 0 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 1 0 0 0 1
 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 1 0
 0 0 0 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 1 1 1 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 0 0
 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1]


In [23]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.875887
Precision: 0.897810
Recall: 0.854167
F1 Score: 0.875445
AUC: 0.931159


In [None]:
import os
# Đường dẫn để lưu mô hình
output_dir = '/content/drive/MyDrive/Colab Notebooks/Report1/TFIDF_fakenews'

# Tạo thư mục nếu nó chưa tồn tại
os.makedirs(output_dir, exist_ok=True)

# Đường dẫn để lưu mô hình
output_model_path = os.path.join(output_dir, 'PhoBERT.pt')

# Lưu mô hình
torch.save(model.state_dict(), output_model_path)