In [None]:
import re
import torch
import numpy as np

from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

In [None]:
import pandas as pd
import io

In [None]:
# Mount Google Drive at /content/drive; later cells read the CSV splits from,
# and save the fine-tuned model to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the pretrained PhoBERT tokenizer and the bare PhoBERT encoder.
# NOTE(review): `model` is re-assigned further down to an
# AutoModelForSequenceClassification instance and nothing in between reads it,
# so this AutoModel load looks unnecessary — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base")

In [None]:
# Read the training and validation splits from the mounted Drive folder.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')

# Pull the text and label columns out of each split.
# (presumably 'content' is the article text and 'label' the class — verify against the CSVs)
train_texts, train_labels = train_df['content'], train_df['label']
val_texts, val_labels = val_df['content'], val_df['label']

In [None]:
# Tokenize the data with padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    """Encode ``texts`` with ``tokenizer`` using truncation and fixed-length padding.

    Args:
        texts: The texts to encode; forwarded to ``tokenizer`` as its first
            positional argument.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding='max_length', max_length=...)``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        Whatever ``tokenizer`` returns for these options.
    """
    # padding='max_length' adds padding so that all texts have the same length.
    options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tokenizer(texts, **options)

# Encode both splits with tokenize_and_pad, using max_length=256.
# NOTE(review): the next tokenization cell assigns train_encodings / val_encodings
# again, so these results are replaced before they are used — keep only one of the two.
train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)

In [None]:
# Tokenize both splits: truncate to at most 256 tokens and pad each split to the
# length of its longest sequence (padding=True).
#
# The legacy `truncation_strategy='longest_first'` keyword was removed from these
# calls: `truncation=True` already selects the longest-first strategy, so the extra
# argument was redundant, and newer transformers releases may not accept it.
#
# NOTE(review): this re-assigns the train_encodings / val_encodings produced by the
# tokenize_and_pad cell above; keep only one of the two tokenization cells.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=256)


In [None]:
# Check the data after tokenization: first the fields present in each encoding,
# then the token ids of the first two samples of each split.
print(train_encodings.keys())
print(val_encodings.keys())
print(train_encodings['input_ids'][:2])  # Show a few tokenized samples
print(val_encodings['input_ids'][:2])


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[[0, 1656, 8, 1347, 8915, 336, 5963, 2546, 620, 396, 30, 1302, 9412, 56669, 11, 197, 133, 151, 3634, 848, 99, 396, 123, 292, 336, 20014, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2925, 474, 2515, 23523, 34, 275, 262, 829, 133

In [None]:
# Convert the label columns to NumPy arrays and cast them to int.
train_labels = np.array(train_labels).astype(int)
val_labels = np.array(val_labels).astype(int)

In [None]:
# Check the labels: print the set of distinct values found in each split.
print(set(train_labels))
print(set(val_labels))


{0, 1}
{0, 1}


In [None]:
class FakeNewsDataset(Dataset):
    """Dataset that pairs per-sample encodings with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, values)`` pairs where
    ``values[idx]`` is the entry for sample ``idx``; ``labels`` is indexable by the
    same ``idx`` and its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``, including a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # squeeze() makes sure the label has the right shape (size-1 dims removed).
        sample['labels'] = torch.tensor(self.labels[idx]).squeeze()
        return sample

# Wrap each split's encodings and labels (converted to plain Python lists) in a dataset.
train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())

# Batched loaders with batch size 8; only the training loader shuffles.
# NOTE(review): the Trainer instances in this notebook are given the datasets
# directly, so these loaders appear to be unused — confirm before removing.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [None]:
# Load the classification model: the PhoBERT checkpoint with a
# sequence-classification head configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set up the training parameters
# NOTE(review): the recorded output of this cell reports global_step=429 for the
# full 3 epochs, which is fewer than warmup_steps=500 — the run ends before warmup
# completes. Consider a smaller warmup.
# NOTE(review): no evaluation schedule is configured here, so only the training
# loss is logged (every 10 steps).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.7065
20,0.7145
30,0.696
40,0.6934
50,0.6871
60,0.675
70,0.667
80,0.6489
90,0.6157
100,0.6002


TrainOutput(global_step=429, training_loss=0.4429106942979328, metrics={'train_runtime': 9759.135, 'train_samples_per_second': 0.351, 'train_steps_per_second': 0.044, 'total_flos': 450709237831680.0, 'train_loss': 0.4429106942979328, 'epoch': 3.0})

In [None]:
# Predict labels for the evaluation set: run the trainer over val_dataset, then
# pick, for every sample, the index of the largest logit.
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

In [None]:
# Compute the metrics on the validation set.
# Precision, recall and F1 are reported for class 0 (pos_label=0).
accuracy = accuracy_score(val_labels, pred_labels)
precision = precision_score(val_labels, pred_labels, pos_label=0)
recall = recall_score(val_labels, pred_labels, pos_label=0)
f1 = f1_score(val_labels, pred_labels, pos_label=0)

# Score each sample by its logit margin (class 1 minus class 0). The class-1 logit
# on its own does not rank samples the way the two-class softmax does, because the
# softmax probability depends on the difference between the two logits; the margin
# is monotonic in that probability, so it gives the classifier's actual ROC AUC.
# ROC AUC is unchanged when the positive class and the sign of the score are both
# flipped, so this value is equally valid with class 0 taken as positive.
val_logits = np.asarray(predictions.predictions)
auc = roc_auc_score(val_labels, val_logits[:, 1] - val_logits[:, 0])

print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 Score: {f1:.6f}")
print(f'AUC: {auc:.6f}')

Accuracy: 0.881119
Precision: 0.861111
Recall: 0.898551
F1 Score: 0.879433
AUC: 0.923423


In [None]:
# Save the model and the tokenizer to the local ./saved_model directory
# (a later cell reloads both from this path for inference).
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/bpe.codes',
 './saved_model/added_tokens.json')

In [None]:
# Save a second copy of the model and tokenizer under the mounted Google Drive folder.
output_dir = '/content/drive/MyDrive/Colab Notebooks/Report1/PhoBERT_fakenews'

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/Colab Notebooks/Report1/PhoBERT_fakenews/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Report1/PhoBERT_fakenews/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Report1/PhoBERT_fakenews/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/Report1/PhoBERT_fakenews/bpe.codes',
 '/content/drive/MyDrive/Colab Notebooks/Report1/PhoBERT_fakenews/added_tokens.json')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the saved classifier and its tokenizer from the local save directory.
model = AutoModelForSequenceClassification.from_pretrained('./saved_model')
tokenizer = AutoTokenizer.from_pretrained('./saved_model')
model.eval()  # make inference mode explicit before predicting

# Encode one sample sentence, truncated to the same 256-token limit that was used
# when tokenizing the training data, so an over-long input cannot exceed it.
# NOTE(review): the sample sentence looks mis-encoded in this view — confirm the
# notebook stores it as valid UTF-8 Vietnamese.
inputs = tokenizer(
    "S·∫≠p h·∫ßm thang Qu·∫£ng Ninh con s·ªë thi·ªát m·∫°ng kh·ªïng l·ªì",
    truncation=True,
    max_length=256,
    return_tensors="pt",
)
# Run the prediction without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Get the logits from the prediction result
logits = outputs.logits
print(logits)

from torch.nn.functional import softmax

# Convert the logits to probabilities
probs = softmax(logits, dim=1)
print(probs)

# Take the predicted class (the class with the highest probability)
predicted_class = torch.argmax(probs, dim=1)
print(predicted_class)


In [None]:
# Set up the training parameters (with early stopping)
# Evaluation and checkpointing both happen every 50 steps, and the best checkpoint
# is restored when training ends.
# NOTE(review): newer transformers releases name the `evaluation_strategy` argument
# `eval_strategy` — adjust if the installed version rejects it.
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Start this run from a freshly initialised classifier. At this point in the
# notebook `model` is the already fine-tuned checkpoint reloaded from
# './saved_model', so reusing it would continue the first run rather than train an
# independent model with early stopping, and the two runs would not be comparable.
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

# Initialize the Trainer (with early stopping): training stops once the monitored
# metric has not improved for 3 consecutive evaluations.
trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [None]:
# Run the training loop of the early-stopping trainer.
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.6669,0.6496
100,0.6088,0.536838
150,0.4336,0.401675


Step,Training Loss,Validation Loss
50,0.6669,0.6496
100,0.6088,0.536838
150,0.4336,0.401675
200,0.4762,0.414312
250,0.2463,0.322671
300,0.2253,0.360279
350,0.3378,0.591495
400,0.3284,0.309345
450,0.4214,0.387609
500,0.3802,0.853271


TrainOutput(global_step=550, training_loss=0.37616828766736116, metrics={'train_runtime': 14967.261, 'train_samples_per_second': 0.375, 'train_steps_per_second': 0.047, 'total_flos': 577265655459840.0, 'train_loss': 0.37616828766736116, 'epoch': 3.900709219858156})

In [None]:
# Predict labels for the evaluation set with the early-stopping trainer.
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Get the predicted labels from the logits (index of the largest logit per sample).
pred_labels2 = predictions2.predictions.argmax(axis=1)

In [None]:
# Print some predictions as a sanity check
print(pred_labels2[:10])  # Print the first 10 predictions

[0 0 0 1 0 1 1 0 1 0]


In [None]:
# Print every prediction
print(pred_labels2)

[0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1
 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 1 1
 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 1 1 0
 0 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 1 1 1 0 0 1 0 0 0 1
 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0
 0 0 0 0 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 0 1 1 1 1 0 0
 1 0 0 0 1 0 0 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0 1]


In [None]:
# Compute the metrics for the early-stopping run on the validation set.
# Precision, recall and F1 are reported for class 0 (pos_label=0).
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)

# Score each sample by its logit margin (class 1 minus class 0). The class-1 logit
# on its own does not rank samples the way the two-class softmax does, because the
# softmax probability depends on the difference between the two logits; the margin
# is monotonic in that probability, so it gives the classifier's actual ROC AUC.
# ROC AUC is unchanged when the positive class and the sign of the score are both
# flipped, so this value is equally valid with class 0 taken as positive.
val_logits2 = np.asarray(predictions2.predictions)
auc2 = roc_auc_score(val_labels, val_logits2[:, 1] - val_logits2[:, 0])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.872340
Precision: 0.850649
Recall: 0.909722
F1 Score: 0.879195
AUC: 0.947665
