In [12]:
import io
import re
import torch
import numpy as np
import pandas as pd

from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader

from google.colab import files

from imblearn.over_sampling import RandomOverSampler

from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from transformers import (AutoTokenizer, AutoModel,
                          AutoModelForSequenceClassification, Trainer,
                          TrainingArguments, EarlyStoppingCallback, AdamW)

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Report1/val_data.csv')

train_texts = train_df['content']
train_labels = train_df['label']
val_texts = val_df['content']
val_labels = val_df['label']

In [15]:
# Tạo TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=4000)
tfidf_vectorizer.fit(train_texts)  # Fit trên tập huấn luyện

# Chuyển đổi dữ liệu thành TF-IDF vectors
train_tfidf_vectors = tfidf_vectorizer.transform(train_texts).toarray()
val_tfidf_vectors = tfidf_vectorizer.transform(val_texts).toarray()

In [16]:
# Tạo tokenizer và model từ PhoBERT
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModel.from_pretrained("vinai/phobert-base")



In [17]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    return encodings

train_encodings = tokenize_and_pad(train_texts.tolist(), tokenizer, max_length=256)
val_encodings = tokenize_and_pad(val_texts.tolist(), tokenizer, max_length=256)

In [18]:
class CombinedDataset(Dataset):
    def __init__(self, encodings, labels, tfidf_vectors):
        self.encodings = encodings
        self.labels = labels
        self.tfidf_vectors = tfidf_vectors

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        tfidf_vector = torch.tensor(self.tfidf_vectors[idx], dtype=torch.float32)
        return {'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': item['labels'], 'tfidf': tfidf_vector}

    def __len__(self):
        return len(self.labels)

In [19]:
train_dataset = CombinedDataset(train_encodings, train_labels.tolist(), train_tfidf_vectors)
val_dataset = CombinedDataset(val_encodings, val_labels.tolist(), val_tfidf_vectors)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [20]:
class CombinedModel(torch.nn.Module):
    def __init__(self, phobert_model, tfidf_dim):
        super(CombinedModel, self).__init__()
        self.phobert_model = phobert_model
        self.tfidf_dim = tfidf_dim
        self.fc = torch.nn.Linear(self.phobert_model.config.hidden_size + self.tfidf_dim, 2)
        self.loss_fn = CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, tfidf=None, labels=None):
        # Lấy outputs từ PhoBERT
        outputs = self.phobert_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        pooled_output = hidden_states[:, 0, :]  # Sử dụng hidden state của token [CLS]

        # Kết hợp TF-IDF với các đặc trưng từ PhoBERT
        combined_features = torch.cat((pooled_output, tfidf), dim=1)
        logits = self.fc(combined_features)

        # Tính toán mất mát nếu nhãn được cung cấp
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

In [21]:
# Tạo mô hình
model = CombinedModel(phobert_model, tfidf_dim=train_tfidf_vectors.shape[1])

In [22]:
# Định nghĩa hàm huấn luyện với các siêu tham số có thể tinh chỉnh
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [23]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

In [24]:
for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")



Step,Training Loss,Validation Loss
50,0.6772,0.66171
100,0.606,0.560139
150,0.4438,0.408701
200,0.4025,0.370667
250,0.2795,0.369231
300,0.2943,0.353284
350,0.304,0.472916
400,0.1728,0.359498




Step,Training Loss,Validation Loss
50,0.2014,0.360157
100,0.1654,0.400743
150,0.1159,0.440419
200,0.3112,0.604689
250,0.0516,0.39908
300,0.0686,0.546037
350,0.3335,0.476965
400,0.1949,0.413354
450,0.2185,0.423796
500,0.1867,0.456314




Step,Training Loss,Validation Loss
50,0.1786,0.377492
100,0.1777,0.417215
150,0.1123,0.449627
200,0.2379,0.417585
250,0.0622,0.482537
300,0.0881,0.476128
350,0.0653,0.394623
400,0.2219,0.402526
450,0.092,0.508748
500,0.1823,0.39229




Step,Training Loss,Validation Loss
50,0.1469,0.388251
100,0.1804,0.425369
150,0.1066,0.420176
200,0.1122,0.39942
250,0.1044,0.370686
300,0.0782,0.382447
350,0.0734,0.465466
400,0.5421,0.399812




Step,Training Loss,Validation Loss
50,0.1157,0.353882
100,0.0914,0.393989
150,0.0666,0.429986
200,0.1237,0.748716
250,0.1299,0.918957
300,0.1141,0.43966
350,0.2961,0.61822
400,0.3959,0.366396
450,0.1381,0.433654
500,0.2421,0.467663




Step,Training Loss,Validation Loss
50,0.066,0.376942
100,0.0623,0.432737
150,0.0621,0.477737
200,0.3484,0.743062
250,0.1128,0.579882
300,0.009,0.463485
350,0.2601,0.56028
400,0.2978,0.55448
450,0.1697,0.601459
500,0.5069,0.451907




Step,Training Loss,Validation Loss
50,0.0257,0.43971
100,0.0128,0.464108
150,0.0664,0.393036
200,0.0673,0.478236
250,0.0664,0.599447
300,0.1331,0.631865
350,0.148,0.864049
400,0.2433,0.483182




Step,Training Loss,Validation Loss
50,0.0098,0.461373
100,0.1231,0.455774
150,0.0081,0.471439
200,0.1497,0.62025
250,0.1288,0.500178
300,0.0661,0.597734
350,0.0852,0.468535
400,0.9001,0.810517
450,0.6825,0.724231
500,0.8558,0.923398




Step,Training Loss,Validation Loss
50,0.2884,0.406569
100,0.1661,0.338401
150,0.0221,0.409799
200,0.068,0.41416
250,0.106,0.514601
300,0.1257,0.653448
350,0.0605,0.462187
400,0.1315,0.55406
450,0.1076,0.407774
500,0.1294,0.498907




Step,Training Loss,Validation Loss
50,0.0492,0.359733
100,0.0801,0.412385
150,0.015,0.452832
200,0.0677,0.470651
250,0.0103,0.623688
300,0.0261,0.751341
350,0.0569,0.466909
400,0.1177,0.459539




Step,Training Loss,Validation Loss
50,0.0294,0.372452
100,0.0681,0.414691
150,0.0137,0.456715
200,0.0678,0.44504
250,0.0739,0.453598
300,0.0084,0.546798
350,0.0677,0.422501
400,0.115,0.420378
450,0.1206,0.462935
500,0.192,0.829457




Step,Training Loss,Validation Loss
50,0.0243,0.372153
100,0.0672,0.431611
150,0.0129,0.46922
200,0.0677,0.488524
250,0.0619,0.497714
300,0.0654,0.742621
350,0.1084,0.496766
400,0.069,0.51405
450,0.0073,0.442634
500,0.0074,0.474097




Step,Training Loss,Validation Loss
50,0.0209,0.405324
100,0.0667,0.436099
150,0.0142,0.452933
200,0.0677,0.479855
250,0.0451,0.681113
300,0.0073,0.611161
350,0.1162,0.436199
400,0.1341,0.363965




Step,Training Loss,Validation Loss
50,0.0069,0.459078
100,0.0052,0.471043
150,0.0057,0.509541
200,0.0633,0.610931
250,0.0034,0.501875
300,0.0031,0.550061
350,0.0026,0.563136
400,0.1743,0.541033
450,0.1469,0.503865
500,0.1596,0.560067




Step,Training Loss,Validation Loss
50,0.0069,0.469721
100,0.067,0.432397
150,0.0044,0.454934
200,0.0037,0.502577
250,0.003,0.510745
300,0.1277,0.910428
350,0.0025,0.886236
400,0.1309,1.022737
450,0.0684,0.619469
500,0.0807,0.613923




Step,Training Loss,Validation Loss
50,0.0053,0.436221
100,0.0588,0.470859
150,0.0035,0.473775
200,0.0726,1.032724
250,0.0026,0.64816
300,0.0021,0.551013
350,0.0017,0.925328
400,0.2131,0.64016




Step,Training Loss,Validation Loss
50,0.0049,0.439828
100,0.0499,0.474011
150,0.0033,0.479876
200,0.0025,0.521475
250,0.0022,0.460377
300,0.1383,0.79912
350,0.0018,0.67082
400,0.0194,0.642672
450,0.0728,0.537088
500,0.0818,0.656663




Step,Training Loss,Validation Loss
50,0.0043,0.454888
100,0.0043,0.475033
150,0.0027,0.511721
200,0.002,0.551547
250,0.0023,0.473182
300,0.0023,0.513027
350,0.0779,0.799896
400,0.003,0.595438
450,0.2051,0.444129
500,0.203,0.6197




Step,Training Loss,Validation Loss
50,0.0227,0.465187
100,0.1172,0.439968
150,0.002,0.443421
200,0.0027,0.408862
250,0.0017,0.542366
300,0.0015,0.539915
350,0.0522,0.492613
400,0.0011,0.730862




Step,Training Loss,Validation Loss
50,0.0018,0.422467
100,0.0017,0.444732
150,0.0015,0.445051
200,0.0014,0.426646
250,0.0013,0.614713
300,0.1223,0.445248
350,0.0011,0.731997
400,0.001,0.476702
450,0.0009,0.557144
500,0.2633,0.972152




Step,Training Loss,Validation Loss
50,0.0017,0.457455
100,0.0016,0.453243
150,0.0014,0.446274
200,0.0012,0.425876
250,0.0012,0.557612
300,0.0011,0.549464
350,0.0009,0.543859
400,0.0008,0.485356
450,0.0324,0.612303
500,0.2762,0.739208




Step,Training Loss,Validation Loss
50,0.0012,0.46586
100,0.0012,0.464904
150,0.001,0.464015
200,0.0009,0.466575
250,0.0008,0.570556
300,0.0007,0.56407
350,0.0007,0.795616
400,0.0791,0.676623




Step,Training Loss,Validation Loss
50,0.001,0.491723
100,0.0009,0.488165
150,0.0008,0.489832
200,0.0007,0.48974
250,0.0006,0.569444
300,0.0096,0.539944
350,0.0005,0.573557
400,0.0004,0.560435
450,0.0005,0.565203
500,0.1175,0.558196




Step,Training Loss,Validation Loss
50,0.0009,0.515174
100,0.0008,0.508224
150,0.0007,0.505328
200,0.0006,0.503225
250,0.0006,0.563879
300,0.0005,0.612167
350,0.0004,0.599626
400,0.0004,0.584931
450,0.0004,0.605845
500,0.1621,0.92354




Step,Training Loss,Validation Loss
50,0.0006,0.519025
100,0.0005,0.495171
150,0.0005,0.514153
200,0.0004,0.531521
250,0.0004,0.737454
300,0.0004,0.717497
350,0.0004,0.800727
400,0.0033,0.791769




Step,Training Loss,Validation Loss
50,0.0005,0.509085
100,0.0005,0.501784
150,0.0004,0.523025
200,0.0004,0.563379
250,0.0004,0.73527
300,0.0003,0.746677
350,0.0003,0.734335
400,0.0003,0.738892
450,0.0004,0.63388
500,0.0986,0.759663




Step,Training Loss,Validation Loss
50,0.0005,0.511663
100,0.0004,0.510706
150,0.0004,0.535209
200,0.0003,0.540051
250,0.0004,0.826353
300,0.0003,0.670096
350,0.0003,0.770805
400,0.0003,0.78543
450,0.0004,0.770695
500,0.1848,0.588724


Best params: {'learning_rate': 5e-05, 'weight_decay': 0.02, 'num_train_epochs': 5}, Best eval loss: 0.3384014666080475


In [25]:
# Khởi tạo Trainer với Dừng sớm (Early Stopping)
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)



In [27]:
trainer_with_early_stop.train()

Step,Training Loss,Validation Loss
50,0.0004,0.527441
100,0.0003,0.570453
150,0.0003,0.565561
200,0.0002,0.642088
250,0.1023,0.610466


TrainOutput(global_step=250, training_loss=0.016906124525237827, metrics={'train_runtime': 186.4432, 'train_samples_per_second': 30.143, 'train_steps_per_second': 3.781, 'total_flos': 0.0, 'train_loss': 0.016906124525237827, 'epoch': 1.773049645390071})

In [28]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [29]:
print(pred_labels2[:])  # In ra dự đoán

[0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 1
 0 1 1 1 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 1 1
 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0
 0 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 1 0 0 0 1
 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 1 0
 0 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 0 1
 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0
 1 0 1 0 1 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 1]


In [30]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')

Accuracy: 0.925532
Precision: 0.936170
Recall: 0.916667
F1 Score: 0.926316
AUC: 0.955264
