In [None]:
# !pip install transformers datasets torch scikit-learn pandas numpy tqdm

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score
import torch

In [None]:
# Dataset locations, relative to the notebook's working directory
UIT_PATH = "UIT-VSMEC"
DATA_PATH = "data"


def _read_uit_split(file_name):
    """Read one UIT-VSMEC CSV and rename Sentence/Emotion to text/label."""
    split_frame = pd.read_csv(os.path.join(UIT_PATH, file_name))
    return split_frame.rename(columns={"Sentence": "text", "Emotion": "label"})


def _read_augmented(file_name):
    """Read a custom augmented CSV, keep text/emotion, rename emotion to label."""
    augmented_frame = pd.read_csv(os.path.join(DATA_PATH, file_name))
    return augmented_frame[["text", "emotion"]].rename(columns={"emotion": "label"})


# UIT-VSMEC splits on the shared text/label schema
train_uit = _read_uit_split("train.csv")
valid_uit = _read_uit_split("valid.csv")
test_uit = _read_uit_split("test.csv")

# Custom data: data.csv uses Comment/Emotion headers, the augmented files use
# text/emotion headers
data1 = pd.read_csv(os.path.join(DATA_PATH, "data.csv")).rename(
    columns={"Comment": "text", "Emotion": "label"}
)
data2 = _read_augmented("augmented_dataset.csv")
data3 = _read_augmented("temp_augmented.csv")

# The custom rows only extend the training split; validation and test stay
# pure UIT-VSMEC (valid_df / test_df are the same objects as valid_uit / test_uit)
custom_train = pd.concat([data1, data2, data3], ignore_index=True)
train_df = pd.concat([train_uit, custom_train], ignore_index=True)
valid_df = valid_uit
test_df = test_uit

print("Train shape:", train_df.shape)
print("Labels:", sorted(train_df["label"].unique()))

Train shape: (6961, 2)
Labels: ['Accepting', 'Anger', 'Disappointed', 'Disgust', 'Enjoyment', 'Fear', 'Highly negative', 'Hopeless', 'Hurt', 'Indifferent', 'Loneliness', 'Lonely', 'Neutral', 'Other', 'Sadness', 'Spam', 'Surprise']


In [6]:
# Normalise label spelling in every split: trim surrounding whitespace and
# title-case the name, so that e.g. "Highly negative" becomes "Highly Negative".
# NOTE(review): "Lonely" and "Loneliness" are still separate classes after this
# step — confirm whether they are meant to be one label.
for split_df in (train_df, valid_df, test_df):
    split_df["label"] = split_df["label"].str.strip().str.title()

In [7]:
# Wrap each split in a Hugging Face Dataset, keeping only the two model columns
MODEL_COLUMNS = ["text", "label"]
train_dataset = Dataset.from_pandas(train_df[MODEL_COLUMNS])
valid_dataset = Dataset.from_pandas(valid_df[MODEL_COLUMNS])
test_dataset = Dataset.from_pandas(test_df[MODEL_COLUMNS])

# Label vocabulary is taken from the training split only; ids follow the
# sorted order of the label names
labels = sorted(train_df["label"].unique())
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print("label2id:", label2id)

label2id: {'Accepting': 0, 'Anger': 1, 'Disappointed': 2, 'Disgust': 3, 'Enjoyment': 4, 'Fear': 5, 'Highly Negative': 6, 'Hopeless': 7, 'Hurt': 8, 'Indifferent': 9, 'Loneliness': 10, 'Lonely': 11, 'Neutral': 12, 'Other': 13, 'Sadness': 14, 'Spam': 15, 'Surprise': 16}


In [8]:
# Checkpoint identifier passed to from_pretrained for both the tokenizer (here)
# and the classification model
MODEL_NAME = "vinai/phobert-base"
# Tokenizer loaded from the same checkpoint so its vocabulary matches the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of examples with the notebook's tokenizer.

    Args:
        examples: Batch mapping whose "text" entry holds the input strings
            (this function is applied with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        and left unpadded.
    """
    # No padding here: inside a batched map, padding=True pads every row to the
    # longest sequence of its map batch and stores that padding in the dataset.
    # The Trainer in this notebook is constructed with the tokenizer, so its
    # default collator pads each mini-batch to that batch's own longest sequence.
    # NOTE(review): PhoBERT's model card asks for word-segmented input text —
    # confirm the CSV text has been segmented upstream.
    return tokenizer(examples["text"], truncation=True, max_length=128)

def encode_labels(example):
    """Replace the string label of one example with its integer id.

    Args:
        example: Single-row mapping with a "label" entry holding a label name.

    Returns:
        The same mapping, with "label" overwritten by ``label2id[name]``.

    Raises:
        KeyError: If the label name is not a key of ``label2id``.
    """
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

def _prepare_split(dataset):
    """Tokenize a split, encode its labels, and expose the model columns as torch tensors."""
    prepared = dataset.map(tokenize_function, batched=True).map(encode_labels)
    # set_format works in place; only these columns are returned when indexing
    prepared.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    return prepared


# Apply the same preparation to every split
train_tokenized = _prepare_split(train_dataset)
valid_tokenized = _prepare_split(valid_dataset)
test_tokenized = _prepare_split(test_dataset)

Map: 100%|██████████| 6961/6961 [00:03<00:00, 2023.66 examples/s]
Map: 100%|██████████| 6961/6961 [00:00<00:00, 8204.74 examples/s]
Map: 100%|██████████| 686/686 [00:00<00:00, 3261.90 examples/s]
Map: 100%|██████████| 686/686 [00:00<00:00, 9435.23 examples/s]
Map: 100%|██████████| 693/693 [00:00<00:00, 3479.16 examples/s]
Map: 100%|██████████| 693/693 [00:00<00:00, 9351.89 examples/s]


In [10]:
# Sequence-classification model built from the same checkpoint as the tokenizer,
# with one output per label and both label maps handed to from_pretrained
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` must support
            ``argmax(axis=1)`` — presumably per-class scores of shape
            (n_examples, n_labels); confirm against the Trainer output.

    Returns:
        Dict with keys "accuracy" and "f1".
    """
    scores, label_ids = eval_pred
    # Predicted class = index of the highest score along axis 1
    predicted_ids = scores.argmax(axis=1)
    return {
        "accuracy": accuracy_score(label_ids, predicted_ids),
        "f1": f1_score(label_ids, predicted_ids, average="weighted"),
    }

training_args = TrainingArguments(
    output_dir="./phobert-emotion",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value alongside load_best_model_at_end
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, reload the checkpoint with the highest "f1" as reported
    # by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)

# Trainer wiring: trains on the combined training split and evaluates on the
# validation split each time evaluation runs.
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated for Trainer.__init__ and emitted a warning on this call.
# NOTE(review): `processing_class` needs transformers >= 4.46 — confirm the
# installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [11]:
# Fine-tune the model with the configured Trainer
trainer.train()

# Evaluate on the test set
test_results = trainer.evaluate(eval_dataset=test_tokenized)
print("Test results:", test_results)

# Save the model and the tokenizer into the same directory
FINAL_MODEL_DIR = "./phobert-emotion-final"
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.318678,0.54519,0.495047
2,1.687300,1.209537,0.593294,0.564196
3,1.176000,1.195321,0.593294,0.572367




Test results: {'eval_loss': 1.2105369567871094, 'eval_accuracy': 0.5916305916305916, 'eval_f1': 0.568616795006812, 'eval_runtime': 35.57, 'eval_samples_per_second': 19.483, 'eval_steps_per_second': 1.237, 'epoch': 3.0}


('./phobert-emotion-final\\tokenizer_config.json',
 './phobert-emotion-final\\special_tokens_map.json',
 './phobert-emotion-final\\vocab.txt',
 './phobert-emotion-final\\bpe.codes',
 './phobert-emotion-final\\added_tokens.json')