In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datasets import Dataset

In [None]:
# Directory passed to `from_pretrained` for both the tokenizer and the classifier.
MODEL_PATH = "./phobert-emotion-model-final"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# `.to(device)` moves the weights and returns the same module object.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)

# Switch to inference mode (turns off dropout and similar training-only layers).
model.eval()

In [None]:
# Load the validation split. The "utf-8-sig" codec also accepts files that start
# with a UTF-8 byte-order mark.
valid_raw = pd.read_csv("data/UIT_valid_clean.csv", encoding="utf-8-sig")

# Single source of truth for the label vocabulary: emotion name -> class id,
# plus the inverse lookup and the set of accepted names.
label2id = {"Anger":0, "Disgust":1, "Fear":2, "Enjoyment":3, "Sadness":4, "Surprise":5, "Other":6}
id2label = {v: k for k, v in label2id.items()}
valid_emotions = set(label2id)

# Normalise label casing once, then keep only rows whose label is in the
# vocabulary (missing labels fail the `isin` test and are dropped too).
emotion_norm = valid_raw["Emotion"].str.capitalize()
known_label = emotion_norm.isin(valid_emotions)

# `.assign` returns a new frame, so the encoded column is written without
# assigning into a filtered slice (the pattern that raises SettingWithCopyWarning).
valid_df = valid_raw.loc[known_label].assign(
    Emotion=emotion_norm[known_label].map(label2id)
)

# Predict
def predict(texts):
    """Return the predicted class id for each input text.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        texts: Forwarded unchanged to the tokenizer; the evaluation loop in
            this notebook passes a list of strings.

    Returns:
        NumPy array holding, per input, the argmax index over the last
        dimension of the model's logits.
    """
    # Pad to the longest item in the batch and cut anything beyond 128 tokens.
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**encoded).logits
        class_ids = torch.argmax(logits, dim=-1)

    # Move back to the CPU before converting to NumPy.
    return class_ids.cpu().numpy()


In [None]:
# Gold class ids, in row order.
y_true = valid_df["Emotion"].tolist()

# Empty CSV cells are read back as NaN (a float), which the tokenizer rejects.
# Score such rows as empty strings so every gold label keeps a prediction.
texts = valid_df["Clean_sentence"].fillna("").astype(str).tolist()

# Run inference in fixed-size batches to bound memory use.
batch_size = 32
y_pred = []
for start in range(0, len(texts), batch_size):
    batch_preds = predict(texts[start:start + batch_size])
    # Store plain Python ints rather than NumPy scalars.
    y_pred.extend(batch_preds.tolist())

# Pin the label set explicitly. Without `labels=`, scikit-learn infers the
# classes from the data: if any emotion is absent from both y_true and y_pred,
# `target_names` no longer matches and classification_report raises, while the
# confusion matrix silently shrinks and its rows/columns stop lining up with
# the class ids.
class_names = list(label2id.keys())
class_ids = list(label2id.values())

print("🔹 Accuracy:", accuracy_score(y_true, y_pred))
print("\n🔹 Classification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

print("\n🔹 Confusion Matrix:")
# Rows are true classes, columns are predicted classes, both in class-id order.
print(confusion_matrix(y_true, y_pred, labels=class_ids))