In [1]:
# ======================================================
# 0. IMPORT THƯ VIỆN & LOAD DATA UIT-VSMEC
# ======================================================
import os
import re
import unicodedata
import random
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Nếu đọc .xlsx lỗi thì bật dòng dưới (thường Kaggle có sẵn rồi)
# !pip install --quiet openpyxl

# Đường dẫn Kaggle dataset (thay bằng tên dataset nếu bạn đổi)
DATA_DIR = "/kaggle/input/uit-data-nhatnam2"

train_path = os.path.join(DATA_DIR, "train_nor_811.xlsx")
valid_path = os.path.join(DATA_DIR, "valid_nor_811.xlsx")
test_path  = os.path.join(DATA_DIR, "test_nor_811.xlsx")

train_raw = pd.read_excel(train_path)
valid_raw = pd.read_excel(valid_path)
test_raw  = pd.read_excel(test_path)

print("Train raw shape:", train_raw.shape)
print("Valid raw shape:", valid_raw.shape)
print("Test  raw shape:", test_raw.shape)
print("Columns:", train_raw.columns.tolist())


Train raw shape: (5548, 3)
Valid raw shape: (686, 3)
Test  raw shape: (693, 3)
Columns: ['Unnamed: 0', 'Emotion', 'Sentence']


In [2]:
# ======================================================
# 1. PHÁT HIỆN CỘT TEXT & LABEL + TIỀN XỬ LÝ CƠ BẢN
# ======================================================

def detect_text_and_label_cols(df):
    cols = list(df.columns)

    # ---- đoán cột text ----
    text_candidates = [
        "text", "comment", "sentence", "content", "review",
        "message", "utterance", "cmt"
    ]
    label_candidates = [
        "label", "labels", "emotion", "sentiment", "class", "target"
    ]

    # tìm cột text theo tên
    text_col = None
    lower_map = {c.lower(): c for c in cols}
    for cand in text_candidates:
        if cand in lower_map:
            text_col = lower_map[cand]
            break

    # nếu không tìm được theo tên thì chọn cột string dài nhất
    if text_col is None:
        obj_cols = df.select_dtypes(include=["object"]).columns
        if len(obj_cols) == 0:
            raise ValueError("Không tìm được cột text (không có cột kiểu object).")
        best_col = None
        best_len = -1
        for c in obj_cols:
            avg_len = df[c].astype(str).str.len().mean()
            if avg_len > best_len:
                best_len = avg_len
                best_col = c
        text_col = best_col

    # ---- đoán cột label ----
    label_col = None
    for cand in label_candidates:
        if cand in lower_map and lower_map[cand] != text_col:
            label_col = lower_map[cand]
            break

    # nếu không tìm được theo tên → chọn cột có ít giá trị khác nhau
    if label_col is None:
        candidate_cols = [
            c for c in cols
            if c != text_col and df[c].nunique() >= 2 and df[c].nunique() <= 50
        ]
        if not candidate_cols:
            raise ValueError("Không tìm được cột label phù hợp.")
        best_col = None
        best_nunique = 999999
        for c in candidate_cols:
            nunique = df[c].nunique()
            if nunique < best_nunique:
                best_nunique = nunique
                best_col = c
        label_col = best_col

    return text_col, label_col


def prepare_df(df):
    text_col, label_col = detect_text_and_label_cols(df)
    print(f"Detected text column:  {text_col}")
    print(f"Detected label column: {label_col}")

    df = df[[text_col, label_col]].copy()
    df.columns = ["text", "label"]
    df = df.dropna(subset=["text", "label"])

    # ép kiểu string
    df["text"] = df["text"].astype(str)
    df["label"] = df["label"].astype(str)
    return df


train_df = prepare_df(train_raw)
valid_df = prepare_df(valid_raw)
test_df  = prepare_df(test_raw)

print("\nLabel distribution (train):")
print(train_df["label"].value_counts())


Detected text column:  Sentence
Detected label column: Emotion
Detected text column:  Sentence
Detected label column: Emotion
Detected text column:  Sentence
Detected label column: Emotion

Label distribution (train):
label
Enjoyment    1558
Disgust      1071
Other        1021
Sadness       947
Anger         391
Fear          318
Surprise      242
Name: count, dtype: int64


In [3]:
# ======================================================
# 1.1. HÀM CHUẨN HOÁ TEXT
# ======================================================
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)

def normalize_text(s: str) -> str:
    s = unicodedata.normalize("NFC", s)
    # thay URL bằng token chung
    s = URL_RE.sub(" URL ", s)
    # bỏ bớt khoảng trắng thừa
    s = re.sub(r"\s+", " ", s)
    return s.strip()

for df in (train_df, valid_df, test_df):
    df["text"] = df["text"].map(normalize_text)

print(train_df.head())


                                                text      label
0              cho mình xin bài nhạc tên là gì với ạ      Other
1  cho đáng đời con quỷ . về nhà lôi con nhà mày ...    Disgust
2  lo học đi . yêu đương lol gì hay lại thích học...    Disgust
3    uớc gì sau này về già vẫn có thể như cụ này :))  Enjoyment
4  mỗi lần có video của con là cứ coi đi coi lại ...  Enjoyment


In [4]:
# ======================================================
# 1.2. SET SEED CHUNG (CHO TF-IDF + BiLSTM)
# ======================================================
import numpy as np
import torch
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [5]:
# ======================================================
# 2. BASELINE 1 – TF-IDF + LINEAR SVM (SCIKIT-LEARN)
# ======================================================

# Lấy dữ liệu
X_train = train_df["text"].tolist()
y_train = train_df["label"].tolist()

X_valid = valid_df["text"].tolist()
y_valid = valid_df["label"].tolist()

X_test  = test_df["text"].tolist()
y_test  = test_df["label"].tolist()

# Pipeline: TF-IDF (uni+bi-gram) -> Linear SVM
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),   # unigram + bigram
        min_df=2,
        max_df=0.95,
        max_features=50000,
        sublinear_tf=True
    )),
    ("clf", LinearSVC(C=1.0))
])

# Huấn luyện
svm_pipeline.fit(X_train, y_train)

def eval_split_svm(name, X, y_true):
    y_pred = svm_pipeline.predict(X)
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")

    print(f"\n===== TF-IDF + SVM - {name} =====")
    print(f"Accuracy   : {acc:.4f}")
    print(f"Macro F1   : {macro_f1:.4f}")
    print(f"Weighted F1: {weighted_f1:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred))

# Đánh giá trên train / valid / test
eval_split_svm("Train", X_train, y_train)
eval_split_svm("Valid", X_valid, y_valid)
eval_split_svm("Test",  X_test,  y_test)



===== TF-IDF + SVM - Train =====
Accuracy   : 0.9941
Macro F1   : 0.9937
Weighted F1: 0.9941

Classification report:
              precision    recall  f1-score   support

       Anger       1.00      0.99      1.00       391
     Disgust       0.99      0.99      0.99      1071
   Enjoyment       0.99      1.00      0.99      1558
        Fear       0.99      0.99      0.99       318
       Other       1.00      0.99      0.99      1021
     Sadness       1.00      1.00      1.00       947
    Surprise       0.99      0.99      0.99       242

    accuracy                           0.99      5548
   macro avg       0.99      0.99      0.99      5548
weighted avg       0.99      0.99      0.99      5548


===== TF-IDF + SVM - Valid =====
Accuracy   : 0.5569
Macro F1   : 0.5056
Weighted F1: 0.5529

Classification report:
              precision    recall  f1-score   support

       Anger       0.47      0.39      0.43        49
     Disgust       0.56      0.59      0.57       135
   E

In [6]:
# ======================================================
# 3. BASELINE 2 – BiLSTM + WORD EMBEDDING (PYTORCH)
# ======================================================

# 3.1. Xây vocab từ train
def simple_tokenize(text):
    # ở đây dùng split theo khoảng trắng vì text đã clean
    return text.strip().split()

counter = Counter()
for t in train_df["text"]:
    counter.update(simple_tokenize(t))

max_vocab_size = 30000
most_common = counter.most_common(max_vocab_size - 2)  # trừ <pad>, <unk>

itos = ["<pad>", "<unk>"] + [w for w, _ in most_common]
stoi = {w: i for i, w in enumerate(itos)}

pad_idx = stoi["<pad>"]
unk_idx = stoi["<unk>"]
vocab_size = len(itos)
print("Vocab size:", vocab_size)

# 3.2. Encode câu -> sequence id (padding)
def encode_sentence(text, max_len=50):
    tokens = simple_tokenize(text)
    ids = [stoi.get(tok, unk_idx) for tok in tokens][:max_len]
    if len(ids) < max_len:
        ids += [pad_idx] * (max_len - len(ids))
    return ids

max_len = 128   # có thể chỉnh sau nếu muốn

# 3.3. Mapping label -> id (giữ cố định để dùng lại)
label_list = sorted(train_df["label"].unique())
label2id = {lb: i for i, lb in enumerate(label_list)}
id2label = {i: lb for lb, i in label2id.items()}
num_labels = len(label_list)
print("Labels:", label2id)


class VSMECDataset(Dataset):
    def __init__(self, df, max_len):
        self.texts = df["text"].tolist()
        self.labels = [label2id[lb] for lb in df["label"].tolist()]
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        x = torch.tensor(encode_sentence(self.texts[idx], max_len=self.max_len),
                         dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

train_dataset = VSMECDataset(train_df, max_len=max_len)
valid_dataset = VSMECDataset(valid_df, max_len=max_len)
test_dataset  = VSMECDataset(test_df,  max_len=max_len)

batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)


Vocab size: 4644
Labels: {'Anger': 0, 'Disgust': 1, 'Enjoyment': 2, 'Fear': 3, 'Other': 4, 'Sadness': 5, 'Surprise': 6}


In [7]:
# 3.4. Định nghĩa mô hình BiLSTM

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids):
        x = self.embedding(input_ids)         # (B, T, E)
        output, (h_n, c_n) = self.lstm(x)     # h_n: (2, B, H) vì bidirectional & 1 layer
        h_forward = h_n[-2]                   # (B, H)
        h_backward = h_n[-1]                  # (B, H)
        h_cat = torch.cat([h_forward, h_backward], dim=1)  # (B, 2H)
        logits = self.fc(self.dropout(h_cat))             # (B, num_labels)
        return logits

embed_dim = 200
hidden_dim = 128

model_bilstm = BiLSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_labels=num_labels,
    pad_idx=pad_idx,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_bilstm.parameters(), lr=1e-3)

print(model_bilstm)


BiLSTMClassifier(
  (embedding): Embedding(4644, 200, padding_idx=0)
  (lstm): LSTM(200, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=7, bias=True)
)


In [8]:
# 3.5. Hàm evaluate cho BiLSTM

def evaluate_bilstm(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            logits = model(x_batch)
            preds = torch.argmax(logits, dim=-1)
            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(y_batch.cpu().numpy().tolist())
    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    weighted_f1 = f1_score(all_labels, all_preds, average="weighted")
    return acc, macro_f1, weighted_f1, np.array(all_labels), np.array(all_preds)


In [9]:
# 3.6. Train loop + Early Stopping (theo macro F1 valid)

from copy import deepcopy

n_epochs = 20
best_val_macro_f1 = 0.0
best_state_dict = None
patience = 3
patience_counter = 0

for epoch in range(1, n_epochs + 1):
    model_bilstm.train()
    total_loss = 0.0

    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model_bilstm(x_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x_batch.size(0)

    train_loss = total_loss / len(train_loader.dataset)
    val_acc, val_macro_f1, val_weighted_f1, _, _ = evaluate_bilstm(model_bilstm, valid_loader)

    print(f"Epoch {epoch:02d} | "
          f"train_loss={train_loss:.4f} | "
          f"val_acc={val_acc:.4f} | "
          f"val_macro_f1={val_macro_f1:.4f} | "
          f"val_weighted_f1={val_weighted_f1:.4f}")

    # Early stopping
    if val_macro_f1 > best_val_macro_f1:
        best_val_macro_f1 = val_macro_f1
        best_state_dict = deepcopy(model_bilstm.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping!")
            break

# Load best weights
if best_state_dict is not None:
    model_bilstm.load_state_dict(best_state_dict)


Epoch 01 | train_loss=1.7607 | val_acc=0.3557 | val_macro_f1=0.1626 | val_weighted_f1=0.2748
Epoch 02 | train_loss=1.5159 | val_acc=0.4184 | val_macro_f1=0.2964 | val_weighted_f1=0.3886
Epoch 03 | train_loss=1.3495 | val_acc=0.4475 | val_macro_f1=0.3427 | val_weighted_f1=0.4337
Epoch 04 | train_loss=1.0820 | val_acc=0.4504 | val_macro_f1=0.3918 | val_weighted_f1=0.4472
Epoch 05 | train_loss=0.8464 | val_acc=0.4927 | val_macro_f1=0.4251 | val_weighted_f1=0.4842
Epoch 06 | train_loss=0.6583 | val_acc=0.4913 | val_macro_f1=0.4267 | val_weighted_f1=0.4867
Epoch 07 | train_loss=0.4899 | val_acc=0.4971 | val_macro_f1=0.4306 | val_weighted_f1=0.4929
Epoch 08 | train_loss=0.3472 | val_acc=0.5102 | val_macro_f1=0.4529 | val_weighted_f1=0.5026
Epoch 09 | train_loss=0.2491 | val_acc=0.5029 | val_macro_f1=0.4264 | val_weighted_f1=0.4929
Epoch 10 | train_loss=0.1748 | val_acc=0.5000 | val_macro_f1=0.4469 | val_weighted_f1=0.4961
Epoch 11 | train_loss=0.1330 | val_acc=0.5102 | val_macro_f1=0.4555 | 

In [10]:
# 3.7. ĐÁNH GIÁ BiLSTM TRÊN TEST

test_acc, test_macro_f1, test_weighted_f1, y_true, y_pred = evaluate_bilstm(model_bilstm, test_loader)

print("\n===== BiLSTM Test Results =====")
print(f"Accuracy   : {test_acc:.4f}")
print(f"Macro F1   : {test_macro_f1:.4f}")
print(f"Weighted F1: {test_weighted_f1:.4f}")

print("\nClassification report (BiLSTM):")
print(classification_report(
    y_true,
    y_pred,
    target_names=[id2label[i] for i in range(num_labels)]
))



===== BiLSTM Test Results =====
Accuracy   : 0.4675
Macro F1   : 0.4271
Weighted F1: 0.4648

Classification report (BiLSTM):
              precision    recall  f1-score   support

       Anger       0.33      0.25      0.29        40
     Disgust       0.44      0.49      0.46       132
   Enjoyment       0.56      0.54      0.55       193
        Fear       0.53      0.50      0.52        46
       Other       0.37      0.45      0.41       129
     Sadness       0.49      0.48      0.48       116
    Surprise       0.54      0.19      0.28        37

    accuracy                           0.47       693
   macro avg       0.47      0.42      0.43       693
weighted avg       0.47      0.47      0.46       693

