In [27]:
import pandas as pd
from pathlib import Path

In [28]:

import os

# Dataset directory. The default is the original local path; set the
# VIHSD_DATA_DIR environment variable to point the notebook at another
# location without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "VIHSD_DATA_DIR",
        r"D:\003. HK1 - Năm 3\04. NLP\CS221_NLP_SA\UIT-ViHSD-preprocessed",
    )
)

# Load the train / dev / test splits.
train_df = pd.read_csv(DATA_DIR / "train.csv")
dev_df   = pd.read_csv(DATA_DIR / "dev.csv")
test_df  = pd.read_csv(DATA_DIR / "test.csv")


In [29]:
# Separate the input text column from the label column for every split.
X_train, y_train = train_df["free_text"], train_df["label_id"]
X_dev, y_dev = dev_df["free_text"], dev_df["label_id"]
X_test, y_test = test_df["free_text"], test_df["label_id"]


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [31]:
# Helper used by every model search: fit on TRAIN, score on DEV.
def evaluate_on_dev(model, X_train_vec, X_dev_vec, y_train, y_dev):
    """Fit ``model`` on the training split and score it on the dev split.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's object is trained afterwards.
        X_train_vec: vectorized training features passed to ``fit``.
        X_dev_vec: vectorized dev features passed to ``predict``.
        y_train: training labels.
        y_dev: dev labels the predictions are compared against.

    Returns:
        The value of ``f1_score(y_dev, predictions, average="macro")``.
    """
    model.fit(X_train_vec, y_train)
    dev_predictions = model.predict(X_dev_vec)
    dev_f1_macro = f1_score(y_dev, dev_predictions, average="macro")
    return dev_f1_macro


In [32]:
# TF-IDF configurations compared by every model search below:
# one built on word n-grams and one built on character n-grams.
tfidf_configs = [
    dict(
        name="word",
        vectorizer=TfidfVectorizer(
            analyzer="word", ngram_range=(1, 2), max_features=30000, min_df=2
        ),
    ),
    dict(
        name="char",
        vectorizer=TfidfVectorizer(
            analyzer="char", ngram_range=(3, 5), max_features=50000, min_df=2
        ),
    ),
]


### KNN

In [33]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes tried for KNN.
knn_k_values = [3, 5, 7, 9]

# Running best: (tfidf name, vectorizer, model), its DEV F1-macro and a label.
best_knn, best_knn_score, best_knn_desc = None, 0, ""

for tfidf_cfg in tfidf_configs:
    tfidf_name = tfidf_cfg["name"]
    vec = tfidf_cfg["vectorizer"]

    # The vectorizer is fitted on TRAIN only; DEV is just transformed.
    X_train_vec, X_dev_vec = vec.fit_transform(X_train), vec.transform(X_dev)

    for k in knn_k_values:
        model = KNeighborsClassifier(n_neighbors=k, metric="cosine")
        score = evaluate_on_dev(model, X_train_vec, X_dev_vec, y_train, y_dev)

        # Only a strictly better score replaces the current best, so a tie
        # keeps the configuration that was seen first.
        if not score > best_knn_score:
            continue

        best_knn_score = score
        best_knn = (tfidf_name, vec, model)
        best_knn_desc = f"KNN | {tfidf_name} | k={k}"

print("Best KNN config:", best_knn_desc)
print("Best DEV F1-macro:", best_knn_score)


Best KNN config: KNN | word | k=3
Best DEV F1-macro: 0.4749910758906261


### Decision Tree

In [34]:
from sklearn.tree import DecisionTreeClassifier

# max_depth values tried for the decision tree (None is passed through as-is).
dt_depths = [10, 20, 30, None]

# Running best: (tfidf name, vectorizer, model), its DEV F1-macro and a label.
best_dt, best_dt_score, best_dt_desc = None, 0, ""

for tfidf_cfg in tfidf_configs:
    vec = tfidf_cfg["vectorizer"]

    # Fit the vectorizer on TRAIN, then reuse it to transform DEV.
    X_train_vec = vec.fit_transform(X_train)
    X_dev_vec = vec.transform(X_dev)

    for depth in dt_depths:
        model = DecisionTreeClassifier(
            max_depth=depth, min_samples_leaf=5, random_state=42
        )
        score = evaluate_on_dev(model, X_train_vec, X_dev_vec, y_train, y_dev)

        # Strict comparison: a tie keeps the earlier configuration.
        is_improvement = score > best_dt_score
        if is_improvement:
            best_dt_score = score
            best_dt = (tfidf_cfg["name"], vec, model)
            best_dt_desc = f"DT | {tfidf_cfg['name']} | depth={depth}"

print("Best Decision Tree config:", best_dt_desc)
print("Best DEV F1-macro:", best_dt_score)


Best Decision Tree config: DT | char | depth=None
Best DEV F1-macro: 0.5388300520189583


### SVM

In [35]:
from sklearn.svm import LinearSVC

# Values of the LinearSVC ``C`` parameter to try.
svm_C_values = [0.1, 1, 10]

# Running best: (tfidf name, vectorizer, model), its DEV F1-macro and a label.
best_svm = None
best_svm_score = 0
best_svm_desc = ""

for tfidf_cfg in tfidf_configs:
    vec = tfidf_cfg["vectorizer"]

    # Fit the vectorizer on TRAIN only; DEV is just transformed.
    X_train_vec = vec.fit_transform(X_train)
    X_dev_vec = vec.transform(X_dev)

    for C in svm_C_values:
        model = LinearSVC(
            C=C,
            class_weight="balanced",
            max_iter=5000,
            # Fixed seed so that re-running the notebook selects the same
            # configuration and reports the same score (LinearSVC shuffles
            # the data when it solves the dual problem). Same seed as the
            # decision-tree search.
            random_state=42
        )

        score = evaluate_on_dev(
            model,
            X_train_vec,
            X_dev_vec,
            y_train,
            y_dev
        )

        # Strict comparison: a tie keeps the earlier configuration.
        if score > best_svm_score:
            best_svm_score = score
            best_svm = (tfidf_cfg["name"], vec, model)
            best_svm_desc = f"SVM | {tfidf_cfg['name']} | C={C}"

print("Best SVM config:", best_svm_desc)
print("Best DEV F1-macro:", best_svm_score)


Best SVM config: SVM | char | C=1
Best DEV F1-macro: 0.6265566993162505


### Đánh giá trên test

In [37]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

def evaluate_on_test(best_tuple, model_name):
    """Refit a selected configuration on TRAIN and print its TEST metrics.

    Args:
        best_tuple: ``(tfidf_name, vectorizer, model)`` as stored by the
            model-search cells. Both the vectorizer and the model are
            refitted in place on the module-level ``X_train`` / ``y_train``.
        model_name: label printed in the report header.

    Prints the macro F1, the classification report and the confusion matrix
    computed against the module-level ``X_test`` / ``y_test``. Returns None.
    """
    vec_kind, tfidf, clf = best_tuple

    # The vectorizer is fitted on TRAIN; TEST is only transformed.
    features_train = tfidf.fit_transform(X_train)
    features_test = tfidf.transform(X_test)

    clf.fit(features_train, y_train)
    y_hat = clf.predict(features_test)

    print(f"\nFINAL TEST RESULT — {model_name}")
    print("TF-IDF type:", vec_kind)

    macro_f1 = f1_score(y_test, y_hat, average="macro")
    print("F1-macro:", macro_f1)

    report = classification_report(y_test, y_hat, digits=4)
    print(report)

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_hat))


In [38]:
# Report TEST metrics for the best configuration of each model family.
for best_tuple, label in (
    (best_knn, "KNN"),
    (best_dt, "Decision Tree"),
    (best_svm, "SVM"),
):
    evaluate_on_test(best_tuple, label)



FINAL TEST RESULT — KNN
TF-IDF type: word
F1-macro: 0.4607764142946842
              precision    recall  f1-score   support

           0     0.8639    0.9740    0.9157      5548
           1     0.3897    0.1712    0.2379       444
           2     0.4565    0.1526    0.2288       688

    accuracy                         0.8361      6680
   macro avg     0.5701    0.4326    0.4608      6680
weighted avg     0.7905    0.8361    0.7999      6680

Confusion Matrix:
[[5404   67   77]
 [ 320   76   48]
 [ 531   52  105]]

FINAL TEST RESULT — Decision Tree
TF-IDF type: char
F1-macro: 0.5165624034781642
              precision    recall  f1-score   support

           0     0.8898    0.9355    0.9120      5548
           1     0.3609    0.2162    0.2704       444
           2     0.4010    0.3387    0.3672       688

    accuracy                         0.8262      6680
   macro avg     0.5506    0.4968    0.5166      6680
weighted avg     0.8043    0.8262    0.8133      6680

Confusion M

### Lưu model

In [39]:
import joblib
from pathlib import Path

# Output directory for the serialized model packages, relative to the working
# directory; an already existing directory is reused without error.
MODEL_DIR = Path("./saved_models")
MODEL_DIR.mkdir(parents=False, exist_ok=True)


In [40]:
def train_and_save_model(best_tuple, model_name, save_path):
    """Refit a selected configuration on TRAIN and dump it with joblib.

    Args:
        best_tuple: ``(tfidf_name, vectorizer, model)`` as stored by the
            model-search cells. Both the vectorizer and the model are
            refitted in place on the module-level ``X_train`` / ``y_train``.
        model_name: label stored in the package and printed on success.
        save_path: destination file (``str`` or ``Path``). Missing parent
            directories are created.

    The dumped object is a dict with the keys ``model_name``,
    ``tfidf_type``, ``vectorizer`` and ``model``. Returns None.
    """
    tfidf_name, vectorizer, model = best_tuple

    # Fit TF-IDF on TRAIN.
    X_train_vec = vectorizer.fit_transform(X_train)

    # Train the model on TRAIN.
    model.fit(X_train_vec, y_train)

    # Bundle everything needed to vectorize and classify new text.
    package = {
        "model_name": model_name,
        "tfidf_type": tfidf_name,
        "vectorizer": vectorizer,
        "model": model
    }

    # Create the target directory here so that saving does not depend on a
    # separate cell having created it first.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    joblib.dump(package, save_path)
    print(f"Saved {model_name} model to {save_path}")


In [41]:
# Refit and persist the best configuration of each model family,
# one file per model inside MODEL_DIR.
for best_tuple, model_label, file_name in (
    (best_knn, "KNN", "knn_best.joblib"),
    (best_dt, "DecisionTree", "dt_best.joblib"),
    (best_svm, "SVM", "svm_best.joblib"),
):
    train_and_save_model(
        best_tuple,
        model_name=model_label,
        save_path=MODEL_DIR / file_name
    )


Saved KNN model to saved_models\knn_best.joblib
Saved DecisionTree model to saved_models\dt_best.joblib
Saved SVM model to saved_models\svm_best.joblib
