# Phiên bản khác multi naive bayes

In [1]:
# Hate Speech Detection - TFIDF + Multinomial Naive Bayes (MultinomialNB)
# - Fit trên train.csv
# - Tune hyperparam dựa trên dev.csv (PredefinedSplit)
# - Chấm điểm 1 lần trên test.csv
# Dữ liệu đã preprocessing sẵn, cột: free_text, label_id

import os
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    f1_score, accuracy_score,
    classification_report, confusion_matrix
)
from scipy.stats import loguniform


# ===================== CONFIG =====================
# Absolute, machine-specific paths to the preprocessed UIT-ViHSD CSV splits.
TRAIN_PATH = "/home/uit2023/LuuTru/Thuchd/cs221/CS221_NLP_SA/UIT-ViHSD-preprocessed/train.csv"
DEV_PATH   = "/home/uit2023/LuuTru/Thuchd/cs221/CS221_NLP_SA/UIT-ViHSD-preprocessed/dev.csv"
TEST_PATH  = "/home/uit2023/LuuTru/Thuchd/cs221/CS221_NLP_SA/UIT-ViHSD-preprocessed/test.csv"

# Column names read from every split.
TEXT_COL  = "free_text"
LABEL_COL = "label_id"

# Metric name handed to RandomizedSearchCV, and the output locations used
# when the final model and its summary are saved.
SCORING = "f1_macro"
SAVE_MODEL_PATH = "./models/final_best_mnb_tfidf.joblib"
SAVE_INFO_PATH  = "./models/final_best_mnb_tfidf_info.json"

# Seed and number of sampled candidates for the randomized searches.
# N_ITER_CHAR is only referenced by the (currently commented-out) char search.
RANDOM_STATE = 42
N_ITER_WORD = 60
N_ITER_CHAR = 60

# Naive Bayes alpha (additive / Laplace smoothing), sampled log-uniformly
# between 1e-3 and 1e1.
ALPHA_DIST = loguniform(1e-3, 1e1)


# ===================== LOAD =====================
def load_xy(path, text_col=None, label_col=None):
    """Load one CSV split and return ``(texts, labels)`` as plain lists.

    Rows with a missing value in either column are dropped; texts are then
    cast to ``str`` and labels to ``int``.

    Args:
        path: CSV location, passed straight to ``pandas.read_csv``.
        text_col: Name of the text column. Defaults to the module-level
            ``TEXT_COL`` when omitted.
        label_col: Name of the label column. Defaults to the module-level
            ``LABEL_COL`` when omitted.

    Returns:
        A ``(texts, labels)`` tuple of two row-aligned lists.

    Raises:
        KeyError: If either column is absent from the CSV (raised by pandas).
    """
    # Resolve the defaults at call time so the configured constants are
    # honoured while still allowing callers to load differently named columns.
    if text_col is None:
        text_col = TEXT_COL
    if label_col is None:
        label_col = LABEL_COL

    df = pd.read_csv(path).dropna(subset=[text_col, label_col]).copy()
    df[text_col] = df[text_col].astype(str)
    df[label_col] = df[label_col].astype(int)
    return df[text_col].tolist(), df[label_col].tolist()

# Read the three splits in train -> dev -> test order.
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = (
    load_xy(split_path) for split_path in (TRAIN_PATH, DEV_PATH, TEST_PATH)
)

def overlap_count(a, b):
    """Return how many distinct items appear in both ``a`` and ``b``."""
    shared = set(a).intersection(b)
    return len(shared)

# Report how many distinct texts are shared between each pair of splits.
for caption, left_texts, right_texts in (
    ("Overlap train-dev :", X_train, X_dev),
    ("Overlap train-test:", X_train, X_test),
    ("Overlap dev-test  :", X_dev, X_test),
):
    print(caption, overlap_count(left_texts, right_texts))


# ===================== PIPELINE TEMPLATE =====================
def make_pipeline():
    """Build a fresh, unfitted TF-IDF -> MultinomialNB pipeline.

    The steps are named ``"tfidf"`` and ``"clf"``; the search spaces address
    their parameters through those prefixes.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=False, smooth_idf=False)
    classifier = MultinomialNB()
    return Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

def eval_and_print(name, y_true, y_pred):
    """Print macro-F1, accuracy, confusion matrix and a per-class report.

    Args:
        name: Label shown in the section banner.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(f"\n===== {name} =====")

    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("f1_macro:", macro_f1)

    accuracy = accuracy_score(y_true, y_pred)
    print("acc     :", accuracy)

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion matrix:\n", matrix)

    report = classification_report(y_true, y_pred, digits=4)
    print("\nReport:\n", report)


# ===================== PredefinedSplit: train=-1, dev=0 =====================
# Stack train and dev into one dataset and mark each row's fold:
# -1 keeps a row out of every validation fold (train), 0 puts it in the
# single validation fold (dev).
X_train_dev = [*X_train, *X_dev]
y_train_dev = [*y_train, *y_dev]
test_fold = [-1 for _ in X_train] + [0 for _ in X_dev]
ps = PredefinedSplit(test_fold=test_fold)


# ===================== PARAM DISTRIBUTIONS =====================
# NOTE:
# - MultinomialNB is suited to non-negative features (TF-IDF is fine).
# - "tfidf__norm" may be left as either None or "l2".
# Search space for the word-level TF-IDF + MultinomialNB pipeline. Keys use
# the "<step>__<param>" form matching the step names "tfidf" and "clf".
param_dist_word = {
    "tfidf__analyzer": ["word"],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [1, 2, 5],
    "tfidf__max_df": [0.9, 0.95, 1.0],
    # Disabled options, kept for reference:
    # "tfidf__use_idf": [True, False],
    # "tfidf__norm": ["l2", None],
    # "tfidf__binary": [False, True],
    "tfidf__max_features": [50000, 100000, None],

    # Classifier side: smoothing strength is sampled from ALPHA_DIST.
    "clf__alpha": ALPHA_DIST,
    "clf__fit_prior": [True, False],
}

# param_dist_char = {
#     "tfidf__analyzer": ["char_wb"],
#     "tfidf__ngram_range": [(3, 5), (4, 6)],
#     "tfidf__min_df": [1, 2, 5],
#     "tfidf__max_df": [0.9, 0.95, 1.0],
#     #"tfidf__use_idf": [True, False],
#     # "tfidf__norm": ["l2", None],
#     # "tfidf__binary": [False, True],
#     "tfidf__max_features": [50000, 100000, None],

#     "clf__alpha": ALPHA_DIST,
#     "clf__fit_prior": [True, False],
# }


# ===================== RANDOMIZED SEARCH (tune theo DEV) =====================
def run_random_search(param_dist, n_iter, tag):
    """Run a randomized hyper-parameter search validated on the dev fold.

    Args:
        param_dist: Parameter distributions for the pipeline steps.
        n_iter: Number of candidates to sample.
        tag: Short label used as a prefix in the printed summary.

    Returns:
        ``(best_score, best_params)`` as reported by the search.
    """
    search_settings = {
        "n_iter": n_iter,
        "scoring": SCORING,
        "cv": ps,              # the single predefined fold: validate on dev
        "n_jobs": -1,
        "verbose": 2,
        "random_state": RANDOM_STATE,
        "refit": False,        # no refit on train+dev, so dev never leaks into the model
        "error_score": np.nan,
    }
    search = RandomizedSearchCV(make_pipeline(), param_dist, **search_settings)
    search.fit(X_train_dev, y_train_dev)

    dev_score = search.best_score_
    chosen_params = search.best_params_
    print(f"\n[{tag}] best_dev_score({SCORING}) = {dev_score}")
    print(f"[{tag}] best_params = {chosen_params}")
    return dev_score, chosen_params

word_score, word_params = run_random_search(param_dist_word, N_ITER_WORD, "WORD")

# The char-level search is currently disabled. If it is re-enabled with
#   char_score, char_params = run_random_search(param_dist_char, N_ITER_CHAR, "CHAR")
# then CHAR should be selected whenever char_score >= word_score; until
# then the word-level result is the only candidate.
best_tag, best_dev_score, best_params = "WORD", word_score, word_params

print("\n===== BEST (by DEV via RandomizedSearchCV) =====")
for caption, value in (
    ("best_tag      :", best_tag),
    ("best_dev_score:", best_dev_score),
    ("best_params   :", best_params),
):
    print(caption, value)


# ===================== Fit on TRAIN -> Report DEV =====================
# Rebuild the pipeline with the selected hyper-parameters, fit it on TRAIN
# only, and report its metrics on DEV.
best_model_train = make_pipeline().set_params(**best_params)
best_model_train.fit(X_train, y_train)

dev_pred = best_model_train.predict(X_dev)
eval_and_print("DEV (train-only fit)", y_dev, dev_pred)


# ===================== FINAL: Fit TRAIN only -> TEST =====================
# Final model: same hyper-parameters, fitted ONLY on TRAIN (DEV is not used),
# then scored on TEST.
final_model = make_pipeline()
final_model.set_params(**best_params).fit(X_train, y_train)

test_pred = final_model.predict(X_test)
eval_and_print("TEST (train-only final)", y_test, test_pred)


# ===================== SAVE =====================
# Make sure the destination directory of BOTH artifacts exists; previously
# only the model's directory was created, so the JSON write would fail if
# SAVE_INFO_PATH pointed at a different, not-yet-existing directory.
for out_path in (SAVE_MODEL_PATH, SAVE_INFO_PATH):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (vectorizer + classifier).
joblib.dump(final_model, SAVE_MODEL_PATH)

# Summary of the selected configuration and its dev/test metrics.
info = {
    "model": "TFIDF + MultinomialNB",
    "best_tag": best_tag,
    "best_params": best_params,
    "best_dev_score": float(best_dev_score),
    "test_f1_macro": float(f1_score(y_test, test_pred, average="macro")),
    "test_acc": float(accuracy_score(y_test, test_pred)),
}
with open(SAVE_INFO_PATH, "w", encoding="utf-8") as f:
    json.dump(info, f, ensure_ascii=False, indent=2)

print(f"\nSaved model: {SAVE_MODEL_PATH}")
print(f"Saved info : {SAVE_INFO_PATH}")


Overlap train-dev : 381
Overlap train-test: 897
Overlap dev-test  : 134
Fitting 1 folds for each of 60 candidates, totalling 60 fits
[CV] END clf__alpha=0.7726718477963437, clf__fit_prior=False, tfidf__analyzer=word, tfidf__max_df=0.95, tfidf__max_features=100000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__alpha=0.03148911647956861, clf__fit_prior=True, tfidf__analyzer=word, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__alpha=0.016480446427978974, clf__fit_prior=False, tfidf__analyzer=word, tfidf__max_df=0.9, tfidf__max_features=50000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__alpha=0.2801635158716261, clf__fit_prior=False, tfidf__analyzer=word, tfidf__max_df=1.0, tfidf__max_features=100000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END clf__alpha=0.2975390947349387, clf__fit_prior=False, tfidf__analyzer=word, tfidf__