In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ============================
# 1. Setup: download FastText
# ============================
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip cc.en.300.bin.gz   # extracts cc.en.300.bin

In [None]:
import fasttext
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, f1_score
from datasets import load_dataset
import itertools
from tqdm import tqdm
import json
import os

# -------------------
# 0. Create checkpoint folders
# -------------------
# Make sure both checkpoint directories exist before any model is written.
for checkpoint_dir in ("checkpoints/phase2/best", "checkpoints/phase2/final"):
    os.makedirs(checkpoint_dir, exist_ok=True)

# -------------------
# 1. Load pretrained fastText
# -------------------
# `model` is the module-level fastText handle that get_embedding() reads.
print("Loading fastText model...")
model = fasttext.load_model("cc.en.300.bin")

def get_embedding(text):
    """Return the fastText sentence vector for ``text``.

    The input is converted with ``str`` first. Newline characters are replaced
    by spaces because fastText's ``get_sentence_vector`` accepts a single line
    only and raises ``ValueError`` when the text contains a newline; without
    this, one multi-line review would abort feature building.

    Args:
        text: The sentence to embed; any object accepted by ``str``.

    Returns:
        Whatever ``model.get_sentence_vector`` returns for the single-line text.
    """
    single_line = str(text).replace("\n", " ")
    return model.get_sentence_vector(single_line)

# Map the raw labels {-1, 0, 1} to the 0-based class ids {0, 1, 2} used as
# training targets, and keep the inverse mapping for decoding predictions.
label2id = {raw_label: class_id for class_id, raw_label in enumerate((-1, 0, 1))}
id2label = {class_id: raw_label for raw_label, class_id in label2id.items()}

def build_features(split_ds, aspect):
    """Turn one dataset split into a feature matrix and a label vector.

    For every row, the embeddings of ``row["sentences_1"]`` and
    ``row["sentences_2"]`` are concatenated into one feature vector, and the
    value in the ``aspect`` column is converted to a class id via ``label2id``.

    Args:
        split_ds: Iterable of rows supporting ``row[key]`` lookup.
        aspect: Name of the label column to read from each row.

    Returns:
        ``(X, y)`` as NumPy arrays: one feature row and one class id per example.

    Raises:
        KeyError: If a label is not one of the keys of ``label2id``.
    """
    pair_vectors = []
    class_ids = []
    progress = tqdm(split_ds, desc=f"Building features for {aspect}")
    for example in progress:
        first = get_embedding(example["sentences_1"])
        second = get_embedding(example["sentences_2"])
        pair_vectors.append(np.concatenate((first, second)))
        class_ids.append(label2id[int(example[aspect])])
    return np.array(pair_vectors), np.array(class_ids)

# -------------------
# 2. Train models for each aspect
# -------------------
# One model is trained per aspect; `results` collects the per-aspect metrics.
aspects = [
    "appearance",
    "aroma",
    "palate",
    "taste",
]
results = dict()

# Candidate values per hyperparameter. Every combination is tried, and
# "num_boost_round" is split off from the booster params before training.
param_grid = dict(
    max_depth=[3],
    eta=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.8],
    min_child_weight=[1],
    num_boost_round=[1000],
)

# For every aspect: load its dataset, build features, pick hyperparameters on
# the validation split, retrain on train+validation, and evaluate once on test.
for aspect in aspects:
    print(f"\n{'='*50}")
    print(f"Training model for aspect: {aspect}")
    print(f"{'='*50}")

    dataset = load_dataset(f"trungpq/rlcc-new-data-{aspect}")

    def not_null(example):
        """Keep only examples that have both sentences and a label for this aspect."""
        return (
            example["sentences_1"] is not None
            and example["sentences_2"] is not None
            and example[aspect] is not None
        )

    dataset = dataset.filter(not_null)

    X_train, y_train = build_features(dataset["train"], aspect)
    X_valid, y_valid = build_features(dataset["validation"], aspect)
    X_test, y_test = build_features(dataset["test"], aspect)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    num_classes = len(label2id)
    best_params = None
    best_model = None
    best_num_round = None  # boosting rounds of the winning model, chosen on validation
    best_f1 = -1

    print(f"Searching best hyperparameters for {aspect}...")
    param_combinations = list(itertools.product(*param_grid.values()))

    for values in tqdm(param_combinations, desc=f"Hyperparameter search - {aspect}"):
        params = dict(zip(param_grid.keys(), values))
        # num_boost_round is an argument of xgb.train, not a booster parameter.
        num_boost_round = params.pop("num_boost_round")

        params.update({
            "objective": "multi:softmax",
            "eval_metric": "mlogloss",
            "num_class": num_classes,
            "reg_lambda": 1.0,
            "reg_alpha": 0.1
        })

        bst = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, "valid")],
            early_stopping_rounds=500,
            verbose_eval=False
        )

        # xgb.train returns the booster from the last round, and the native
        # Booster.predict uses every tree by default. Restrict prediction to
        # the best validation round so the F1 used for model selection matches
        # the early-stopped model rather than one trained past that point.
        rounds_at_best = bst.best_iteration + 1
        y_pred_val = bst.predict(dvalid, iteration_range=(0, rounds_at_best))
        f1 = f1_score(y_valid, y_pred_val, average="macro")

        if f1 > best_f1:
            best_f1 = f1
            best_params = params
            best_model = bst
            best_num_round = rounds_at_best

            print(f"New best F1={f1:.4f} for {aspect}")

            # Save best model checkpoint inside phase2 folder
            # (the saved booster contains every trained round).
            best_model.save_model(f"checkpoints/phase2/best/{aspect}_best_model.json")

    print(f"\nBest Params for {aspect}:", best_params)
    print(f"Best Validation F1 for {aspect}:", best_f1)

    # -------------------
    # Retrain with train+validation
    # -------------------
    print(f"\nRetraining {aspect} model with best params on train+validation...")
    X_train_full = np.vstack([X_train, X_valid])
    y_train_full = np.concatenate([y_train, y_valid])

    dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train for the number of rounds selected on the validation split. The
    # test split must not drive early stopping: using it to decide when to
    # stop leaks test information into the model and inflates test metrics.
    final_model = xgb.train(
        params=best_params,
        dtrain=dtrain_full,
        num_boost_round=best_num_round
    )

    # Save final retrain model
    final_model.save_model(f"checkpoints/phase2/final/{aspect}_final_model.json")

    # -------------------
    # Final eval
    # -------------------
    # The test split is touched only here, for a single held-out evaluation.
    y_pred_ids = final_model.predict(dtest)
    # Decode class ids back to the original {-1, 0, 1} labels for reporting.
    y_pred = np.array([id2label[int(i)] for i in y_pred_ids])
    y_test_true = np.array([id2label[int(i)] for i in y_test])

    acc = accuracy_score(y_test_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_true, y_pred, average="macro")

    results[aspect] = {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "best_params": best_params,
        # Recorded so the final model can be reproduced from this file alone.
        "best_num_boost_round": best_num_round
    }

    print(f"\nFinal Test Evaluation for {aspect}:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(classification_report(y_test_true, y_pred, digits=4))

# -------------------
# Save hyperparams
# -------------------
# Write the per-aspect test metrics and winning hyperparameters to disk.
with open("phase2_best_hyperparams_xgboost_fasttext.json", "w") as f:
    json.dump(results, f, indent=4)

print("\nAll Phase 2 checkpoint saved into:")
print("  checkpoints/phase2/best/")
print("  checkpoints/phase2/final/")
# The arrow in this message was stored as mis-encoded bytes; restored to the intended character.
print("Hyperparameters saved → phase2_best_hyperparams_xgboost_fasttext.json")
