In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ============================
# 1. Setup: download FastText
# ============================
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip cc.en.300.bin.gz   # extracts cc.en.300.bin

In [None]:
import pandas as pd
import numpy as np
import fasttext
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from datasets import load_dataset
from tqdm import tqdm
import json
import pickle
import os

# -----------------
# 0. Setup Phase2 checkpoint folders
# -----------------
# Output folder for the pickled per-aspect models; exist_ok keeps the call
# from failing when the folder is already there.
CHECKPOINT_DIR = "checkpoints/phase2/best"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# -----------------
# 1. Load FastText model
# -----------------
# Path of the fastText binary model, relative to the working directory.
FASTTEXT_MODEL_PATH = "cc.en.300.bin"
print("Loading fastText model...")
fasttext_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

def get_embedding(text):
    """Return the fastText sentence vector for ``text``.

    The value is converted with ``str()`` first, so non-string inputs are
    embedded via their string form. Line breaks are replaced with spaces
    before the lookup because fastText's ``get_sentence_vector`` raises
    ``ValueError`` when its input contains a newline character.
    """
    single_line = str(text).replace("\n", " ")
    return fasttext_model.get_sentence_vector(single_line)

def build_features(df):
    """Build one feature row per text pair by concatenating both embeddings.

    Args:
        df: DataFrame with "text1" and "text2" columns.

    Returns:
        A 2-D NumPy array with one row per row of ``df``; each row is the
        embedding of "text1" followed by the embedding of "text2". An empty
        frame yields an array with zero rows and the same number of columns,
        so the result can still be stacked with other feature matrices.
    """
    if len(df) == 0:
        # np.array([]) would be 1-D; keep the (rows, features) layout instead.
        # float32 is used to match fastText's vectors.
        width = 2 * fasttext_model.get_dimension()
        return np.empty((0, width), dtype=np.float32)

    # Iterate the two columns directly: avoids building a Series per row and
    # hands each cell to get_embedding exactly as stored.
    text_pairs = zip(df["text1"], df["text2"])
    rows = [
        np.concatenate([get_embedding(first), get_embedding(second)])
        for first, second in tqdm(text_pairs, total=len(df), desc="Building features")
    ]
    return np.array(rows)

# -----------------
# 2. Setup aspects & tracking
# -----------------
# Aspects to train one model for; each name is also the label column of its dataset.
aspects = ["appearance", "aroma", "palate", "taste"]
# Per-aspect metrics, later extended with the combined metrics.
results = {}
# Test labels and predictions pooled over all aspects.
all_y_test = []
all_y_test_pred = []

# -----------------
# 3. SVM hyperparameters
# -----------------
# One sub-grid per kernel: gamma only affects the rbf kernel, so crossing it
# with the linear kernel would refit identical linear models once per gamma
# value without changing any score.
param_grid = [
    {
        "kernel": ["linear"],
        "C": [0.1, 1, 10],
    },
    {
        "kernel": ["rbf"],
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto", 0.01, 0.001],
    },
]

# -----------------
# 4. Train models for each aspect
# -----------------
for aspect in aspects:
    print(f"\n{'='*50}")
    print(f"Training Phase2 SVM model for aspect: {aspect}")
    print(f"{'='*50}")

    # Fetch the dataset for this aspect and turn each split into a DataFrame,
    # mapping the source columns onto text1 / text2 / label and discarding
    # rows where any of the three is missing.
    dataset = load_dataset(f"trungpq/rlcc-new-data-{aspect}")
    column_map = {"sentences_1": "text1", "sentences_2": "text2", aspect: "label"}
    required_columns = ["text1", "text2", "label"]
    df_train, df_val, df_test = (
        dataset[split].to_pandas().rename(columns=column_map).dropna(subset=required_columns)
        for split in ("train", "validation", "test")
    )

    # Embed every split (train, validation, test, in that order) and pull
    # out the matching label arrays.
    print(f"Building FastText features for {aspect}...")
    X_train, X_val, X_test = (
        build_features(frame) for frame in (df_train, df_val, df_test)
    )
    y_train, y_val, y_test = (
        frame["label"].values for frame in (df_train, df_val, df_test)
    )

    # Pool the test labels across aspects for the combined metrics.
    all_y_test.extend(y_test)

    # Cross-validated grid search over train + validation merged together.
    print(f"Performing grid search for {aspect}...")
    X_trainval = np.vstack((X_train, X_val))
    y_trainval = np.concatenate((y_train, y_val))

    svm_model = SVC(class_weight="balanced", random_state=42)
    grid_search = GridSearchCV(
        estimator=svm_model,
        param_grid=param_grid,
        scoring="f1_macro",
        cv=5,
        n_jobs=-1,
        verbose=2,
    )

    with tqdm(total=1, desc=f"Grid search {aspect}") as progress:
        grid_search.fit(X_trainval, y_trainval)
        progress.update(1)

    print(f"Best Params for {aspect}:", grid_search.best_params_)
    print(f"Best CV Score for {aspect}:", grid_search.best_score_)

    # Score the best estimator found by the search on the held-out test split.
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)
    all_y_test_pred.extend(y_test_pred)

    acc = accuracy_score(y_test, y_test_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_test_pred, average="macro"
    )

    # Record plain Python floats so the summary can be written as JSON.
    test_scores = {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }
    results[aspect] = {metric: float(value) for metric, value in test_scores.items()}
    results[aspect]["best_params"] = grid_search.best_params_
    results[aspect]["best_cv_score"] = float(grid_search.best_score_)

    print(f"\nFinal Test Evaluation for {aspect}:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(classification_report(y_test, y_test_pred, digits=4))

    # Persist the fitted model for this aspect as a pickle checkpoint.
    save_path = f"checkpoints/phase2/best/{aspect}_best_model.pkl"
    with open(save_path, "wb") as model_file:
        pickle.dump(best_model, model_file)
    print(f"Saved {aspect} model to: {save_path}")

# -----------------
# 4.5 Calculate global metrics
# -----------------
def _averaged_scores(average):
    """Return (precision, recall, f1) as floats for the pooled test predictions."""
    prec, rec, f1_value, _support = precision_recall_fscore_support(
        all_y_test, all_y_test_pred, average=average
    )
    return float(prec), float(rec), float(f1_value)


# Scores over the test labels and predictions of all aspects taken together.
macro_precision, macro_recall, macro_f1 = _averaged_scores("macro")
micro_precision, micro_recall, micro_f1 = _averaged_scores("micro")

results["combined_metrics"] = {
    "macro_precision": macro_precision,
    "macro_recall": macro_recall,
    "macro_f1_score": macro_f1,
    "micro_precision": micro_precision,
    "micro_recall": micro_recall,
    "micro_f1_score": micro_f1,
}

# -----------------
# 5. Save JSON summary
# -----------------
# Per-aspect and combined metrics go into a single JSON file in the working directory.
with open("phase2_fasttext_svm_results.json", "w") as summary_file:
    summary_file.write(json.dumps(results, indent=4))

print("\nSaved → phase2_fasttext_svm_results.json")
print("All Phase2 SVM checkpoints saved → checkpoints/phase2/best/")