In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ============================
# 1. Setup: download FastText
# ============================
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip cc.en.300.bin.gz   # extracts cc.en.300.bin

--2025-11-13 08:54:27--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.219.33, 13.227.219.59, 13.227.219.70, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.219.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-11-13 08:54:42 (283 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [2]:
import pandas as pd
import numpy as np
import fasttext
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from datasets import load_dataset
from tqdm import tqdm
import json
import pickle
import os

# -----------------
# 0. Setup checkpoint folders
# -----------------
os.makedirs("checkpoints/phase1/best", exist_ok=True)

# -----------------
# 1. Load FastText model
# -----------------
print("Loading fastText model...")
fasttext_model = fasttext.load_model("cc.en.300.bin")

def get_embedding(text):
    return fasttext_model.get_sentence_vector(str(text))

def build_features(df):
    X = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Building features"):
        v = get_embedding(row["reviewSentence"])
        X.append(v)
    return np.array(X)

# -----------------
# 2. Setup aspects and result storage
# -----------------
aspects = ["appearance", "aroma", "palate", "taste"]
results = {}
all_y_test = []
all_y_test_pred = []

# -----------------
# 3. GridSearchCV parameters
# -----------------
param_grid = {
    "kernel": ["linear", "rbf"],
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto", 0.01, 0.001]
}

# -----------------
# 4. Train SVM models for each aspect
# -----------------
for aspect in aspects:
    print(f"\n{'='*60}")
    print(f"Training Phase1 SVM model for aspect: {aspect}")
    print(f"{'='*60}")

    # -----------------
    # Load dataset beer-com-sentences
    # -----------------
    dataset = load_dataset("lengocquangLAB/beer-com-sentences")

    df_train = dataset["train"].to_pandas()
    df_val   = dataset["validation"].to_pandas()
    df_test  = dataset["test"].to_pandas()

    # Rename label column to unify
    for df in [df_train, df_val, df_test]:
        df["label"] = df[aspect]
        df.dropna(subset=["reviewSentence", "label"], inplace=True)

    # -----------------
    # Build embeddings
    # -----------------
    print(f"Building FastText features for {aspect}...")

    X_train = build_features(df_train)
    X_val   = build_features(df_val)
    X_test  = build_features(df_test)

    y_train = df_train["label"].values.astype(int)
    y_val   = df_val["label"].values.astype(int)
    y_test  = df_test["label"].values.astype(int)

    # keep test labels for global metrics
    all_y_test.extend(y_test)

    # -----------------
    # Grid Search (train + val)
    # -----------------
    print(f"Running GridSearchCV for {aspect}...")

    X_trainval = np.vstack([X_train, X_val])
    y_trainval = np.concatenate([y_train, y_val])

    svm_model = SVC(class_weight="balanced", random_state=42)

    grid_search = GridSearchCV(
        svm_model,
        param_grid,
        scoring="f1_macro",
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    with tqdm(total=1, desc=f"GridSearch for {aspect}") as pbar:
        grid_search.fit(X_trainval, y_trainval)
        pbar.update(1)

    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)

    # store for combined metrics
    all_y_test_pred.extend(y_test_pred)

    # -----------------
    # Evaluate on test
    # -----------------
    acc = accuracy_score(y_test, y_test_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_test_pred, average="macro"
    )

    results[aspect] = {
        "best_params": grid_search.best_params_,
        "best_cv_score": float(grid_search.best_score_),
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1)
    }

    print(f"\nFinal Test Evaluation for {aspect}:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(classification_report(y_test, y_test_pred, digits=4))

    # -----------------
    # Save checkpoint (Phase1)
    # -----------------
    save_path = f"checkpoints/phase1/best/{aspect}_best_model.pkl"
    with open(save_path, "wb") as f:
        pickle.dump(best_model, f)
    print(f"Saved best model to: {save_path}")

# -----------------
# 5. Global metrics across all aspects
# -----------------
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    all_y_test, all_y_test_pred, average="macro"
)
precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
    all_y_test, all_y_test_pred, average="micro"
)

results["combined_metrics"] = {
    "macro_precision": float(precision_macro),
    "macro_recall": float(recall_macro),
    "macro_f1_score": float(f1_macro),
    "micro_precision": float(precision_micro),
    "micro_recall": float(recall_micro),
    "micro_f1_score": float(f1_micro),
}

print("\nGLOBAL METRICS ACROSS ALL ASPECTS")
print("Macro F1:", f1_macro)
print("Micro F1:", f1_micro)

# -----------------
# 6. Save result JSON
# -----------------
with open("phase1_fasttext_svm_results.json", "w") as f:
    json.dump(results, f, indent=4)

print("\nSaved → phase1_fasttext_svm_results.json")
print("All Phase1 SVM checkpoints saved → checkpoints/phase1/best/")


Loading fastText model...

Training Phase1 SVM model for aspect: appearance


README.md:   0%|          | 0.00/986 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/663k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/88.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12036 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1612 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1547 [00:00<?, ? examples/s]

Building FastText features for appearance...


Building features: 100%|██████████| 12036/12036 [00:01<00:00, 7528.07it/s]
Building features: 100%|██████████| 1547/1547 [00:00<00:00, 7830.04it/s]
Building features: 100%|██████████| 1612/1612 [00:00<00:00, 9705.23it/s]


Running GridSearchCV for appearance...


GridSearch for appearance:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearch for appearance: 100%|██████████| 1/1 [20:20<00:00, 1220.43s/it]



Final Test Evaluation for appearance:
Accuracy:  0.9634
Precision: 0.9245
Recall:    0.9344
F1-score:  0.9294
              precision    recall  f1-score   support

           0     0.9809    0.9759    0.9784      1369
           1     0.8680    0.8930    0.8803       243

    accuracy                         0.9634      1612
   macro avg     0.9245    0.9344    0.9294      1612
weighted avg     0.9639    0.9634    0.9636      1612

Saved best model to: checkpoints/phase1/best/appearance_best_model.pkl

Training Phase1 SVM model for aspect: aroma
Building FastText features for aroma...


Building features: 100%|██████████| 12034/12034 [00:01<00:00, 7236.51it/s]
Building features: 100%|██████████| 1547/1547 [00:00<00:00, 8186.97it/s]
Building features: 100%|██████████| 1612/1612 [00:00<00:00, 6914.62it/s]


Running GridSearchCV for aroma...


GridSearch for aroma:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearch for aroma: 100%|██████████| 1/1 [23:07<00:00, 1387.41s/it]



Final Test Evaluation for aroma:
Accuracy:  0.9442
Precision: 0.8579
Recall:    0.9084
F1-score:  0.8806
              precision    recall  f1-score   support

           0     0.9797    0.9561    0.9677      1411
           1     0.7362    0.8607    0.7936       201

    accuracy                         0.9442      1612
   macro avg     0.8579    0.9084    0.8806      1612
weighted avg     0.9493    0.9442    0.9460      1612

Saved best model to: checkpoints/phase1/best/aroma_best_model.pkl

Training Phase1 SVM model for aspect: palate
Building FastText features for palate...


Building features: 100%|██████████| 12036/12036 [00:01<00:00, 9001.52it/s]
Building features: 100%|██████████| 1547/1547 [00:00<00:00, 9231.17it/s]
Building features: 100%|██████████| 1612/1612 [00:00<00:00, 6765.45it/s]


Running GridSearchCV for palate...


GridSearch for palate:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearch for palate: 100%|██████████| 1/1 [27:03<00:00, 1623.22s/it]



Final Test Evaluation for palate:
Accuracy:  0.9305
Precision: 0.8072
Recall:    0.8823
F1-score:  0.8388
              precision    recall  f1-score   support

           0     0.9777    0.9437    0.9604      1439
           1     0.6368    0.8208    0.7172       173

    accuracy                         0.9305      1612
   macro avg     0.8072    0.8823    0.8388      1612
weighted avg     0.9411    0.9305    0.9343      1612

Saved best model to: checkpoints/phase1/best/palate_best_model.pkl

Training Phase1 SVM model for aspect: taste
Building FastText features for taste...


Building features: 100%|██████████| 12036/12036 [00:01<00:00, 8277.65it/s]
Building features: 100%|██████████| 1547/1547 [00:00<00:00, 8808.23it/s]
Building features: 100%|██████████| 1612/1612 [00:00<00:00, 8526.03it/s]


Running GridSearchCV for taste...


GridSearch for taste:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearch for taste: 100%|██████████| 1/1 [27:04<00:00, 1624.68s/it]



Final Test Evaluation for taste:
Accuracy:  0.8970
Precision: 0.8625
Recall:    0.8775
F1-score:  0.8695
              precision    recall  f1-score   support

           0     0.9406    0.9185    0.9294      1190
           1     0.7844    0.8365    0.8096       422

    accuracy                         0.8970      1612
   macro avg     0.8625    0.8775    0.8695      1612
weighted avg     0.8997    0.8970    0.8981      1612

Saved best model to: checkpoints/phase1/best/taste_best_model.pkl

GLOBAL METRICS ACROSS ALL ASPECTS
Macro F1: 0.8828668940469468
Micro F1: 0.9337779156327544

Saved → phase1_fasttext_svm_results.json
All Phase1 SVM checkpoints saved → checkpoints/phase1/best/
