In [45]:
# ======================================================
# Wine–Food Pairing - CV Evaluation (RMSE & MAE)
# ======================================================

import os, yaml
import pandas as pd, numpy as np
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor

# --------------------------------------------------------
# 1. Load configuration file and dataset
# --------------------------------------------------------
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")

df0 = pd.read_csv(config['input_data']['file'])

# --------------------------------------------------------
# 2. Define target & group columns
# --------------------------------------------------------
target_col = "pairing_quality"
group_cols = [
    c for c in ["wine_type", "wine_category", "food_item", 
                "food_category", "cuisine"] 
    if c in df0.columns
]

# --------------------------------------------------------
# 3. Aggregate duplicates and scale target
# --------------------------------------------------------
df = df0.groupby(group_cols)[target_col].mean().reset_index()

min_r, max_r = df[target_col].min(), df[target_col].max()
df["target_scaled"] = (df[target_col] - min_r) / (max_r - min_r)

# --------------------------------------------------------
# 4. Feature Engineering (ID, target encoding, crosses)
# --------------------------------------------------------
cat_cols = group_cols.copy()

# IDs
for c in cat_cols:
    df[c + "_id"] = pd.factorize(df[c])[0]

# Target encoding
for c in cat_cols:
    te = df.groupby(c)["target_scaled"].mean()
    df[c + "_te"] = df[c].map(te)

# Cross features
def make_cross(a, b):
    return pd.factorize(a.astype(str) + "||" + b.astype(str))[0]

df["wine_cuisine_cross_id"] = make_cross(df["wine_type"], df["cuisine"])

df["wine_foodcat_cross_id"] = make_cross(df["wine_type"], df["food_category"])

# --------------------------------------------------------
# 5. Select features
# --------------------------------------------------------
feature_cols = []
for c in cat_cols:
    feature_cols += [c + "_id", c + "_te"]

feature_cols += [c for c in ["wine_cuisine_cross_id", "wine_foodcat_cross_id"]]

X = df[feature_cols]
y = df["target_scaled"].values

# --------------------------------------------------------
# 6. CV Evaluation (LightGBM)
# --------------------------------------------------------
def cv_quick(X, y, params, n_splits=5, n_estimators=200):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    rmses, maes = [], []

    for train_idx, val_idx in kf.split(X):
        Xtr, Xv = X.iloc[train_idx], X.iloc[val_idx]
        ytr, yv = y[train_idx], y[val_idx]

        model = LGBMRegressor(**params, n_estimators=n_estimators, random_state=0)
        model.fit(Xtr, ytr)

        p = model.predict(Xv)

        # convert predictions back to original scale
        p_orig = p * (max_r - min_r) + min_r
        yv_orig = yv * (max_r - min_r) + min_r

        rmses.append(np.sqrt(mean_squared_error(yv_orig, p_orig)))
        maes.append(mean_absolute_error(yv_orig, p_orig))

    return np.mean(rmses), np.mean(maes)

# --------------------------------------------------------
# 7. Compute RMSE and MAE for a baseline model
# --------------------------------------------------------
params = {"learning_rate": 0.05, "num_leaves": 31}
rmse, mae = cv_quick(X, y, params, n_splits=5, n_estimators=300)

print("\n================ CV RESULTS ================")
print("RMSE:", rmse)
print("MAE :", mae)
print("===========================================\n")


# ======================================================
# 8. Train FINAL model and save it as joblib
# ======================================================

final_model = LGBMRegressor(
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=300,
    random_state=0
)

final_model.fit(X, y)

# Save model
joblib.dump(final_model, "lgb_pairing_model.joblib")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 6588, number of used features: 12
[LightGBM] [Info] Start training from score 0.505278
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 587
[LightGBM] [Info] Number of data points in the train set: 6588, number of used features: 12
[LightGBM] [Info] Start training from score 0.504396
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 658

['lgb_pairing_model.joblib']

In [52]:
# ======================================================
# Wine–Food Pairing - Recommendation System (Top 5)
# ======================================================

# ------------------------------
# Load trained model
# ------------------------------
model = joblib.load("lgb_pairing_model.joblib")

# ======================================================
# 1. Utility functions - Replicating Feature Engineering
# ======================================================

def factorize_and_map(df_full, df_query, col):
    """
    Replicates factorize() used during training.
    Ensures query values receive consistent IDs.
    Unknown values -> -1.
    """
    _, uniques = pd.factorize(df_full[col])
    mapping = {v: i for i, v in enumerate(uniques)}
    return df_query[col].map(mapping).fillna(-1).astype(int)


def target_encode(df_full, df_query, col):
    """
    Target encoding (mean of target_scaled grouped by column).
    Unknown values -> global mean.
    """
    te_map = df_full.groupby(col)["target_scaled"].mean()
    global_mean = df_full["target_scaled"].mean()
    return df_query[col].map(te_map).fillna(global_mean)


def make_cross_for_query(df_full, df_query, colA, colB, new_col_name):
    """
    Generates cross-feature IDs exactly like during training.
    Unseen combinations -> -1.
    """
    full_cross = (df_full[colA].astype(str) + "||" + df_full[colB].astype(str))
    _, uniques = pd.factorize(full_cross)
    map_cross = {v: i for i, v in enumerate(uniques)}

    query_cross = (df_query[colA].astype(str) + "||" + df_query[colB].astype(str))
    df_query[new_col_name] = query_cross.map(map_cross).fillna(-1).astype(int)
    return df_query


# ======================================================
# 2. Recommendation Function
# ======================================================

def recommend_wines(food_item, food_category, cuisine, preferred_wine_category=None):
    """
    Returns the Top-5 wine recommendations for a given food combination.
    Uses known dataset scores when available, otherwise model predictions.
    """

    # ----------------------------------------------
    # Step 1: Build wine candidates
    # ----------------------------------------------
    wine_df = df.copy()
    wine_df = wine_df[["wine_type", "wine_category"]].drop_duplicates()

    # Optional user preference filter
    if preferred_wine_category is not None:
        wine_df = wine_df[wine_df["wine_category"] == preferred_wine_category]

    if wine_df.empty:
        raise ValueError("No wines found for the given preferred wine_category.")

    # ----------------------------------------------
    # Step 2: Build query dataframe
    # ----------------------------------------------
    q = wine_df.copy()
    q["food_item"] = food_item
    q["food_category"] = food_category
    q["cuisine"] = cuisine

    # ----------------------------------------------
    # Step 3: Reapply feature engineering
    # ----------------------------------------------
    for c in cat_cols:
        q[c + "_id"] = factorize_and_map(df, q, c)

    for c in cat_cols:
        q[c + "_te"] = target_encode(df, q, c)

    if "wine_type" in df.columns and "cuisine" in df.columns:
        q = make_cross_for_query(df, q, "wine_type", "cuisine", "wine_cuisine_cross_id")

    if "wine_type" in df.columns and "food_category" in df.columns:
        q = make_cross_for_query(df, q, "wine_type", "food_category", "wine_foodcat_cross_id")

    # ----------------------------------------------
    # Step 4: Predict scores
    # ----------------------------------------------
    Xq = q[feature_cols]
    q["pred_scaled"] = model.predict(Xq)
    q["pred_score"] = q["pred_scaled"] * (max_r - min_r) + min_r

    # ----------------------------------------------
    # Step 5: Use real dataset score if available
    # ----------------------------------------------
    merged = q.merge(
        df[group_cols + [target_col]],
        on=group_cols,
        how="left"
    )

    merged["final_score"] = merged[target_col].fillna(merged["pred_score"])

    # ----------------------------------------------
    # Step 6: Return Top-5 (no pred_score displayed)
    # ----------------------------------------------
    top5 = (
        merged[["wine_type", "wine_category", "final_score"]]
        .sort_values("final_score", ascending=False)
        .head(5)
        .reset_index(drop=True)
    )

    return top5