In [47]:
# Collaborative Filtering - Recommendation system

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import yaml

# ------------------------------------------------------------------
# 1 - Load train/test paths from config.yaml
# ------------------------------------------------------------------
# Pass the encoding explicitly: without it, open() falls back to the
# platform's locale encoding, so a config containing non-ASCII characters
# could be read differently (or fail) on another machine.
with open("../config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Train/test file locations, as declared under the `output_data` section.
train_path = config['output_data']['train_file']
test_path = config['output_data']['test_file']

print("Loading data from config paths:")
print(f"  Train file: {train_path}")
print(f"  Test file : {test_path}")

Loading data from config paths:
  Train file: ../data/clean/train_file.csv
  Test file : ../data/clean/test_file.csv


In [48]:
# ------------------------------------------------------------------
# 2 - Load clean training & test data
# ------------------------------------------------------------------
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Collaborative filtering only needs the (wine, food, score) triple; stop
# right away if either split is missing one of these columns.
required_cols = ["wine_id", "food_id", "pairing_quality"]
train_columns = set(train_df.columns)
test_columns = set(test_df.columns)
for col in required_cols:
    assert col in train_columns, f"‚ùå Missing column '{col}' in train data!"
    assert col in test_columns, f"‚ùå Missing column '{col}' in test data!"

# Narrow both splits down to the validated columns.
train_cf = train_df[required_cols]
test_cf = test_df[required_cols]

print(f"‚úÖ Train shape: {train_cf.shape}, Test shape: {test_cf.shape}")
display(train_cf.head(5))

‚úÖ Train shape: (27946, 3), Test shape: (6987, 3)


Unnamed: 0,wine_id,food_id,pairing_quality
0,21,32,2
1,15,16,1
2,3,7,5
3,20,37,1
4,21,33,1


In [70]:
# ------------------------------------------------------------------
# 3 - Create user-item matrix (wines x foods)
# ------------------------------------------------------------------
from sklearn.metrics.pairwise import cosine_similarity

# Wines act as the "users" (rows) and foods as the "items" (columns); each
# cell holds the pairing score. pivot_table aggregates repeated (wine, food)
# pairs with its default aggfunc (mean) and leaves unobserved pairs as NaN.
wine_food_matrix = pd.pivot_table(
    train_cf,
    values="pairing_quality",
    index="wine_id",
    columns="food_id",
)

print("Wine‚ÄìFood matrix shape:", wine_food_matrix.shape)

# Wine-to-wine cosine similarity. Unobserved pairs are filled with 0 so that
# every wine has a complete vector to compare.
wine_ids = wine_food_matrix.index
wine_similarity = cosine_similarity(wine_food_matrix.fillna(0))
wine_similarity_df = pd.DataFrame(wine_similarity, index=wine_ids, columns=wine_ids)

print("‚úÖ Wine similarity matrix computed:", wine_similarity_df.shape)

Wine‚ÄìFood matrix shape: (29, 38)
‚úÖ Wine similarity matrix computed: (29, 29)


In [71]:
# ------------------------------------------------------------------
# 4 - Predict ratings for unseen wine–food pairs
# ------------------------------------------------------------------
def predict_cf(user_item_matrix, similarity_matrix):
    """
    Predict a rating for every (user, item) cell with mean-centred,
    similarity-weighted collaborative filtering.

    For user ``u`` and item ``i``::

        pred[u, i] = mean[u] + sum_v sim[u, v] * (r[v, i] - mean[v]) / sum_v |sim[u, v]|

    where missing ratings contribute a deviation of 0.

    Parameters
    ----------
    user_item_matrix : array-like, shape (n_users, n_items)
        Observed ratings; missing entries are NaN.
    similarity_matrix : array-like, shape (n_users, n_users)
        User-to-user similarity weights.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        Predicted ratings, with any remaining NaN/inf replaced by
        ``np.nan_to_num``.

    Notes
    -----
    Edge cases that previously collapsed to a prediction of 0 (and emitted
    RuntimeWarnings) are handled explicitly:

    * a user with no observed rating uses the global mean of all observed
      ratings as their baseline (0.0 if nothing at all is observed);
    * a user whose similarity weights sum to zero gets no neighbour
      adjustment, i.e. the prediction is just their baseline.
    """
    ratings = np.asarray(user_item_matrix, dtype=float)
    sims = np.asarray(similarity_matrix, dtype=float)
    observed = ~np.isnan(ratings)

    # Per-user mean over observed ratings only, computed as sum/count so that
    # users without any rating can be detected instead of yielding NaN.
    rating_counts = observed.sum(axis=1)
    rating_sums = np.where(observed, ratings, 0.0).sum(axis=1)
    total_count = rating_counts.sum()
    global_mean = rating_sums.sum() / total_count if total_count > 0 else 0.0
    mean_user_rating = np.where(
        rating_counts > 0,
        rating_sums / np.maximum(rating_counts, 1),
        global_mean,
    )

    # Deviation from each user's own mean; unobserved cells contribute nothing.
    ratings_diff = np.where(observed, ratings - mean_user_rating[:, np.newaxis], 0.0)

    # Normalise the weighted deviations by the total absolute similarity of
    # each user, skipping the division where that total is zero.
    weight_totals = np.abs(sims).sum(axis=1)[:, np.newaxis]
    weighted_diff = sims.dot(ratings_diff)
    adjustment = np.divide(
        weighted_diff,
        weight_totals,
        out=np.zeros_like(weighted_diff),
        where=weight_totals > 0,
    )

    pred = mean_user_rating[:, np.newaxis] + adjustment
    return np.nan_to_num(pred)

# Score every (wine, food) cell of the matrix, then re-attach the wine and
# food ids so predictions can be looked up by label.
wine_food_pred = predict_cf(wine_food_matrix.to_numpy(), wine_similarity)
wine_food_pred_df = pd.DataFrame(
    data=wine_food_pred,
    index=wine_food_matrix.index,
    columns=wine_food_matrix.columns,
)

print("‚úÖ Collaborative filtering predictions computed.")

‚úÖ Collaborative filtering predictions computed.


In [72]:
# ------------------------------------------------------------------
# 5 - Evaluate Collaborative Filtering model
# ------------------------------------------------------------------
# A test pair can only be scored when both its wine and its food exist in the
# prediction matrix; any other pair is skipped.
scored_wines = wine_food_pred_df.index
scored_foods = wine_food_pred_df.columns

true_ratings, pred_ratings = [], []

for _, row in test_cf.iterrows():
    wine = int(row.wine_id)
    food = int(row.food_id)
    if wine not in scored_wines or food not in scored_foods:
        continue
    predicted = wine_food_pred_df.loc[wine, food]
    true_ratings.append(row.pairing_quality)
    pred_ratings.append(predicted)

if not pred_ratings:
    print("‚ö†Ô∏è No overlapping test ratings found for evaluation ‚Äî data may be too sparse.")
else:
    rmse = sqrt(mean_squared_error(true_ratings, pred_ratings))
    mae = mean_absolute_error(true_ratings, pred_ratings)
    print(f"üìä Collaborative Filtering Evaluation:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE : {mae:.4f}")

üìä Collaborative Filtering Evaluation:
  RMSE: 1.3989
  MAE : 1.1765


In [73]:
# ------------------------------------------------------------------
# 6 - Load the labels_df
# ------------------------------------------------------------------

# Re-read config.yaml with an explicit encoding so the result does not depend
# on the platform's locale default.
with open("../config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

labels_path = config['output_data']['labels_file']

# Lookup table used further down to turn wine/food ids into readable names.
labels_df = pd.read_csv(labels_path)

In [74]:
# ------------------------------------------------------------------
# 7 - Recommend top-N foods for a given wine
# ------------------------------------------------------------------

def recommend_foods_for_wine(wine_id, n=5):
    """
    Recommend the top-N foods for a wine, ranked by predicted pairing score.

    Reads three notebook-level frames: ``wine_food_pred_df`` (predicted
    scores, wines x foods), ``train_cf`` (pairings present in the training
    data) and ``labels_df`` (id -> name lookup). Foods already paired with the
    wine in ``train_cf`` are excluded, so only new pairings are suggested.

    Parameters
    ----------
    wine_id : int
        Wine identifier, looked up in ``wine_food_pred_df.index``.
    n : int, default 5
        Maximum number of foods to return. Must be >= 0.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``food_item`` and ``predicted_score``, best score first, one
        row per food and at most ``n`` rows. ``food_item`` is NaN for a food
        id that has no entry in ``labels_df``. Returns ``None`` (after
        printing a warning) when ``wine_id`` is not in the prediction matrix.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    # --- Unknown wine: nothing to rank ---
    if wine_id not in wine_food_pred_df.index:
        print(f"‚ö†Ô∏è Wine ID {wine_id} not found in training data.")
        return None

    # Series.head(-k) means "everything except the last k rows", which would
    # silently return the wrong set of foods, so reject negative n up front.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # --- Human-readable wine name, falling back to a generic label ---
    wine_name = labels_df.loc[labels_df.wine_id == wine_id, "wine_type"].values
    wine_name = wine_name[0] if len(wine_name) > 0 else f"Wine {wine_id}"

    # --- Predicted scores for every food, minus pairings seen in training ---
    preds = wine_food_pred_df.loc[wine_id]
    known_foods = train_cf.loc[train_cf.wine_id == wine_id, "food_id"].tolist()
    preds = preds.drop(index=known_foods, errors="ignore")

    # --- Highest predicted scores first ---
    top_foods = preds.sort_values(ascending=False).head(n)

    # --- Attach food names ---
    # Keep exactly one label per food_id: de-duplicating on the
    # (food_id, food_item) pair would let a food with several spellings
    # multiply rows in the left merge and return more than n results.
    food_labels = labels_df[["food_id", "food_item"]].drop_duplicates(subset="food_id")
    recommendations = (
        pd.DataFrame({
            "food_id": top_foods.index,
            "predicted_score": top_foods.values
        })
        .merge(food_labels, on="food_id", how="left")
        .loc[:, ["food_item", "predicted_score"]]
    )

    # --- Heading for the table returned below ---
    print(f"\nüç∑ Top {n} recommended foods for wine '{wine_name}' (ID={wine_id}):")

    return recommendations

In [85]:
# Example: up to five food suggestions for the wine with ID 6.
recommend_foods_for_wine(6, n=5)


üç∑ Top 5 recommended foods for wine 'Chenin Blanc' (ID=6):


Unnamed: 0,food_item,predicted_score
0,lemon tart,3.088572
1,shrimp scampi,3.088154
2,grilled salmon,3.085839
3,duck √† l‚Äôorange,3.085549
4,Indian vindaloo,3.085455
