In [47]:
# Collaborative Filtering - Recomendation system

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import yaml

# ------------------------------------------------------------------
# 1 - Load train/test paths from config.yaml
# ------------------------------------------------------------------
with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

train_path = config['output_data']['train_file']
test_path = config['output_data']['test_file']

print("Loading data from config paths:")
print(f"  Train file: {train_path}")
print(f"  Test file : {test_path}")

Loading data from config paths:
  Train file: ../data/clean/train_file.csv
  Test file : ../data/clean/test_file.csv


In [162]:
train_df

Unnamed: 0,quality_label,description,wine_id,food_id,wine_type_Barbera,wine_type_Cabernet Sauvignon,wine_type_Cava,wine_type_Champagne,wine_type_Chardonnay,wine_type_Chenin Blanc,...,cuisine_Italian,cuisine_Japanese,cuisine_Korean,cuisine_Mexican,cuisine_Middle Eastern,cuisine_Moroccan,cuisine_Spanish,cuisine_Thai,cuisine_Vietnamese,pairing_quality
0,Poor,Delicate wine overwhelmed by red meat.,21,32,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,2
1,Terrible,Too lean for creamy dish.,15,16,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
2,Excellent,Idealized perfect pairing example for contrast.,3,7,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,5
3,Terrible,Deliberately bad pairing example for contrast.,20,37,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,1
4,Terrible,Tannic reds clash with delicate seafood.,21,33,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27941,Neutral,Tannic reds clash with delicate seafood.,11,33,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,3
27942,Poor,Heuristic pairing assessment.,6,22,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,2
27943,Neutral,Delicate wine overwhelmed by red meat.,16,32,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,3
27944,Poor,Heuristic pairing assessment.,23,34,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2


In [48]:
# ------------------------------------------------------------------
# 2 - Load clean training & test data
# ------------------------------------------------------------------
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Ensure essential columns exist
required_cols = ["wine_id", "food_id", "pairing_quality"]
for col in required_cols:
    assert col in train_df.columns, f"‚ùå Missing column '{col}' in train data!"
    assert col in test_df.columns, f"‚ùå Missing column '{col}' in test data!"

# Keep only collaborative filtering columns
train_cf = train_df[["wine_id", "food_id", "pairing_quality"]]
test_cf = test_df[["wine_id", "food_id", "pairing_quality"]]

print(f"‚úÖ Train shape: {train_cf.shape}, Test shape: {test_cf.shape}")
display(train_cf.head())

‚úÖ Train shape: (27946, 3), Test shape: (6987, 3)


Unnamed: 0,wine_id,food_id,pairing_quality
0,21,32,2
1,15,16,1
2,3,7,5
3,20,37,1
4,21,33,1


In [131]:
# ------------------------------------------------------------------
# 3Ô∏è - Create user‚Äìitem matrix (wines √ó foods)
# ------------------------------------------------------------------
from sklearn.metrics.pairwise import cosine_similarity

# Pivot table: rows = wines, columns = foods, values = pairing scores
wine_food_matrix = train_cf.pivot_table(
    index="wine_id",
    columns="food_id",
    values="pairing_quality"
)

print("Wine‚ÄìFood matrix shape:", wine_food_matrix.shape)

# Compute cosine similarity between wines (users)
wine_similarity = cosine_similarity(wine_food_matrix.fillna(0))
wine_similarity_df = pd.DataFrame(
    wine_similarity,
    index=wine_food_matrix.index,
    columns=wine_food_matrix.index
)

print("‚úÖ Wine similarity matrix computed:", wine_similarity_df.shape)

Wine‚ÄìFood matrix shape: (29, 38)
‚úÖ Wine similarity matrix computed: (29, 29)


In [132]:
# ------------------------------------------------------------------
# 4 - Predict ratings for unseen wine‚Äìfood pairs
# ------------------------------------------------------------------
def predict_cf(user_item_matrix, similarity_matrix):
    """
    Collaborative filtering prediction using weighted sum of similar users' ratings.
    """
    mean_user_rating = np.nanmean(user_item_matrix, axis=1)
    ratings_diff = np.where(np.isnan(user_item_matrix), 0, user_item_matrix - mean_user_rating[:, np.newaxis])
    pred = mean_user_rating[:, np.newaxis] + similarity_matrix.dot(ratings_diff) / np.abs(similarity_matrix).sum(axis=1)[:, np.newaxis]
    return np.nan_to_num(pred)

# Generate predictions for all wine‚Äìfood pairs
wine_food_pred = predict_cf(wine_food_matrix.values, wine_similarity)
wine_food_pred_df = pd.DataFrame(
    wine_food_pred,
    index=wine_food_matrix.index,
    columns=wine_food_matrix.columns
)

print("‚úÖ Collaborative filtering predictions computed.")

‚úÖ Collaborative filtering predictions computed.


In [133]:
# ------------------------------------------------------------------
# 5 - Evaluate Collaborative Filtering model
# ------------------------------------------------------------------
true_ratings = []
pred_ratings = []

for _, row in test_cf.iterrows():
    u, i, true_score = int(row.wine_id), int(row.food_id), row.pairing_quality
    if (u in wine_food_pred_df.index) and (i in wine_food_pred_df.columns):
        pred_score = wine_food_pred_df.loc[u, i]
        true_ratings.append(true_score)
        pred_ratings.append(pred_score)

if len(pred_ratings) > 0:
    rmse = sqrt(mean_squared_error(true_ratings, pred_ratings))
    mae = mean_absolute_error(true_ratings, pred_ratings)
    print(f"üìä Collaborative Filtering Evaluation:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE : {mae:.4f}")
else:
    print("‚ö†Ô∏è No overlapping test ratings found for evaluation ‚Äî data may be too sparse.")

üìä Collaborative Filtering Evaluation:
  RMSE: 1.3989
  MAE : 1.1765


In [134]:
# ------------------------------------------------------------------
# 6 - Load the labels_df
# ------------------------------------------------------------------

with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

labels_path = config['output_data']['labels_file']

labels_df = pd.read_csv(labels_path)

In [153]:
combined_cf = pd.concat([train_cf, test_cf], ignore_index=True)

In [154]:
test_cf

Unnamed: 0,wine_id,food_id,pairing_quality
0,7,29,2
1,3,19,2
2,21,28,1
3,28,15,5
4,4,5,2
...,...,...,...
6982,4,18,3
6983,12,24,3
6984,28,11,4
6985,26,20,5


In [155]:
train_cf

Unnamed: 0,wine_id,food_id,pairing_quality
0,21,32,2
1,15,16,1
2,3,7,5
3,20,37,1
4,21,33,1
...,...,...,...
27941,11,33,3
27942,6,22,2
27943,16,32,3
27944,23,34,2


In [156]:
combined_cf

Unnamed: 0,wine_id,food_id,pairing_quality
0,21,32,2
1,15,16,1
2,3,7,5
3,20,37,1
4,21,33,1
...,...,...,...
34928,4,18,3
34929,12,24,3
34930,28,11,4
34931,26,20,5


In [160]:
wine_food_pred_df

food_id,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
wine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.979064,2.977465,2.97052,2.915631,2.970468,2.70878,2.957427,2.951218,2.970822,2.960001,...,2.964425,2.980049,2.987618,2.954717,2.708889,2.995716,2.960116,2.952013,2.974907,2.980681
1,3.066951,3.060338,3.051886,3.01739,3.053967,2.818356,3.035955,3.03338,3.042182,3.047159,...,3.04678,3.053352,3.065947,3.047868,2.811473,3.069632,3.038401,3.044502,3.055427,3.061431
2,3.154549,3.148353,3.14032,3.107513,3.141295,2.92033,3.120487,3.119476,3.124909,3.139981,...,3.135239,3.139044,3.154651,3.131927,2.913911,3.154819,3.123473,3.135589,3.143199,3.149172
3,3.030228,3.027105,3.019541,2.973435,3.021181,2.766207,3.005351,2.996104,3.01328,3.012061,...,3.01211,3.02168,3.038499,3.005836,2.765329,3.044332,3.007991,3.009717,3.022857,3.028172
4,3.036084,3.032776,3.025194,2.979516,3.026857,2.772597,3.010943,3.001828,3.018725,3.017902,...,3.017797,3.02729,3.044075,3.011755,2.771595,3.049699,3.013582,3.015677,3.028489,3.033843
5,3.152529,3.149895,3.142306,3.107297,3.14291,2.900096,3.128289,3.119594,3.135179,3.146766,...,3.136033,3.140841,3.156692,3.139843,2.897889,3.156004,3.130839,3.13522,3.145011,3.15231
6,3.085455,3.078836,3.070391,3.035872,3.072465,2.836806,3.054466,3.051849,3.060679,3.065647,...,3.065286,3.071867,3.084457,3.066363,2.829989,3.088154,3.056913,3.062998,3.073939,3.079943
7,3.029334,3.025563,3.017569,2.975514,3.019381,2.771095,3.002784,2.997396,3.010641,3.009806,...,3.011813,3.020044,3.033582,3.008625,2.76702,3.04033,3.005294,3.005818,3.021459,3.026855
8,2.839583,2.831245,2.820282,2.786836,2.823862,2.585458,2.8056,2.80607,2.810148,2.815448,...,2.815935,2.819889,2.836242,2.816995,2.579207,2.836844,2.807944,2.813897,2.824206,2.829939
9,2.993357,2.990624,2.982961,2.940041,2.985356,2.740761,2.96876,2.958619,2.97569,2.979652,...,2.975025,2.982804,3.002348,2.976416,2.740514,3.000501,2.971341,2.976568,2.985824,2.991327


In [157]:
# ------------------------------------------------------------------
# 7 - Recommend Top-3 foods for a given wine
# ------------------------------------------------------------------

def recommend_foods_for_wine(wine_id, labels_df, wine_food_pred_df, combined_cf, n=5):
    """
    Recommend top-N foods for a given wine based on predicted pairing scores,
    using names from the 'labels_df' DataFrame.
    """
    # --- Check if the wine_id exists in the prediction matrix ---
    if wine_id not in wine_food_pred_df.index:
        print(f"‚ö†Ô∏è Wine ID {wine_id} not found in training data.")
        return None

    # --- Get the wine name from labels_df ---
    wine_name = labels_df.loc[labels_df.wine_id == wine_id, "wine_type"].values
    wine_name = wine_name[0] if len(wine_name) > 0 else f"Wine {wine_id}"

    # --- Get predicted pairing scores for all foods ---
    preds = wine_food_pred_df.loc[wine_id]

    # --- Get foods that this wine already paired with in the training data ---
    known_foods = combined_cf.loc[combined_cf.wine_id == wine_id, "food_id"].tolist()

    # --- Remove already-known pairings from recommendations ---
    preds = preds.drop(index=known_foods, errors="ignore")

    # --- Select top-N foods with the highest predicted scores ---
    top_foods = preds.sort_values(ascending=False).head(n)

    # --- Match food IDs with their names from labels_df ---
    recommendations = (
        pd.DataFrame({
            "food_id": top_foods.index,
            "predicted_score": top_foods.values
        })
        .merge(
            labels_df[["food_id", "food_item"]].drop_duplicates(),
            on="food_id", how="left"
        )
        .loc[:, ["food_item", "predicted_score"]]
    )

    # --- Display the final recommendations ---
    print(f"\nüç∑ Top {n} recommended foods for wine '{wine_name}' (ID={wine_id}):")

    return recommendations

In [159]:
recommend_foods_for_wine(3, labels_df, wine_food_pred_df, combined_cf, n=5)


üç∑ Top 5 recommended foods for wine 'Cava' (ID=3):


Unnamed: 0,food_item,predicted_score
0,lemon tart,3.043961
1,pork tenderloin,3.038499
2,duck √† l‚Äôorange,3.032329
3,Indian vindaloo,3.030228
4,vegetable curry,3.028172
