In [1]:
# Collaborative Filtering - Recomendation system

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import yaml

# ------------------------------------------------------------------
# 1 - Load train/test paths from config.yaml
# ------------------------------------------------------------------
with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

train_path = config['output_data']['train_file']
test_path = config['output_data']['test_file']

print("Loading data from config paths:")
print(f"  Train file: {train_path}")
print(f"  Test file : {test_path}")

Loading data from config paths:
  Train file: ../data/clean/train_file.csv
  Test file : ../data/clean/test_file.csv


In [50]:
# ------------------------------------------------------------------
# 2 - Load clean training & test data
# ------------------------------------------------------------------
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Ensure essential columns exist
required_cols = ['wine_type_id', 'wine_category_id', 'food_item_id', 'food_category_id', 'cuisine_id', 'pairing_quality']
for col in required_cols:
    assert col in train_df.columns, f"‚ùå Missing column '{col}' in train data!"
    assert col in test_df.columns, f"‚ùå Missing column '{col}' in test data!"

# Keep only collaborative filtering columns
train_cf = train_df[['wine_type_id', 'wine_category_id', 'food_item_id', 'food_category_id', 'cuisine_id', 'pairing_quality']]
test_cf = test_df[['wine_type_id', 'wine_category_id', 'food_item_id', 'food_category_id', 'cuisine_id', 'pairing_quality']]

print(f"‚úÖ Train shape: {train_cf.shape}, Test shape: {test_cf.shape}")

‚úÖ Train shape: (27946, 6), Test shape: (6987, 6)


In [51]:
# Create the base pivot using only existing data
test_cf['wine_pair'] = (
    test_cf['wine_type_id'].astype(str) + '-' + test_cf['wine_category_id'].astype(str)
)
test_cf['food_pair'] = (
    test_cf['food_item_id'].astype(str) + '-' +
    test_cf['food_category_id'].astype(str) + '-' +
    test_cf['cuisine_id'].astype(str)
)

display(train_cf.head())
display(test_cf.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_cf['wine_pair'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_cf['food_pair'] = (


Unnamed: 0,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,pairing_quality
0,21,0,32,6,8,2
1,15,2,16,2,7,1
2,3,4,7,6,3,5
3,20,2,37,11,10,1
4,21,0,33,8,13,1


Unnamed: 0,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,pairing_quality,wine_pair,food_pair
0,7,2,29,8,11,2,7-2,29-8-11
1,3,4,19,1,5,2,3-4,19-1-5
2,21,0,28,7,14,1,21-0,28-7-14
3,28,2,15,0,8,5,28-2,15-0-8
4,4,4,5,6,3,2,4-4,5-6-3


In [52]:
import itertools

# Get unique combinations
wine_pairs = (
    train_cf[['wine_type_id', 'wine_category_id']]
    .drop_duplicates()
    .assign(wine_pair=lambda df: df['wine_type_id'].astype(str) + '-' + df['wine_category_id'].astype(str))
)

food_combos = (
    train_cf[['food_item_id', 'food_category_id']]
    .drop_duplicates()
)

cuisines = train_cf['cuisine_id'].unique()

# Create all possible food pairs = (food_item_id, food_category_id) x (cuisine_id)
food_pairs_all = pd.DataFrame(
    list(itertools.product(
        food_combos['food_item_id'], 
        food_combos['food_category_id'], 
        cuisines
    )),
    columns=['food_item_id', 'food_category_id', 'cuisine_id']
)

food_pairs_all['food_pair'] = (
    food_pairs_all['food_item_id'].astype(str) + '-' +
    food_pairs_all['food_category_id'].astype(str) + '-' +
    food_pairs_all['cuisine_id'].astype(str)
)

# Create the base pivot using only existing data
train_cf['wine_pair'] = (
    train_cf['wine_type_id'].astype(str) + '-' + train_cf['wine_category_id'].astype(str)
)
train_cf['food_pair'] = (
    train_cf['food_item_id'].astype(str) + '-' +
    train_cf['food_category_id'].astype(str) + '-' +
    train_cf['cuisine_id'].astype(str)
)

pair_matrix = train_cf.pivot_table(
    index='wine_pair',
    columns='food_pair',
    values='pairing_quality',
    aggfunc='mean'
)

# Reindex to include *all* possible food pairs
pair_matrix = pair_matrix.reindex(columns=food_pairs_all['food_pair'], fill_value=pd.NA)

print(pair_matrix.head())

food_pair    32-6-8    32-6-7    32-6-3   32-6-10   32-6-13   32-6-11  32-6-9  \
wine_pair                                                                       
0-5        1.500000  2.750000  3.000000  3.333333  2.750000  3.000000    2.75   
1-2        2.000000  2.750000  2.666667  2.000000  2.666667  2.000000    2.75   
10-5       2.750000  2.000000  2.750000  3.333333  2.000000  2.666667    2.75   
11-0       3.333333  2.666667  2.000000  3.000000  2.750000  2.750000    2.50   
12-1       2.000000       NaN  2.000000  2.750000  1.000000  2.666667    3.00   

food_pair    32-6-4    32-6-1  32-6-16  ...  1-10-4  1-10-1  1-10-16  1-10-2  \
wine_pair                               ...                                    
0-5        2.000000  2.750000     2.75  ...     NaN     NaN      NaN     NaN   
1-2        2.666667  2.750000     2.75  ...     NaN     NaN      NaN     NaN   
10-5       2.000000  3.000000     2.50  ...     NaN     NaN      NaN     NaN   
11-0       1.500000  2.666667   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_cf['wine_pair'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_cf['food_pair'] = (


In [53]:
# ------------------------------------------------------------------
# 3 - Compute wine‚Äìwine similarity (rows = wines, columns = foods)
# ------------------------------------------------------------------

# Replace pd.NA with np.nan and convert to float
pair_matrix_numeric = pair_matrix.replace({pd.NA: np.nan}).to_numpy(dtype=float)

# Fill missing values with 0 before computing similarity
pair_matrix_filled = np.nan_to_num(pair_matrix_numeric)

# Compute cosine similarity between wine rows
wine_similarity = cosine_similarity(pair_matrix_filled)

# Convert back to a labeled DataFrame for readability
wine_similarity_df = pd.DataFrame(
    wine_similarity,
    index=pair_matrix.index,    # wine_pair names (rows)
    columns=pair_matrix.index   # wine_pair names (columns)
)

print("‚úÖ Wine‚ÄìWine similarity matrix computed:")
print(wine_similarity_df.head())

  pair_matrix_numeric = pair_matrix.replace({pd.NA: np.nan}).to_numpy(dtype=float)


‚úÖ Wine‚ÄìWine similarity matrix computed:
wine_pair       0-5       1-2      10-5      11-0      12-1      13-2  \
wine_pair                                                               
0-5        1.000000  0.626354  0.971439  0.741712  0.736660  0.562448   
1-2        0.626354  1.000000  0.618121  0.527284  0.797522  0.864050   
10-5       0.971439  0.618121  1.000000  0.735951  0.732912  0.551261   
11-0       0.741712  0.527284  0.735951  1.000000  0.660170  0.597705   
12-1       0.736660  0.797522  0.732912  0.660170  1.000000  0.711247   

wine_pair      14-2      15-2      16-2      17-1  ...      26-5      27-3  \
wine_pair                                          ...                       
0-5        0.567207  0.692315  0.627030  0.731167  ...  0.541315  0.622006   
1-2        0.733887  0.659425  0.768113  0.777884  ...  0.643001  0.953159   
10-5       0.562273  0.688951  0.619416  0.723676  ...  0.547399  0.606643   
11-0       0.630577  0.627158  0.648772  0.655994  ...

In [54]:
# ------------------------------------------------------------------
# 4 - Predict ratings for unseen wine‚Äìfood pairs
# ------------------------------------------------------------------
def predict_cf(user_item_matrix, similarity_matrix):
    """
    Collaborative filtering prediction using weighted sum of similar users' ratings.
    """
    # Convert to DataFrame and ensure all missing values are np.nan
    user_item_df = pd.DataFrame(user_item_matrix).replace({pd.NA: np.nan})
    
    # Convert safely to a numeric NumPy array
    user_item_matrix = user_item_df.to_numpy(dtype=float)
    
    # Compute mean rating per user (ignoring NaNs)
    mean_user_rating = np.nanmean(user_item_matrix, axis=1)
    
    # Center ratings by subtracting the user mean
    ratings_diff = np.where(np.isnan(user_item_matrix), 0,
                            user_item_matrix - mean_user_rating[:, np.newaxis])
    
    # Weighted sum of neighbors' deviations
    pred = mean_user_rating[:, np.newaxis] + (similarity_matrix.dot(ratings_diff) / np.abs(similarity_matrix).sum(axis=1)[:, np.newaxis])
    
    # Replace remaining NaN with zeros (optional)
    return np.nan_to_num(pred)


# ------------------------------------------------------------------
# Generate predictions for all wine‚Äìfood pairs
# ------------------------------------------------------------------
# Replace pd.NA ‚Üí np.nan before calling the function
pair_matrix = pair_matrix.replace({pd.NA: np.nan})

wine_food_pred = predict_cf(pair_matrix.values, wine_similarity)

# Convert predictions back to a DataFrame
wine_food_pred_df = pd.DataFrame(
    wine_food_pred,
    index=pair_matrix.index,
    columns=pair_matrix.columns
)

print("‚úÖ Collaborative filtering predictions computed.")

print(wine_food_pred_df.head())

  pair_matrix = pair_matrix.replace({pd.NA: np.nan})


‚úÖ Collaborative filtering predictions computed.
food_pair    32-6-8    32-6-7    32-6-3   32-6-10   32-6-13   32-6-11  \
wine_pair                                                               
0-5        2.536666  2.750118  2.503231  2.885580  2.608578  2.813128   
1-2        2.658318  2.895145  2.596269  3.003427  2.738998  2.946653   
10-5       2.568234  2.778701  2.534467  2.915664  2.636811  2.842479   
11-0       2.655040  2.833770  2.574451  2.943081  2.699790  2.898145   
12-1       2.648185  2.877628  2.570760  3.000661  2.694708  2.932100   

food_pair    32-6-9    32-6-4    32-6-1   32-6-16  ...    1-10-4    1-10-1  \
wine_pair                                          ...                       
0-5        2.744646  2.663656  2.656960  2.742140  ...  2.937438  2.928785   
1-2        2.859471  2.735077  2.723295  2.852205  ...  3.018373  3.021899   
10-5       2.773956  2.693933  2.688268  2.771864  ...  2.968434  2.959473   
11-0       2.806258  2.750382  2.726217  2.84237

In [55]:
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

# ------------------------------------------------------------------
# 5 - Evaluate Collaborative Filtering model
# ------------------------------------------------------------------
true_ratings = []
pred_ratings = []

for _, row in test_cf.iterrows():
    u, i, true_score = row.wine_pair, row.food_pair, row.pairing_quality

    # Check if the pair exists in prediction matrix
    if (u in wine_food_pred_df.index) and (i in wine_food_pred_df.columns):
        pred_score = wine_food_pred_df.loc[u, i]

        # If it's a Series (multiple values), take the first
        if isinstance(pred_score, pd.Series):
            pred_score = pred_score.iloc[0]

        # Try to convert safely to float
        try:
            pred_score = float(pred_score)
            true_score = float(true_score)
        except (TypeError, ValueError):
            continue  # skip invalid rows

        # Add only finite values
        if np.isfinite(pred_score) and np.isfinite(true_score):
            true_ratings.append(true_score)
            pred_ratings.append(pred_score)

# ------------------------------------------------------------------
# Compute metrics
# ------------------------------------------------------------------
if len(pred_ratings) > 0:
    rmse = sqrt(mean_squared_error(true_ratings, pred_ratings))
    mae = mean_absolute_error(true_ratings, pred_ratings)
    print(f"üìä Collaborative Filtering Evaluation:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE : {mae:.4f}")
    print(f"  ‚úÖ Evaluated {len(pred_ratings)} test pairs successfully.")
else:
    print("‚ö†Ô∏è No overlapping test ratings found for evaluation ‚Äî data may be too sparse.")

üìä Collaborative Filtering Evaluation:
  RMSE: 1.4192
  MAE : 1.1973
  ‚úÖ Evaluated 6987 test pairs successfully.


In [26]:
# ------------------------------------------------------------------
# 5 - Evaluate Collaborative Filtering model
# ------------------------------------------------------------------
true_ratings = []
pred_ratings = []

for _, row in test_cf.iterrows():
    u, i, true_score = int(row.wine_id), int(row.food_id), row.pairing_quality
    if (u in wine_food_pred_df.index) and (i in wine_food_pred_df.columns):
        pred_score = wine_food_pred_df.loc[u, i]
        true_ratings.append(true_score)
        pred_ratings.append(pred_score)

if len(pred_ratings) > 0:
    rmse = sqrt(mean_squared_error(true_ratings, pred_ratings))
    mae = mean_absolute_error(true_ratings, pred_ratings)
    print(f"üìä Collaborative Filtering Evaluation:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE : {mae:.4f}")
else:
    print("‚ö†Ô∏è No overlapping test ratings found for evaluation ‚Äî data may be too sparse.")

AttributeError: 'Series' object has no attribute 'wine_id'

In [56]:
# ------------------------------------------------------------------
# 6 - Load the labels_df
# ------------------------------------------------------------------

with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

labels_path = config['output_data']['labels_file']

labels_df = pd.read_csv(labels_path)

In [57]:
combined_cf = pd.concat([train_cf, test_cf], ignore_index=True)

In [58]:
test_cf

Unnamed: 0,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,pairing_quality,wine_pair,food_pair
0,7,2,29,8,11,2,7-2,29-8-11
1,3,4,19,1,5,2,3-4,19-1-5
2,21,0,28,7,14,1,21-0,28-7-14
3,28,2,15,0,8,5,28-2,15-0-8
4,4,4,5,6,3,2,4-4,5-6-3
...,...,...,...,...,...,...,...,...
6982,4,4,18,8,4,3,4-4,18-8-4
6983,12,1,24,3,8,3,12-1,24-3-8
6984,28,2,11,1,5,4,28-2,11-1-5
6985,26,5,20,0,1,5,26-5,20-0-1


In [59]:
train_cf

Unnamed: 0,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,pairing_quality,wine_pair,food_pair
0,21,0,32,6,8,2,21-0,32-6-8
1,15,2,16,2,7,1,15-2,16-2-7
2,3,4,7,6,3,5,3-4,7-6-3
3,20,2,37,11,10,1,20-2,37-11-10
4,21,0,33,8,13,1,21-0,33-8-13
...,...,...,...,...,...,...,...,...
27941,11,0,33,8,16,3,11-0,33-8-16
27942,6,5,22,6,5,2,6-5,22-6-5
27943,16,2,32,6,13,3,16-2,32-6-13
27944,23,2,34,9,7,2,23-2,34-9-7


In [60]:
combined_cf

Unnamed: 0,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,pairing_quality,wine_pair,food_pair
0,21,0,32,6,8,2,21-0,32-6-8
1,15,2,16,2,7,1,15-2,16-2-7
2,3,4,7,6,3,5,3-4,7-6-3
3,20,2,37,11,10,1,20-2,37-11-10
4,21,0,33,8,13,1,21-0,33-8-13
...,...,...,...,...,...,...,...,...
34928,4,4,18,8,4,3,4-4,18-8-4
34929,12,1,24,3,8,3,12-1,24-3-8
34930,28,2,11,1,5,4,28-2,11-1-5
34931,26,5,20,0,1,5,26-5,20-0-1


In [61]:
wine_food_pred_df

food_pair,32-6-8,32-6-7,32-6-3,32-6-10,32-6-13,32-6-11,32-6-9,32-6-4,32-6-1,32-6-16,...,1-10-4,1-10-1,1-10-16,1-10-2,1-10-0,1-10-14,1-10-15,1-10-12,1-10-6,1-10-5
wine_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-5,2.536666,2.750118,2.503231,2.88558,2.608578,2.813128,2.744646,2.663656,2.65696,2.74214,...,2.937438,2.928785,2.910867,2.96438,2.960721,2.937743,2.942803,2.928156,2.97181,2.944753
1-2,2.658318,2.895145,2.596269,3.003427,2.738998,2.946653,2.859471,2.735077,2.723295,2.852205,...,3.018373,3.021899,3.007295,3.050657,3.054867,3.029201,3.040263,3.016533,3.060608,3.038383
10-5,2.568234,2.778701,2.534467,2.915664,2.636811,2.842479,2.773956,2.693933,2.688268,2.771864,...,2.968434,2.959473,2.941476,2.995179,2.991358,2.968471,2.973361,2.958944,3.002612,2.975415
11-0,2.65504,2.83377,2.574451,2.943081,2.69979,2.898145,2.806258,2.750382,2.726217,2.842374,...,3.018199,3.008423,2.991255,3.042449,3.038007,3.017007,3.020839,3.008445,3.049181,3.023215
12-1,2.648185,2.877628,2.57076,3.000661,2.694708,2.9321,2.855439,2.741716,2.733107,2.844842,...,3.027066,3.030288,3.016894,3.056752,3.060527,3.036985,3.047133,3.025386,3.065775,3.045407
13-2,2.820829,3.041703,2.742055,3.139123,2.890275,3.095192,3.011358,2.885354,2.857587,3.011534,...,3.154293,3.157469,3.143392,3.185359,3.189198,3.164508,3.175121,3.152526,3.19461,3.173334
14-2,2.663652,2.876622,2.637417,2.995094,2.752354,2.949506,2.836989,2.751953,2.750623,2.867196,...,3.030057,3.022872,3.004018,3.060349,3.057774,3.032298,3.038921,3.021381,3.068766,3.040323
15-2,2.608808,2.840847,2.59368,2.957453,2.714599,2.91944,2.810909,2.732585,2.719516,2.796973,...,3.001371,2.98656,2.960899,3.03775,3.030825,2.99939,3.005163,2.986918,3.047236,3.008692
16-2,2.719732,2.913428,2.625443,3.018655,2.764202,2.977732,2.861695,2.773999,2.745752,2.919978,...,3.054103,3.057391,3.0442,3.083474,3.087306,3.063986,3.074115,3.052587,3.092322,3.072348
17-1,2.422897,2.650981,2.343487,2.776028,2.468734,2.70846,2.627739,2.516531,2.50811,2.6181,...,2.80173,2.805315,2.791968,2.831958,2.835802,2.811989,2.822455,2.800402,2.840699,2.820559


In [62]:
# ------------------------------------------------------------------
# 7 - Recommend Top-5 foods for a given wine
# ------------------------------------------------------------------

def recommend_foods_for_wine(wine_id, labels_df, wine_food_pred_df, combined_cf, n=5):
    """
    Recommend top-N foods for a given wine based on predicted pairing scores,
    using names from the 'labels_df' DataFrame.
    """
    # --- Check if the wine_id exists in the prediction matrix ---
    if wine_id not in wine_food_pred_df.index:
        print(f"‚ö†Ô∏è Wine ID {wine_id} not found in training data.")
        return None

    # --- Get the wine name from labels_df ---
    wine_name = labels_df.loc[labels_df.wine_id == wine_id, "wine_type"].values
    wine_name = wine_name[0] if len(wine_name) > 0 else f"Wine {wine_id}"

    # --- Get predicted pairing scores for all foods ---
    preds = wine_food_pred_df.loc[wine_id]

    # --- Get foods that this wine already paired with in the training data ---
    known_foods = combined_cf.loc[combined_cf.wine_id == wine_id, "food_id"].tolist()

    # --- Remove already-known pairings from recommendations ---
    preds = preds.drop(index=known_foods, errors="ignore")

    # --- Select top-N foods with the highest predicted scores ---
    top_foods = preds.sort_values(ascending=False).head(n)

    # --- Match food IDs with their names from labels_df ---
    recommendations = (
        pd.DataFrame({
            "food_id": top_foods.index,
            "predicted_score": top_foods.values
        })
        .merge(
            labels_df[["food_id", "food_item"]].drop_duplicates(),
            on="food_id", how="left"
        )
        .loc[:, ["food_item", "predicted_score"]]
    )

    # --- Display the final recommendations ---
    print(f"\nüç∑ Top {n} recommended foods for wine '{wine_name}' (ID={wine_id}):")

    return recommendations

In [63]:
recommend_foods_for_wine(3, labels_df, wine_food_pred_df, combined_cf, n=5)

‚ö†Ô∏è Wine ID 3 not found in training data.
