Objective:- Build a recommendation system

In [1]:
pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-macosx_10_9_x86_64.whl size=526446 sha256=68d08027f6a56fb5a0ac8784d96e76f61705b09b93e84f26faef0be83888a281
  Stored in directory: /Users/kuriankgeorge/Library/Caches/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Suc

import libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

Load data

In [2]:
recipe_df = pd.read_csv('/Users/kuriankgeorge/Desktop/aws/Capstone/06_NutriGro/data/NutriGro/core-data_recipe.csv')  
train_interactions = pd.read_csv('/Users/kuriankgeorge/Desktop/aws/Capstone/06_NutriGro/data/NutriGro/core-data-train_rating.csv')

Collaborative Filtering using Surprise SVD

In [3]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

Train the SVD model

In [4]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1411b86d0>

Evaluate on test set

In [5]:
predictions = svd.test(testset)
print("RMSE for Collaborative Filtering:")
accuracy.rmse(predictions)

RMSE for Collaborative Filtering:
RMSE: 0.8127


0.8126654328532433

Use Nearest Neighbors with cosine similarity

In [7]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix)

Function to recommend recipes using Nearest Neighbors

In [8]:
def recommend_recipes_knn(recipe_id, top_n=10):
    idx = recipe_df[recipe_df['recipe_id'] == recipe_id].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=top_n+1)  # +1 to exclude the recipe itself
    recipe_indices = indices.flatten()[1:]  # Exclude the first element (the recipe itself)
    return recipe_df['recipe_name'].iloc[recipe_indices]

Example: Recommend recipes for recipe_id 240488

In [9]:
recommendations_knn = recommend_recipes_knn(240488) 
print("KNN-based recommendations:", recommendations_knn)

KNN-based recommendations: 1245      Slow Cooker Lancaster County Pork and Sauerkraut
1331     Slow Cooker German-Style Pork Roast with Sauer...
10863                     Slow Cooker Sauerkraut Pork Loin
11008                            Pork Roast and Sauerkraut
7189              Bill and Annette's One Pot Kraut Special
6167                             Slow Cooker Kielbasa Stew
11227                           Balsamic Roasted Pork Loin
9104          Pork Roast with Apples, Beer, and Sauerkraut
42580                        Smoked Sausage and Apple Bake
7247              Vern's Roasted Pork Loin Over Sauerkraut
Name: recipe_name, dtype: object


Hybrid Approach (Combine Collaborative Filtering and Content-based Filtering)

In [10]:
def hybrid_recommender(user_id, recipe_id, svd_model, knn, top_n=10, alpha=0.5):
    # Collaborative filtering score
    cf_score = svd_model.predict(user_id, recipe_id).est
    
    # Content-based score using NearestNeighbors
    idx = recipe_df[recipe_df['recipe_id'] == recipe_id].index[0]
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=top_n+1)
    content_score = 1 - distances.flatten().mean()  # Higher similarity leads to lower distance, so invert it
    
    # Hybrid score: weighted combination of both
    return alpha * cf_score + (1 - alpha) * content_score

Example: Hybrid recommendation for user_id 11174581 and recipe_id 240488

In [12]:
hybrid_score = hybrid_recommender(user_id=11174581, recipe_id=240488, svd_model=svd, knn=knn)
print(f"Hybrid recommendation score for user 11174581 and recipe 240488: {hybrid_score}")

Hybrid recommendation score for user 11174581 and recipe 240488: 2.50169762184958
