# Collaborative Based Recommendation
## Funk Singular Value Decomposition (FunkSVD)

In [1]:
# Import the basic packages
import numpy as np 
import pandas as pd 

# Import the surprise packages
from surprise import Dataset
from surprise.reader import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD

In [2]:
rating_df = pd.read_pickle("clean_interactions.pkl")
recipes_df = pd.read_pickle("clean_recipes.pkl")

In [3]:
# Check the recipe dataset
display(rating_df.head())
print('Shape: ', rating_df.shape)
print('Number of Unique Users:',len(rating_df['user_id'].unique()))
print('Number of Unique recipes:',len(rating_df['recipe_id'].unique()))

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,Feb,2003
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",Dec,2011
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,Dec,2002
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,Feb,2010
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",Oct,2011


Shape:  (1071351, 7)
Number of Unique Users: 195947
Number of Unique recipes: 226583


In [4]:
# Sample 0.5% of the dataset
sampled_data = rating_df.sample(frac=0.005, random_state=42)

# Print the shape of the sampled dataset
print("Shape of Sampled Dataset:", sampled_data.shape)

Shape of Sampled Dataset: (5357, 7)


In [5]:
# Check the min and max value of rating columns
print(f'Min:{sampled_data["rating"].min()}')
print(f'Max:{sampled_data["rating"].max()}')

Min:1
Max:5


In [6]:
# check if there is null values in the rating
sampled_data[sampled_data['rating'].isna()]

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year


In [7]:
sampled_data['user_id'].value_counts()

424680     39
37449      37
383346     26
169430     24
498271     22
           ..
798119      1
630047      1
202555      1
209255      1
1444456     1
Name: user_id, Length: 3618, dtype: int64

In [8]:
# Check number of recipes rated by user 39835
user_39835 = sampled_data[sampled_data['user_id'] == '39835']
user_39835.shape

(10, 7)

In [9]:
# find the number of recipe user 39835 rated as 5/5
sum(user_39835['rating'] == 5)

5

In [10]:
# sort the recipes of user 39835 by the rating in descending order (5->1)
user_39835.sort_values("rating", ascending=False).head(10)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
216942,39835,209365,2007-05-31,5,We really enjoyed this simple to make dish whi...,May,2007
1099953,39835,54698,2003-03-09,5,We loved this simple-to-make skillet supper. W...,Mar,2003
533607,39835,116382,2006-01-08,5,Delicious--a differet wonderful pork stew! We ...,Jan,2006
538052,39835,64629,2004-06-17,5,Delicious and perfect for apricot lovers! I ma...,Jun,2004
778761,39835,962,2005-01-23,5,Absolutely delicious! I added some leftover ch...,Jan,2005
1028376,39835,62634,2003-05-26,4,Although I used the vegetables that were liste...,May,2003
705547,39835,464515,2011-11-29,4,Very easy and good -- and this recipe uses thi...,Nov,2011
954346,39835,385916,2009-09-24,4,This was a great with burgers! I changed the ...,Sep,2009
621865,39835,493179,2013-01-20,4,Easy and good! Thanks for sharing!,Jan,2013
423779,39835,70110,2006-05-15,3,Good for a quick weeknight meal but I think I ...,May,2006


In [11]:
# Set the reader with accurate rating scale
my_reader = Reader(rating_scale=(1, 5))

# Set the dataset
# Remember that the df parameter has to have 3 columns:
# User ids, Item ids (recipe), Ratings
my_dataset = Dataset.load_from_df(sampled_data[["user_id", "recipe_id", "rating"]], my_reader)
my_dataset

<surprise.dataset.DatasetAutoFolds at 0x2723b632910>

In [12]:
# Import GridSearchCV for algorithm tuning
from surprise.model_selection import GridSearchCV

# Set the parameter grid
param_grid = {
    'n_factors': [100, 150],
    'n_epochs': [10, 20],
    'lr_all': [0.005, 0.1],
    'biased': [False] } #The parameter indicates to the algorithm that all latent information must be stored.

# Set GridSearchCV with 3 cross validation
GS = GridSearchCV(FunkSVD, param_grid, measures=['fcp'], cv=3)

# Fit the model
GS.fit(my_dataset)

In [13]:
# Check the FCP accuracy score (1.0 is ideal and 0 is worst)
GS.best_score['fcp']

0.3440389436137017

In [14]:
# Check the best parameters
GS.best_params['fcp']

{'n_factors': 100, 'n_epochs': 10, 'lr_all': 0.005, 'biased': False}

In [15]:
# Import train_test_split
from surprise.model_selection import train_test_split

# Split train test set
trainset, testset = train_test_split(my_dataset, test_size=0.25)

# Set the algorithm
my_svd = FunkSVD(n_factors=100, 
                 n_epochs=20, 
                 lr_all=0.005, 
                 biased=False,
                 verbose=0)
# Fit train set
my_svd.fit(trainset)

# Test the algorithm using test set
my_pred = my_svd.test(testset)

In [16]:
# Put my_pred result in a dataframe
df_prediction = pd.DataFrame(my_pred, columns=['user_id',
                                                     'recipe_id',
                                                     'actual',
                                                     'prediction',
                                                     'details'])

# Calculate the difference of actual and prediction into diff column
df_prediction['diff'] = abs(df_prediction['prediction'] - 
                            df_prediction['actual'])

In [17]:
# Check the df_prediction
df_prediction.head()

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
0,2000748342,274173,5.0,4.661688,"{'was_impossible': True, 'reason': 'User and i...",0.338312
1,222564,51730,5.0,4.661688,"{'was_impossible': True, 'reason': 'User and i...",0.338312
2,451226,78350,4.0,4.661688,"{'was_impossible': True, 'reason': 'User and i...",0.661688
3,1791723,261935,5.0,4.661688,"{'was_impossible': True, 'reason': 'User and i...",0.338312
4,56061,31235,5.0,4.661688,"{'was_impossible': True, 'reason': 'User and i...",0.338312


In [18]:
# See the worst 10 predictions
df_prediction.sort_values(by='diff')[-10:]

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
16,32772,44730,5.0,1.0,{'was_impossible': False},4.0
431,98994,69173,5.0,1.0,{'was_impossible': False},4.0
42,193516,26215,5.0,1.0,{'was_impossible': False},4.0
990,52448,110583,5.0,1.0,{'was_impossible': False},4.0
976,180898,40118,5.0,1.0,{'was_impossible': False},4.0
781,561908,263983,5.0,1.0,{'was_impossible': False},4.0
1277,47892,101101,5.0,1.0,{'was_impossible': False},4.0
813,229850,29480,5.0,1.0,{'was_impossible': False},4.0
1303,30534,18725,5.0,1.0,{'was_impossible': False},4.0
362,586291,325674,5.0,1.0,{'was_impossible': False},4.0


In [19]:
# Check total rows with same actual and prediction ratings
df_prediction[df_prediction['diff'] <= 0]

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff


In [20]:
(df_prediction['diff'] == 0).mean()

0.0

In [21]:
(df_prediction["diff"] <= 1).mean()

0.9171641791044776

In [22]:
# Build full trainset
full_trainset = my_dataset.build_full_trainset()

# Build the SVD algorithm
my_svd = FunkSVD(n_factors=100, 
                 n_epochs=20, 
                 lr_all=0.005,    
                 biased=False, 
                 verbose=0)

# Fit with full trainset
my_svd.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x272046c6460>

In [23]:
# Define the full test set
full_testset = full_trainset.build_anti_testset(fill=-1)

In [24]:
# Set the prediction
my_prediction = my_svd.test(full_testset)

In [25]:
# Put into a dataframe
df_prediction = pd.DataFrame(my_prediction, columns=['user_id',
                                                     'recipe_id',
                                                     'actual',
                                                     'prediction',
                                                     'details'])

In [26]:
# Check user id `39835` predictions
df = df_prediction[df_prediction['user_id'] == '39835']\
    .sort_values(by=['prediction'], ascending=False)\
    .head()

display(df)

Unnamed: 0,user_id,recipe_id,actual,prediction,details
3458874,39835,108819,-1.0,1.0,{'was_impossible': False}
3462114,39835,141437,-1.0,1.0,{'was_impossible': False}
3462121,39835,43722,-1.0,1.0,{'was_impossible': False}
3462120,39835,304293,-1.0,1.0,{'was_impossible': False}
3462119,39835,136990,-1.0,1.0,{'was_impossible': False}


In [27]:
# Merge with the recipe data
merge_df = df.merge(recipes_df[["id", "name", "tags", "description"]].drop_duplicates(), how='left',
                    left_on=['recipe_id'], right_on=['id'])

# Check recipe of user 39835
merge_df

Unnamed: 0,user_id,recipe_id,actual,prediction,details,id,name,tags,description
0,39835,108819,-1.0,1.0,{'was_impossible': False},108819,good for you berry truffles,"['lactose', '30-minutes-or-less', 'time-to-mak...",this is just the thing for when you get a choc...
1,39835,141437,-1.0,1.0,{'was_impossible': False},141437,simply fruit,"['15-minutes-or-less', 'time-to-make', 'course...","from light and tasty jan 2003. very simple, re..."
2,39835,43722,-1.0,1.0,{'was_impossible': False},43722,spinach salad with blue cheese,"['15-minutes-or-less', 'time-to-make', 'course...",if you like blue cheese you'll love this salad...
3,39835,304293,-1.0,1.0,{'was_impossible': False},304293,m m popcorn cake,"['60-minutes-or-less', 'time-to-make', 'course...","this is from ingrid hoffman, ""simply delicioso..."
4,39835,136990,-1.0,1.0,{'was_impossible': False},136990,mexican squash and ground beef casserole,"['60-minutes-or-less', 'time-to-make', 'course...","a simple combination of yellow squash, ground ..."


In [30]:
merge_df.sort_values("prediction", ascending=False).head(10)

Unnamed: 0,user_id,recipe_id,actual,prediction,details,id,name,tags,description
0,39835,108819,-1.0,1.0,{'was_impossible': False},108819,good for you berry truffles,"['lactose', '30-minutes-or-less', 'time-to-mak...",this is just the thing for when you get a choc...
1,39835,141437,-1.0,1.0,{'was_impossible': False},141437,simply fruit,"['15-minutes-or-less', 'time-to-make', 'course...","from light and tasty jan 2003. very simple, re..."
2,39835,43722,-1.0,1.0,{'was_impossible': False},43722,spinach salad with blue cheese,"['15-minutes-or-less', 'time-to-make', 'course...",if you like blue cheese you'll love this salad...
3,39835,304293,-1.0,1.0,{'was_impossible': False},304293,m m popcorn cake,"['60-minutes-or-less', 'time-to-make', 'course...","this is from ingrid hoffman, ""simply delicioso..."
4,39835,136990,-1.0,1.0,{'was_impossible': False},136990,mexican squash and ground beef casserole,"['60-minutes-or-less', 'time-to-make', 'course...","a simple combination of yellow squash, ground ..."


In [29]:
user = rating_df[rating_df['user_id'] == '39835']
test = user.merge(recipes_df[["id", "name", "tags", "description"]].drop_duplicates(), how='left',
                    left_on=['recipe_id'], right_on=['id'])
test.sort_values("rating", ascending=False).head(10)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year,id,name,tags,description
1124,39835,126160,2005-06-17,5,Delicious and so easy! I increased the ingredi...,Jun,2005,126160.0,cheesy chive blossom omelet,"['15-minutes-or-less', 'time-to-make', 'course...","from recipesource, this is a lovely way to use..."
1314,39835,62291,2007-06-10,5,This is wonderful custard!!! I used 2 % milk a...,Jun,2007,62291.0,coconut custard with mango and whipped cream,"['60-minutes-or-less', 'time-to-make', 'course...","simple, easy and tropical. i think i need a va..."
1274,39835,46332,2003-03-23,5,"We all loved this chili, my ds said it was the...",Mar,2003,,,,
1272,39835,184846,2011-03-19,5,Very good and more filling than I expected -- ...,Mar,2011,184846.0,crock pot spinach tomato vegetable soup,"['time-to-make', 'course', 'main-ingredient', ...",modified version of a recipe from the weight w...
1271,39835,69631,2003-11-13,5,Delicious! I admit I had doubts that these wou...,Nov,2003,69631.0,cheesecake cupcakes 1,"['weeknight', 'time-to-make', 'course', 'main-...",these cupcakes are delicious! i got the recipe...
1270,39835,392343,2010-01-02,5,Delicious and hearty soup! We served this with...,Jan,2010,392343.0,sun dried tomato soup with wild mushrooms and ...,"['time-to-make', 'course', 'preparation', 'low...",an earthy and robust soup that is heart health...
1265,39835,53914,2014-02-01,5,Delicious! I used the tilapia and the rest of ...,Feb,2014,53914.0,mama s supper club tilapia parmesan,"['60-minutes-or-less', 'time-to-make', 'course...",you asked for it. here it is. thanks so much t...
1263,39835,79001,2006-01-19,5,Very good! The changes I've made are using bab...,Jan,2006,79001.0,chicken n rice gumbo,"['time-to-make', 'course', 'cuisine', 'prepara...",this recipe is from betty crocker slow cooker ...
1258,39835,335386,2009-01-08,5,Delicious and wonderful on a cold night! The c...,Jan,2009,335386.0,crock pot harvest stoup,"['course', 'preparation', 'healthy', 'main-dis...",this is really good. my family raved about it ...
1257,39835,34719,2003-04-23,5,Easy and great to have muffins ready to pop in...,Apr,2003,34719.0,refrigerator bran muffins mix,"['30-minutes-or-less', 'time-to-make', 'course...",this is really great to have on hand when you ...
