# Collaborative Based Recommendation
## Funk Singular Value Decomposition (FunkSVD)

In [2]:
# Import the basic packages
import numpy as np 
import pandas as pd 

# Import the surprise packages
from surprise import Dataset
from surprise.reader import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD

In [3]:
rating_df = pd.read_pickle("clean_interactions.pkl")
recipes_df = pd.read_pickle("clean_recipes.pkl")

In [4]:
# Check the recipe dataset
display(rating_df.head())
print('Shape: ', rating_df.shape)
print('Number of Unique Users:',len(rating_df['user_id'].unique()))
print('Number of Unique recipes:',len(rating_df['recipe_id'].unique()))

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,Feb,2003
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",Dec,2011
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,Dec,2002
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,Feb,2010
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",Oct,2011


Shape:  (1071351, 7)
Number of Unique Users: 195947
Number of Unique recipes: 226583


In [5]:
# Sample 10% of the dataset
sampled_data = rating_df.sample(frac=0.1, random_state=42)

# Print the shape of the sampled dataset
print("Shape of Sampled Dataset:", sampled_data.shape)

Shape of Sampled Dataset: (107135, 7)


In [6]:
# Check the min and max value of rating columns
print(f'Min:{sampled_data["rating"].min()}')
print(f'Max:{sampled_data["rating"].max()}')

Min:1
Max:5


In [7]:
# check if there is null values in the rating
sampled_data[sampled_data['rating'].isna()]

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year


In [8]:
sampled_data['user_id'].value_counts()

424680        799
37449         546
383346        471
169430        416
128473        362
             ... 
412587          1
255665          1
1574335         1
2001273343      1
2001718129      1
Name: user_id, Length: 37123, dtype: int64

In [17]:
# Check number of recipes rated by user 383346
user_383346 = sampled_data[sampled_data['user_id'] == '383346']
user_383346.shape

(471, 7)

In [18]:
# find the number of recipe user 424680 rated as 5/5
sum(user_383346['rating'] == 5)

373

In [19]:
# sort the recipes of user 383346 by the rating in descending order (5->1)
user_383346.sort_values("rating", ascending=False).head(10)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
722134,383346,83812,2010-09-26,5,I love salmon pie and this one is healthier th...,Sep,2010
341442,383346,14860,2008-01-28,5,I used instant low-sodium chicken bouillon. T...,Jan,2008
578544,383346,422484,2010-06-09,5,I loved it. I omitted the grenadine. Thanks ...,Jun,2010
291386,383346,467609,2011-11-15,5,I made half the recipe for the 3 of us. It wa...,Nov,2011
760464,383346,309754,2008-07-03,5,I used only all purpose flour. I had no fat p...,Jul,2008
1037130,383346,387194,2009-09-12,5,wow this is a delicious grilled cheese. We wa...,Sep,2009
477151,383346,365408,2011-06-07,5,That's a lot of vanilla but it made them taste...,Jun,2011
513654,383346,202388,2008-07-14,5,I love the taste of this coffee. It was for D...,Jul,2008
723000,383346,457033,2011-05-27,5,The taste is really good. Thanks alligirl :) ...,May,2011
1103783,383346,433220,2010-10-03,5,This is great. I had it in hot dogs. I will ...,Oct,2010


In [20]:
# recipes with lowest rating by user
user_383346.sort_values("rating", ascending=False).tail(10)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
375330,383346,261673,2013-06-03,4,This salad was good. But we found that someth...,Jun,2013
716884,383346,189523,2013-07-17,4,These muffins had a great taste but I think th...,Jul,2013
911707,383346,476615,2012-04-18,4,The taste of these waffles are very good. Not...,Apr,2012
1019487,383346,44080,2010-05-24,3,I used a small cabbage coarsely chopped. Afte...,May,2010
361435,383346,348429,2009-01-24,3,I used 1 tbs of extra virgin olive oil to cook...,Jan,2009
703992,383346,253919,2008-01-14,3,Sorry for this rating Chia. I used big shrimp...,Jan,2008
693332,383346,502884,2013-08-09,3,I&#039;m so sorry but for us it didn&#039;t wo...,Aug,2013
441160,383346,429272,2011-10-10,3,I wish I'd like them more. They were hard to ...,Oct,2011
15946,383346,7859,2010-12-27,3,I had to add more pineapple juice and a little...,Dec,2010
937599,383346,168001,2007-10-22,3,Sorry for the not so good rating. I didn't li...,Oct,2007


In [23]:
# Set the reader with accurate rating scale
my_reader = Reader(rating_scale=(1, 10))

# Set the dataset
# Remember that the df parameter has to have 3 columns:
# User ids, Item ids (anime), Ratings
my_dataset = Dataset.load_from_df(sampled_data[["user_id", "recipe_id", "rating"]], my_reader)
my_dataset

<surprise.dataset.DatasetAutoFolds at 0x14ae8168910>

In [24]:
# Import GridSearchCV for algorithm tuning
from surprise.model_selection import GridSearchCV

# Set the parameter grid
param_grid = {
    'n_factors': [100, 150], 
    'n_epochs': [10, 20],
    'lr_all': [0.005, 0.1],
    'biased': [False] } #The parameter indicates to the algorithm that all latent information must be stored. 

# Set GridSearchCV with 3 cross validation
GS = GridSearchCV(FunkSVD, param_grid, measures=['fcp'], cv=3)

# Fit the model
GS.fit(my_dataset)

In [25]:
# Check the FCP accuracy score (1.0 is ideal and 0 is worst)
GS.best_score['fcp']

0.37736715162957685

In [26]:
# Check the best parameters
GS.best_params['fcp']

{'n_factors': 150, 'n_epochs': 20, 'lr_all': 0.1, 'biased': False}

In [27]:
# Import train_test_split
from surprise.model_selection import train_test_split

# Split train test set
trainset, testset = train_test_split(my_dataset, test_size=0.25)

# Set the algorithm
my_svd = FunkSVD(n_factors=100, 
                 n_epochs=20, 
                 lr_all=0.005, 
                 biased=False,
                 verbose=0)
# Fit train set
my_svd.fit(trainset)

# Test the algorithm using test set
my_pred = my_svd.test(testset)

In [28]:
# Put my_pred result in a dataframe
df_prediction = pd.DataFrame(my_pred, columns=['user_id',
                                                     'recipe_id',
                                                     'actual',
                                                     'prediction',
                                                     'details'])

# Calculate the difference of actual and prediction into diff column
df_prediction['diff'] = abs(df_prediction['prediction'] - 
                            df_prediction['actual'])

In [29]:
# Check the df_prediction
df_prediction.head()

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
0,965948,133511,5.0,4.660029,"{'was_impossible': True, 'reason': 'User and i...",0.339971
1,1529208,85328,5.0,4.660029,"{'was_impossible': True, 'reason': 'User and i...",0.339971
2,538524,232785,5.0,4.660029,"{'was_impossible': True, 'reason': 'User and i...",0.339971
3,1304040,19559,5.0,1.0,{'was_impossible': False},4.0
4,2002219147,536270,5.0,4.660029,"{'was_impossible': True, 'reason': 'User and i...",0.339971


In [30]:
# See the worst 10 predictions
df_prediction.sort_values(by='diff')[-10:]

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
3033,1057310,14593,5.0,1.0,{'was_impossible': False},4.0
7181,153472,132350,5.0,1.0,{'was_impossible': False},4.0
6015,166371,200025,5.0,1.0,{'was_impossible': False},4.0
20432,146486,82477,5.0,1.0,{'was_impossible': False},4.0
15789,27783,99534,5.0,1.0,{'was_impossible': False},4.0
20430,28649,8509,5.0,1.0,{'was_impossible': False},4.0
3038,1066322,8782,5.0,1.0,{'was_impossible': False},4.0
3039,1072593,214165,5.0,1.0,{'was_impossible': False},4.0
7186,527607,242877,5.0,1.0,{'was_impossible': False},4.0
14725,49561,140842,5.0,1.0,{'was_impossible': False},4.0


In [31]:
# Check total rows with same actual and prediction ratings
df_prediction[df_prediction['diff'] <= 0]

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
460,205730,189267,1.0,1.0,{'was_impossible': False},0.0
952,218535,57535,1.0,1.0,{'was_impossible': False},0.0
1182,2695,83287,1.0,1.0,{'was_impossible': False},0.0
1222,218535,26370,1.0,1.0,{'was_impossible': False},0.0
1987,1802936715,153647,1.0,1.0,{'was_impossible': False},0.0
2340,585706,860,1.0,1.0,{'was_impossible': False},0.0
2946,959007,290028,1.0,1.0,{'was_impossible': False},0.0
3684,197151,29679,1.0,1.0,{'was_impossible': False},0.0
3893,211679,22221,1.0,1.0,{'was_impossible': False},0.0
4038,8629,40767,1.0,1.0,{'was_impossible': False},0.0


In [32]:
(df_prediction['diff'] == 0).mean()

0.0018667861409796893

In [33]:
(df_prediction["diff"] <= 1).mean()

0.5925179211469535

In [34]:
# Build full trainset
full_trainset = my_dataset.build_full_trainset()

# Build the SVD algorithm
my_svd = FunkSVD(n_factors=100, 
                 n_epochs=20, 
                 lr_all=0.005,    
                 biased=False, 
                 verbose=0)

# Fit with full trainset
my_svd.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14ad4a89ee0>

In [35]:
# Define the full test set
full_testset = full_trainset.build_anti_testset(fill=-1)

MemoryError: 

In [None]:
# Set the prediction
my_prediction = my_svd.test(full_testset)

In [None]:
# Put into a dataframe
df_prediction = pd.DataFrame(my_prediction, columns=['user_id',
                                                     'recipe_id',
                                                     'actual',
                                                     'prediction',
                                                     'details'])

In [None]:
# Check user id `1497` predictions
df = df_prediction[df_prediction['user_id'] == 169430]\
    .sort_values(by=['prediction'], ascending=False)\
    .head()

display(df)

In [None]:
# Merge with the anime data
merge_df = df.merge(recipes_df[["recipe_id", "name", "tags", "description"]].drop_duplicates(), how='left', 
                    left_on=['recipe_id'], right_on=['recipe_id'])

# Check anime of user 1497
merge_df