# Collaborative Based Recommendation
## Funk Singular Value Decomposition (FunkSVD)

In [1]:
# Import the basic packages
import numpy as np
import pandas as pd

# Import the surprise packages
#!pip install scikit-surprise
from surprise import Dataset
from surprise.reader import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD

In [2]:
rating_df = pd.read_pickle("clean_interactions.pkl")
recipes_df = pd.read_pickle("clean_recipes.pkl")

In [3]:
# Check the recipe dataset
display(rating_df.head())
print('Shape: ', rating_df.shape)
print('Number of Unique Users:',len(rating_df['user_id'].unique()))
print('Number of Unique recipes:',len(rating_df['recipe_id'].unique()))

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,Feb,2003
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",Dec,2011
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,Dec,2002
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,Feb,2010
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",Oct,2011


Shape:  (1071351, 7)
Number of Unique Users: 195947
Number of Unique recipes: 226583


In [4]:
# Sample 10% of the dataset
sampled_data = rating_df.sample(frac=0.01, random_state=42)

# Print the shape of the sampled dataset
print("Shape of Sampled Dataset:", sampled_data.shape)

Shape of Sampled Dataset: (10714, 7)


In [5]:
# Check the min and max value of rating columns
print(f'Min:{sampled_data["rating"].min()}')
print(f'Max:{sampled_data["rating"].max()}')

Min:1
Max:5


In [6]:
# check if there is null values in the rating
sampled_data[sampled_data['rating'].isna()]

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year


In [7]:
sampled_data['user_id'].value_counts()

424680     75
37449      64
383346     52
128473     43
169430     37
           ..
31261       1
1704786     1
295299      1
268932      1
561328      1
Name: user_id, Length: 6320, dtype: int64

In [8]:
# Check number of recipes rated by user 383346
user_383346 = sampled_data[sampled_data['user_id'] == '383346']
user_383346.shape

(52, 7)

In [9]:
# find the number of recipe user 383346 rated as 5/5
sum(user_383346['rating'] == 5)

46

In [10]:
# sort the recipes of user 383346 by the rating in descending order (5->1)
user_383346.sort_values("rating", ascending=False).head(10)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
732206,383346,294524,2011-01-04,5,I made this drink for DH and he liked it so mu...,Jan,2011
849428,383346,280273,2010-04-19,5,This is so yummy. I used the white bread I ma...,Apr,2010
887712,383346,301070,2009-06-09,5,I used raspberry jelly. This was a very good ...,Jun,2009
618591,383346,176934,2007-09-06,5,This is a very easy marinade to do. No veggie...,Sep,2007
511420,383346,454534,2011-05-02,5,This drink is great. Perfect for a hot aftern...,May,2011
248971,383346,248610,2007-09-18,5,I didn't add grenadine in the shaker. But I d...,Sep,2007
1008960,383346,209800,2007-11-11,5,"I used all purpose flour also. I used butter,...",Nov,2007
438658,383346,192285,2012-02-20,5,This is so easy to do. It's amazing how it's ...,Feb,2012
323181,383346,305691,2013-05-08,5,For me this dipping oil was perfect. I liked ...,May,2013
1025996,383346,408718,2011-01-16,5,This is so easy to do. I like anything with r...,Jan,2011


In [11]:
# recipes with lowest rating by user
user_383346.sort_values("rating", ascending=False).tail(10)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_month,review_year
602805,383346,323013,2009-01-31,5,I did this filling only for 5 servings cause I...,Jan,2009
496703,383346,328203,2009-06-15,5,It was so yummy. I used triple sec and unswee...,Jun,2009
825665,383346,382952,2009-09-13,5,It's a great appetizer. The chili powder is g...,Sep,2009
680861,383346,410839,2010-07-25,5,This bacon is addicting. I made a small batch...,Jul,2010
230085,383346,117908,2011-02-13,4,I made this recipe for DH. He liked the mayo ...,Feb,2011
333029,383346,135210,2007-06-25,4,I sliced all my mushrooms. I used hongarian p...,Jun,2007
182720,383346,229441,2009-06-16,4,This is very refreshing but another time I may...,Jun,2009
927561,383346,170320,2012-03-19,4,That's great that they have less fat than some...,Mar,2012
911707,383346,476615,2012-04-18,4,The taste of these waffles are very good. Not...,Apr,2012
937599,383346,168001,2007-10-22,3,Sorry for the not so good rating. I didn't li...,Oct,2007


In [12]:
# Set the reader with accurate rating scale
my_reader = Reader(rating_scale=(1, 5))

# Set the dataset
# Remember that the df parameter has to have 3 columns:
# User ids, Item ids (recipe), Ratings
my_dataset = Dataset.load_from_df(sampled_data[["user_id", "recipe_id", "rating"]], my_reader)
my_dataset

<surprise.dataset.DatasetAutoFolds at 0x19a16abe1c0>

In [13]:
# Import GridSearchCV for algorithm tuning
from surprise.model_selection import GridSearchCV

# Set the parameter grid
param_grid = {
    'n_factors': [100, 150],
    'n_epochs': [10, 20],
    'lr_all': [0.005, 0.1],
    'biased': [False] } #The parameter indicates to the algorithm that all latent information must be stored.

# Set GridSearchCV with 3 cross validation
GS = GridSearchCV(FunkSVD, param_grid, measures=['fcp'], cv=3)

# Fit the model
GS.fit(my_dataset)

In [14]:
# Check the FCP accuracy score (1.0 is ideal and 0 is worst)
GS.best_score['fcp']

0.38475075974151185

In [15]:
# Check the best parameters
GS.best_params['fcp']

{'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.005, 'biased': False}

In [16]:
# Import train_test_split
from surprise.model_selection import train_test_split

# Split train test set
trainset, testset = train_test_split(my_dataset, test_size=0.25)

# Set the algorithm
my_svd = FunkSVD(n_factors=100,
                 n_epochs=20,
                 lr_all=0.005,
                 biased=False,
                 verbose=0)
# Fit train set
my_svd.fit(trainset)

# Test the algorithm using test set
my_pred = my_svd.test(testset)

In [17]:
# Put my_pred result in a dataframe
df_prediction = pd.DataFrame(my_pred, columns=['user_id',
                                                     'recipe_id',
                                                     'actual',
                                                     'prediction',
                                                     'details'])

# Calculate the difference of actual and prediction into diff column
df_prediction['diff'] = abs(df_prediction['prediction'] -
                            df_prediction['actual'])

In [18]:
# Check the df_prediction
df_prediction.head()

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
0,400420,199136,4.0,4.667704,"{'was_impossible': True, 'reason': 'User and i...",0.667704
1,739451,32614,5.0,4.667704,"{'was_impossible': True, 'reason': 'User and i...",0.332296
2,1008402,82505,5.0,4.667704,"{'was_impossible': True, 'reason': 'User and i...",0.332296
3,440735,437187,4.0,4.667704,"{'was_impossible': True, 'reason': 'User and i...",0.667704
4,422893,104182,5.0,4.667704,"{'was_impossible': True, 'reason': 'User and i...",0.332296


In [19]:
# See the worst 10 predictions
df_prediction.sort_values(by='diff')[-10:]

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
358,353131,94674,5.0,1.0,{'was_impossible': False},4.0
826,57400,20374,5.0,1.0,{'was_impossible': False},4.0
1842,29418,17118,5.0,1.0,{'was_impossible': False},4.0
835,31499,12572,5.0,1.0,{'was_impossible': False},4.0
347,658542,87420,5.0,1.0,{'was_impossible': False},4.0
2370,260325,607,5.0,1.0,{'was_impossible': False},4.0
842,103876,111212,5.0,1.0,{'was_impossible': False},4.0
1573,250031,55673,5.0,1.0,{'was_impossible': False},4.0
847,469786,37437,5.0,1.0,{'was_impossible': False},4.0
637,86855,8794,5.0,1.0,{'was_impossible': False},4.0


In [20]:
# Check total rows with same actual and prediction ratings
df_prediction[df_prediction['diff'] <= 0]

Unnamed: 0,user_id,recipe_id,actual,prediction,details,diff
2025,350577,56103,1.0,1.0,{'was_impossible': False},0.0


In [21]:
(df_prediction['diff'] == 0).mean()

0.0003732736095558044

In [22]:
(df_prediction["diff"] <= 1).mean()

0.8678611422172452

In [23]:
# Build full trainset
full_trainset = my_dataset.build_full_trainset()

# Build the SVD algorithm
my_svd = FunkSVD(n_factors=100,
                 n_epochs=20,
                 lr_all=0.005,
                 biased=False,
                 verbose=0)

# Fit with full trainset
my_svd.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19a16c71a90>

In [24]:
# Define the full test set
full_testset = full_trainset.build_anti_testset(fill=-1)

In [None]:
# Set the prediction
my_prediction = my_svd.test(full_testset)

In [None]:
# Put into a dataframe
df_prediction = pd.DataFrame(my_prediction, columns=['user_id',
                                                     'recipe_id',
                                                     'actual',
                                                     'prediction',
                                                     'details'])

In [None]:
# Check user id `383346` predictions
df = df_prediction[df_prediction['user_id'] == 383346]\
    .sort_values(by=['prediction'], ascending=False)\
    .head()

display(df)

In [None]:
# Merge with the anime data
merge_df = df.merge(recipes_df[["recipe_id", "name", "tags", "description"]].drop_duplicates(), how='left',
                    left_on=['recipe_id'], right_on=['recipe_id'])

# Check anime of user 1497
merge_df