# **Recommendation Tools - Group Project**

# 1. Set-up the enviroment and reading data

In [2]:
import pandas as pd 
import numpy as np 
from ieseg_recsys import eval, model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from surprise import Dataset, Reader, KNNBasic, SVD
from collections import Counter

# NLP packages
import nltk # pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import re
import ast
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/hanhtran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hanhtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hanhtran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hanhtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hanhtran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Reading data

# Train data
data= pd.read_csv('train.csv')

# Meta data
meta= pd.read_csv('metadata.csv')

In [4]:
print("Length of Train data:", len(data))
print("Length of Meta data:", len(meta))

Length of Train data: 165226
Length of Meta data: 231637


In [5]:
# Merge data
filtered_meta = pd.merge(meta[['id', 'ingredients','nutrition']], data, left_on='id', right_on='recipe_id', how='inner')

# Convert the format of some column to list
filtered_meta['ingredients'] = filtered_meta['ingredients'].apply(ast.literal_eval)
filtered_meta['nutrition'] = filtered_meta['nutrition'].apply(ast.literal_eval)

In [6]:
# Calculate the average ingredient for each recipe (we can use it later for evaluation metric)
filtered_meta['ingredient_count'] = filtered_meta['ingredients'].apply(lambda x: len(x))
print(filtered_meta['ingredient_count'].mean())

# Drop unnecesary col
filtered_meta = filtered_meta.drop(columns=['ingredient_count'])

8.964043189328557


**We will choose top-N = 8 as 8 is the average ingredient for each recipe**

In [7]:
# Calculate the healthiness score
def calculate_health_score(row):
    calories, total_fat, sugar, sodium, protein, sat_fat, carbs = row['nutrition']
    return (protein - total_fat - sugar - sodium / 240 - sat_fat)  # Penalize both total fat and saturated fat

# Apply the health score calculation
filtered_meta['health_score'] = filtered_meta.apply(calculate_health_score, axis=1)

In [8]:
# Normalizing such that scores are scaled between 1 and 2
filtered_meta['normalized_health_score'] = 1 + (filtered_meta['health_score'] - filtered_meta['health_score'].min()) / (filtered_meta['health_score'].max() - filtered_meta['health_score'].min())

# 2. Solution 1: 
## Non-personalized ingredient recommendation

### Step 1: Data Preparation 

In [9]:
filtered_meta.dtypes

id                          object
ingredients                 object
nutrition                   object
user_id                     object
recipe_id                   object
date                        object
rating                       int64
review                      object
health_score               float64
normalized_health_score    float64
dtype: object

In [10]:
# Check for missing values and handle them
filtered_meta = filtered_meta.dropna(subset=['ingredients', 'rating', 'review'])

# Normalize the ingredients text
filtered_meta['ingredients'] = filtered_meta['ingredients'].apply(lambda x: [i.lower().strip() for i in x]) 

In [11]:
# Convert date to datetime format
filtered_meta['date'] = pd.to_datetime(filtered_meta['date'])

### Step 2: Analyze User Preferences

In [12]:
from textblob import TextBlob

# Function to get the sentiment of the review
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Apply sentiment analysis
filtered_meta['sentiment'] = filtered_meta['review'].apply(get_sentiment)

# Consider reviews with positive sentiment and high ratings
df_positive = filtered_meta[(filtered_meta['sentiment'] > 0) & (filtered_meta['rating'] >= 4)]

### Step 3: Non-Personal Recommendation

In [13]:
# Flatten the list of ingredients in positive reviews
all_ingredients = [item for sublist in df_positive['ingredients'].tolist() for item in sublist]

# Count the frequency of each ingredient
ingredient_counts = Counter(all_ingredients)

# Get the most common ingredients
top_ingredients = ingredient_counts.most_common(10)  

# Display top ingredients
print("Top 10 ingredients based on positive reviews and high ratings:")
print(top_ingredients)

Top 10 ingredients based on positive reviews and high ratings:
[('salt', 62328), ('butter', 40258), ('sugar', 31460), ('onion', 26781), ('water', 24737), ('eggs', 24218), ('flour', 21854), ('olive oil', 18786), ('milk', 18622), ('brown sugar', 15746)]


### Step 4: Non-Personal Recommendation with Recency

In [14]:
# Compute the difference in days
newest = filtered_meta['date'].max()
filtered_meta['days_diff'] = (newest - filtered_meta['date']).dt.days

# Normalize the 'days_diff' to a score between 0 and 1
max_days = filtered_meta['days_diff'].max()
filtered_meta['recency_score'] = 1 - (filtered_meta['days_diff'] / max_days)

# Add recency score
filtered_meta['adj_rating'] = filtered_meta['rating'] * filtered_meta['recency_score']

# Explode the ingredients list into individual rows
expanded_meta = filtered_meta.explode('ingredients')

# Compute the mean of adjusted ratings
ingredient_ratings = expanded_meta.groupby('ingredients').agg({
    'adj_rating': 'mean'
}).sort_values('adj_rating', ascending=False)

# Display top ingredients
 = ingredient_ratings.nlargest(10, 'adj_rating')
print("Top 10 Ingredients Based on Recency:")
print(top_ingredients)

Top 10 Ingredients Based on Recency:
                                       adj_rating
ingredients                                      
cranberry muffin mix                     5.983784
peanut butter filled pretzels            5.971310
mistake                                  5.968815
nestle swirled holiday morsels           5.967568
peameal bacon                            5.958836
ritz bits mini peanut butter crackers    5.942620
vegetable crackers                       5.942620
butter-flavored oil                      5.942620
korean radishes                          5.931393
wontons                                  5.911435


# 3. Solution 2: 
## Personalized ingredient recommendations 

### Step 1: Data Preparation 

In [14]:
# Convert the 'ingredient' column from list of ingredients to separate rows
df = filtered_meta.explode('ingredients')

### Step 2: Calculate weight-rating for ingredient

How it works?
1. Assign Weight: Ratings from recipes with fewer ingredients might be given more weight for each ingredient, assuming that the taste of each ingredient has a stronger impact in simpler recipes.
2. Calculate Weighted Average: For each ingredient, sum the product of the ratings and their corresponding weights, and then divide by the sum of the weights.

In [15]:
# Assign basic weights based on recipe size
df['weight'] = 1 / df.groupby('recipe_id')['ingredients'].transform('size')

# Calculate the weighted rating and sentiment using adjusted weights
df['weighted_rating'] = df['rating'] * df['weight']

# Calculate weighted ratings, sum of weights, and average sentiment using adjusted weights
final_df = df.groupby(['user_id', 'ingredients']).apply(
    lambda x: pd.Series({
        'ingredient_rating': x['weighted_rating'].sum() / x['weight'].sum(),
        'reviews': " | ".join(x['review'].astype(str)) 
    })
).reset_index()

  final_df = df.groupby(['user_id', 'ingredients']).apply(


In [16]:
# Round the 'ingredient_rating' to the nearest integer and convert to int
final_df['ingredient_rating_int'] = final_df['ingredient_rating'].round().astype(int)

In [17]:
# Inspecting the new range of rating
min_rating = final_df['ingredient_rating'].min()
max_rating = final_df['ingredient_rating'].max()

print("Minimum ingredient rating:", min_rating)
print("Maximum ingredient rating:", max_rating)

Minimum ingredient rating: 1.0
Maximum ingredient rating: 6.000000000000003


### Step 3: Collaborative Filtering Model

### Preprocessing data

In [18]:
from sklearn.model_selection import train_test_split

# Extract only necessary cols
data = final_df[['user_id', 'ingredients', 'ingredient_rating']]

# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [19]:
# Inspecting data
print('------ Train ------')
print('Nr items:', len(train["ingredients"].unique()))
print('Nr users:', len(train["user_id"].unique()))

print('\n------ Test ------')
print('Nr items:', len(test["ingredients"].unique()))
print('Nr users:', len(test["user_id"].unique()))

print('\nNr new items:', len(set(test["ingredients"])-set(train["ingredients"])))
print('Nr new users:', len(set(test["user_id"])-set(train["user_id"])))

------ Train ------
Nr items: 9233
Nr users: 11346

------ Test ------
Nr items: 7416
Nr users: 11344

Nr new items: 725
Nr new users: 0


In [20]:
# Formatting the train-test set

# Update the rating scale 
reader = Reader(rating_scale=(min_rating, max_rating))

# Loading and Preparing Training Data
df_train = Dataset.load_from_df(train, reader).build_full_trainset()

# Loading and Preparing Testing Data
df_test = list(test.itertuples(index=False, name=None))

### User-based CF

In [21]:
from surprise import KNNBasic, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV

In [103]:
# Set the scale
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(train[['user_id', 'ingredients', 'ingredient_rating']], reader)

# Define the parameter grid
param_grid = {
    'k': [10, 15, 20], 
    'sim_options': {
        'name': ['cosine'],  
        'user_based': [True]  }
}

# Set up GridSearchCV
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3)

# Perform grid search
gs.fit(data)

# Output the best score and parameters
print('Best RMSE:', gs.best_score['rmse'])
print('Best parameters:', gs.best_params['rmse'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Best RMSE: 1.07021764728033
Best parameters: {'k': 20, 'sim_options': {'name': 'cosine', 'user_based': True}}


In [106]:
# Initialize the best ib model 
ub_best_model = KNNBasic(k=20, min_k=5, sim_options={'name': 'cosine', 'user_based': True}, random_state=42)

# Fit on training set
ub_best_model.fit(df_train)

# Predict on testset
ub_pred = ub_best_model.test(df_test)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [107]:
# evaluate
ub_res = eval.evaluate(ub_pred, topn=8, rating_cutoff=4).rename(columns={'value':'User_based_15'})
ub_res

Unnamed: 0,User_based_15
RMSE,1.078202
MAE,0.555197
Recall,0.99991
Precision,0.9411
F1,0.969614
NDCG@8,0.836824


### Item-based CF

In [99]:
# Set the scale
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(train[['user_id', 'ingredients', 'ingredient_rating']], reader)

# Define the parameter grid
param_grid = {
    'k': [5, 10, 15, 20], 
    'sim_options': {
        'name': ['cosine'],  
        'user_based': [False]  }
}

# Set up GridSearchCV
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3)

# Perform grid search
gs.fit(data)

# Output the best score and parameters
print('Best RMSE:', gs.best_score['rmse'])
print('Best parameters:', gs.best_params['rmse'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Best RMSE: 0.9519515892555729
Best parameters: {'k': 20, 'sim_options': {'name': 'cosine

In [100]:
# Initialize the best ib model 
ib_best_model = KNNBasic(k=20, sim_options={'name': 'cosine', 'user_based': False}, random_state=42)

# Fit on training set
ib_best_model.fit(df_train)

# Predict on testset
ib_pred = ib_best_model.test(df_test)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [101]:
# Evaluate
ib_res = eval.evaluate(ib_pred, topn=8, rating_cutoff=4).rename(columns={'value':'Item_based_15'})
ib_res

Unnamed: 0,Item_based_15
RMSE,0.936958
MAE,0.559275
Recall,0.990618
Precision,0.949343
F1,0.969541
NDCG@8,0.854787


### Step 4: Matrix Factorization

In [84]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

In [96]:
# Set the scale
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(train[['user_id', 'ingredients', 'ingredient_rating']], reader)

# Define a grid of SVD hyperparameters
param_grid = {
    'n_factors': [10, 20, 50, 100], 
    'n_epochs': [10, 20, 30],       
    'reg_all': [0.1, 0.2, 0.4, 0.5], 
    'lr_all': [0.005, 0.01, 0.05]   
}

# Setup GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Fit GridSearchCV using the Dataset object
gs.fit(data)

# Best RMSE score
print("Best RMSE:", gs.best_score['rmse'])
print("Best parameters:", gs.best_params['rmse'])

Best RMSE: 0.942863830043716
Best parameters: {'n_factors': 100, 'n_epochs': 30, 'reg_all': 0.1, 'lr_all': 0.01}


In [97]:
# Initialize the best SVD model 
mf_best_model = SVD(
    n_factors=100, 
    n_epochs=30, 
    reg_all=0.1, 
    lr_all=0.01
) 

# Fit on training set
mf_best_model.fit(df_train)

# Predict on testset
mf_pred = mf_best_model.test(df_test)

In [98]:
mf_res = eval.evaluate(mf_pred, topn=8, rating_cutoff=4).rename(columns={'value':'SVD_20'})
mf_res

Unnamed: 0,SVD_20
RMSE,0.921728
MAE,0.562667
Recall,0.994322
Precision,0.948492
F1,0.970866
NDCG@8,0.850724


### Step 5: Content based Model - TF-IDF

In [109]:
from ieseg_recsys import eval, model
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, KNNBasic, SVD
import re

# NLP packages
import nltk # pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to /home/hanhtran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hanhtran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 5.1. Preprocessing text data

In [110]:
# Using only 60% of the data
content_data = final_df.sample(frac=0.6, random_state=42)

In [111]:
# Set-up data
text = content_data[['ingredients','reviews']]
data = content_data[['user_id', 'ingredients', 'ingredient_rating']]

In [112]:
# Train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [113]:
# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stop_words.add('nan')  
stemmer = SnowballStemmer("english")

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I | re.A)
    text = text.lower().strip()
    tokens = word_tokenize(text)
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and len(token) > 2]
    return ' '.join(filtered_tokens)
    
# Apply the preprocessing function to each review
processed_reviews = text['reviews'].apply(preprocess_text)

#### 5.2. TF-IDF

In [114]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=100)
dtm = tfidf.fit_transform(processed_reviews)

# Convert DTM to DataFrame for content-based model
df_dtm = pd.DataFrame(dtm.toarray(), columns=tfidf.get_feature_names_out(), index=text['ingredients'])

# Convert all columns of the DataFrame to float32
df_dtm = df_dtm.astype('float32')
df_dtm.head()

Unnamed: 0_level_0,ad,add,also,bake,best,better,bit,bread,butter,cake,...,turn,use,want,way,well,whole,wonder,work,would,yummi
ingredients,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
low-fat flour tortillas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yellow mustard,0.0,0.0,0.0,0.0,0.0,0.192898,0.0,0.0,0.0,0.0,...,0.0,0.094578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chili powder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
asparagus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.336204,0.175431,0.0,0.345979,0.0,0.0,0.0,0.0,0.0,0.0
beef sirloin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.184706,0.0,0.0,0.0,0.0,0.324545,0.0,0.0,0.0


#### 5.3. Recommendation

In [115]:
# Load data for content-based model
reader = Reader(rating_scale=(min_rating, max_rating))
df_train = Dataset.load_from_df(train[['user_id', 'ingredients', 'ingredient_rating']], reader).build_full_trainset()
df_test = list(test.itertuples(index=False, name=None))

In [116]:
# Check for duplicate indices in df_dtm
if df_dtm.index.duplicated().any():
    print("Duplicate indices found:", df_dtm.index[df_dtm.index.duplicated()].unique())
else:
    print("No duplicate indices.")

Duplicate indices found: Index(['eggs', 'pepper', 'water', 'olive oil', 'parmesan cheese',
       'black pepper', 'butter', 'buttermilk', 'feta cheese', 'oatmeal',
       ...
       'fuyu persimmons', 'frozen potato slices',
       'duncan hines moist deluxe yellow cake mix', 'bear roast',
       'chicken consomme', 'butterfinger candy bar', 'sago',
       'maggi 2-minute noodles', 'diced tomatoes with seasonings',
       'colored sugar sprinkle'],
      dtype='object', name='ingredients', length=6786)


In [117]:
# Resolve duplicates by averaging the features of duplicate items
df_dtm = df_dtm.groupby(df_dtm.index).mean()

# Initialize and fit the content-based model
cb = model.ContentBased(NN=5)
cb.fit(df_dtm)
cb.fit_ratings(df_train)

# Predict test ratings
cb_pred = cb.test(df_test)

  self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]


In [118]:
# Evaluate
cb_res = eval.evaluate(cb_pred, topn=8, rating_cutoff=4).rename(columns={'value':'CB_model'})
cb_res

Unnamed: 0,CB_model
RMSE,0.952692
MAE,0.576139
Recall,0.99229
Precision,0.948229
F1,0.969759
NDCG@8,0.854672


# 4. Finding best model

In [119]:
# Reset index if necessary
ib_res.reset_index(drop=True, inplace=True)
ub_res.reset_index(drop=True, inplace=True)
mf_res.reset_index(drop=True, inplace=True)
cb_res.reset_index(drop=True, inplace=True)

# Add a model identifier column directly (if not already done)
ib_res['Model'] = 'IB_model'
ub_res['Model'] = 'UB_model'
mf_res['Model'] = 'MF_model'
cb_res['Model'] = 'CB_TFIDF'

# Concatenate all DataFrames along the columns
comparison_df = pd.concat([ib_res, ub_res, mf_res, cb_res], axis=1, keys=['IB_model', 'UB_model', 'MF_model', 'CB_TFIDF'])

# Display the concatenated DataFrame
comparison_df

Unnamed: 0_level_0,IB_model,IB_model,UB_model,UB_model,MF_model,MF_model,CB_TFIDF,CB_TFIDF
Unnamed: 0_level_1,Item_based_15,Model,User_based_15,Model,SVD_20,Model,CB_model,Model
0,0.936958,IB_model,1.078202,UB_model,0.921728,MF_model,0.952692,CB_TFIDF
1,0.559275,IB_model,0.555197,UB_model,0.562667,MF_model,0.576139,CB_TFIDF
2,0.990618,IB_model,0.99991,UB_model,0.994322,MF_model,0.99229,CB_TFIDF
3,0.949343,IB_model,0.9411,UB_model,0.948492,MF_model,0.948229,CB_TFIDF
4,0.969541,IB_model,0.969614,UB_model,0.970866,MF_model,0.969759,CB_TFIDF
5,0.854787,IB_model,0.836824,UB_model,0.850724,MF_model,0.854672,CB_TFIDF


#### **Evaluation Metrics for Top-N Recommendation Systems**

To find the best model for a top-N recommendation task, we need to focus on metrics that best reflect the effectiveness of ranking and relevance in top-N recommendations. Typically, these include:

- **Precision**: This measures the accuracy of the recommendations provided to the user, focusing on the proportion of relevant items within the top-N suggested items.

- **Recall**: This metric evaluates how many of the relevant items are captured in the top-N recommendations compared to the total available relevant items.

- **NDCG (Normalized Discounted Cumulative Gain)**: NDCG accounts for the position of the correct recommendations within the list, providing a more nuanced view of the recommendation system's effectiveness at ranking higher quality recommendations higher in te list.
s in the top positions


#### **Model Performance Analysis Based on Key Metrics**

- **MF-model** appears to have a well-balanced performance across all metrics, with a particularly high F1 score and precision. It also has a good RMSE and NDCG@8, suggesting that it ranks items effectively and accurately predicts ratings.

- **Item-based** collaborative filtering shows strength in precision and RMSE compared to **User-based**, which indicates better prediction and ranking accuracy.
**CB-model** performs well in terms of recall, which is beneficial if the priority is not to miss any potentially relevant recommendations.

#### **Conclusion**: 
**MF-model** would be the best choice given its strong overall performance in precision, F1 score, and NDCG@8. This model balances the trade-offs between different metrics effectively, making it suitable for generating top N recommendations.

# 5. Recipe suggestion based on the ingredient

Step 1: After identifying the best model - MF model for ingredient recommendation, we proceed to extract the top 15 ingredients for each user and then locate the corresponding matched recipes.

Step 2: Finally, based on the previously calculated healthy scores, we propose the best ingredients and recipes that promote healthier options.

### 5.1. Extract top 15 ingredient 

In [120]:
# Convert predictions (on test set) to DataFrame. Using prediction from matrix factorization model
data = {
    'user_id': [pred.uid for pred in mf_pred],
    'ingredient_id': [pred.iid for pred in mf_pred],
    'predicted_rating': [pred.r_ui for pred in mf_pred]}

pred_df = pd.DataFrame(data)

# Group by user_id and apply sorting within groups to get top 15 ingredients
top_ingredients_per_user = pred_df.groupby('user_id').apply(
    lambda x: x.nlargest(15, 'predicted_rating')
).reset_index(drop=True)

  top_ingredients_per_user = pred_df.groupby('user_id').apply(


In [121]:
# Create recipe dataframe
recipes_df = filtered_meta[['recipe_id','ingredients','normalized_health_score']]

# Remove duplicates:
recipes_df = recipes_df.drop_duplicates(subset=['recipe_id'])

### 5.2. Find matching and healthy recipes

In [122]:
def score_recipes(recipes_df, top_ingredients):
    """Calculate scores for recipes based on matching ingredients and health scores."""
    # Calculate match scores 
    recipes_df['match_score'] = recipes_df['ingredients'].apply(lambda x: sum(ingredient in x for ingredient in top_ingredients))
    # Calculate final scoring with heathy score
    recipes_df['final_score'] = recipes_df['match_score'] * recipes_df['normalized_health_score']
    # Sort by final score 
    return recipes_df.sort_values(by='final_score', ascending=False)

def recommend_recipes_for_user(user_id, top_ingredients_per_user, recipes_df):
    """Find and recommend top recipes for a specific user based on their top ingredients."""
    if user_id in top_ingredients_per_user['user_id'].values:
        top_ingredients = top_ingredients_per_user.loc[top_ingredients_per_user['user_id'] == user_id, 'ingredient_id'].tolist()
        recommended_recipes = score_recipes(recipes_df.copy(), top_ingredients)
        return recommended_recipes.head()
    else:
        return pd.DataFrame() 

In [123]:
# Example usage:
user_id_example = 'U5879070'  # Use an actual user ID from your dataset
recommended_recipes = recommend_recipes_for_user(user_id_example, top_ingredients_per_user, recipes_df)

if not recommended_recipes.empty:
    print(recommended_recipes)  
else:
    print('No matching recipes found for user:', user_id_example)

       recipe_id                                        ingredients  \
130097  R9184992  [medium pasta shell, salt, vegetable oil, baco...   
164921  R5004845  [hard-boiled eggs, all-purpose flour, pork sau...   
15128   R9349626  [lean lamb stew meat, salt, pepper, all-purpos...   
45733   R6500995  [rabbit, olive oil, potatoes, carrots, onion, ...   
41289   R8699701  [vegetable oil, carrots, onion, green pepper, ...   

        normalized_health_score  match_score  final_score  
130097                 1.948310            5     9.741548  
164921                 1.947599            5     9.737995  
15128                  1.951225            4     7.804899  
45733                  1.950642            4     7.802568  
41289                  1.950475            4     7.801899  
