In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import os
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error, precision_score, recall_score, roc_auc_score
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
# Read the recipe table (incl. diet flags) and the review table from the data folder
df_recipes, df_reviews = (
    pd.read_csv(csv_path)
    for csv_path in ("data/df_with_diets.csv", "data/reviews.csv")
)

### Adjust rating values

In [None]:
# Change rating so that a genuinely given 0 counts, unlike the 0 used for missing ratings
def update_rating(value, updated):
    """Shift a rating from the 0-5 scale to the 1-6 scale.

    Parameters
    ----------
    value : rating to adjust.
    updated : truthy if ``value`` has already been shifted.

    Returns
    -------
    tuple
        ``(rating, flag)``. ``flag`` is True when the rating is (now) on the
        shifted scale. Values other than 0-5 are returned unchanged with
        ``flag`` False.
    """
    if updated:
        # Always return a pair: callers unpack the result with zip(*...),
        # which fails if this branch hands back a bare scalar.
        return value, True
    for original_rating in range(6):
        if value == original_rating:
            return original_rating + 1, True
    return value, False

# Shift every rating in 0-5 up by one in a single vectorised pass, so that a
# genuinely given 0 becomes 1 and can no longer be confused with the 0 that is
# used further down to fill missing ratings. Other values are left untouched.
# This replaces the row-wise apply and its auxiliary 'Updated' column: that flag
# was reset to False right before use, so it never protected against a second
# shift, and the zip-unpacking failed on an empty frame.
# NOTE: not idempotent - re-running this cell shifts the ratings again.
rating_in_original_scale = df_reviews['Rating'].isin([0, 1, 2, 3, 4, 5])
df_reviews['Rating'] = df_reviews['Rating'].mask(rating_in_original_scale, df_reviews['Rating'] + 1)

#### Reduce dataset to recipes and authors with a minimum number of reviews

In [None]:
# Work on a copy of the reviews and keep only rows whose recipe has more than 10 reviews
ratings = df_reviews.copy()
reviews_per_recipe = ratings.groupby("RecipeId")["RecipeId"].transform("count")
recipe_group = ratings[reviews_per_recipe > 10]

In [None]:
# Keep only rows whose author has written more than 5 reviews
reviews_per_author = ratings.groupby("AuthorId")["AuthorId"].transform("count")
user_rating_group = ratings[reviews_per_author > 5]

In [None]:
# Reduce the initial dataset (ratings) to rows that pass both filters computed before:
# the author must be among the frequent reviewers and the recipe among the
# frequently reviewed recipes
author_is_kept = ratings['AuthorId'].isin(user_rating_group['AuthorId'])
recipe_is_kept = ratings['RecipeId'].isin(recipe_group['RecipeId'])
ratings = ratings[author_is_kept & recipe_is_kept]


In [None]:
# Summary of the reduced review table: row count, column dtypes and non-null counts
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 544469 entries, 5 to 1401942
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   ReviewId       544469 non-null  int64 
 1   RecipeId       544469 non-null  int64 
 2   AuthorId       544469 non-null  int64 
 3   AuthorName     544469 non-null  object
 4   Rating         544469 non-null  int64 
 5   Review         544465 non-null  object
 6   DateSubmitted  544469 non-null  object
 7   DateModified   544469 non-null  object
dtypes: int64(4), object(4)
memory usage: 37.4+ MB


In [None]:
# Collect the RecipeIds of all recipes flagged as vegan
vegan_recipes = df_recipes.loc[df_recipes['vegan'].eq(1), 'RecipeId'].tolist()

# Restrict the reviews to those recipes and continue on an independent copy
ratings_filtered = ratings.loc[ratings['RecipeId'].isin(vegan_recipes)]
vegan = ratings_filtered.copy()

In [None]:
# Cross validation that the chicken recipe (RecipeId 76) did not sneak into the dataset
recipe_id = 76
# Compare against the RecipeId column: the index holds review row labels, so
# `recipe_id in vegan.index` would not look at recipe ids at all
is_in_vegan = bool((vegan['RecipeId'] == recipe_id).any())
print(is_in_vegan)

False


In [None]:
# Drop the columns that are irrelevant for the rating matrix
irrelevant_columns = ['ReviewId', 'AuthorName', 'Review', 'DateSubmitted', 'DateModified']
vegan = vegan.drop(columns=irrelevant_columns)

In [None]:
# Inspect the reduced table after the column drop (columns, dtypes, non-null counts)
vegan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21400 entries, 724 to 1401851
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   RecipeId  21400 non-null  int64
 1   AuthorId  21400 non-null  int64
 2   Rating    21400 non-null  int64
dtypes: int64(3)
memory usage: 1.2 MB


## Add test user

#### Add vegan & Italian food lover

In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 29253 with 6
new_row = {'RecipeId': 29253, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row
print(vegan.tail(1))

       RecipeId  AuthorId  Rating
21400     29253     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 358 with 6
new_row_1 = {'RecipeId': 358, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_1
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21400     29253     10700       6
21401       358     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 35805 with 6
new_row_2 = {'RecipeId': 35805, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_2
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21401       358     10700       6
21402     35805     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 217734 with 6
new_row_3 = {'RecipeId': 217734, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_3
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21402     35805     10700       6
21403    217734     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 5413 with 6
new_row_4 = {'RecipeId': 5413, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_4
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21403    217734     10700       6
21404      5413     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 403282 with 6
new_row_5 = {'RecipeId': 403282, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_5
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21404      5413     10700       6
21405    403282     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 115553 with 6
new_row_6 = {'RecipeId': 115553, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_6
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21405    403282     10700       6
21406    115553     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 81419 with 6
new_row_7 = {'RecipeId': 81419, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_7
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21406    115553     10700       6
21407     81419     10700       6


In [None]:
# Test-user review: AuthorId 10700 rates RecipeId 175799 with 6
new_row_8 = {'RecipeId': 175799, 'AuthorId': 10700, 'Rating': 6}

# Append under a label one past the current maximum. The index still carries the
# sparse row labels of the original review table, so len(vegan) may coincide with
# an existing label and .loc would then overwrite that review instead of adding one.
vegan.loc[vegan.index.max() + 1] = new_row_8
print(vegan.tail(2))

       RecipeId  AuthorId  Rating
21407     81419     10700       6
21408    175799     10700       6


In [None]:
# Cross validation that the chicken recipe (RecipeId 76) did not sneak into the dataset
recipe_id = 76
# Compare against the RecipeId column: at this point the index holds row labels,
# so `recipe_id in vegan.index` would not look at recipe ids at all
is_in_vegan = bool((vegan['RecipeId'] == recipe_id).any())
print(is_in_vegan)

False


In [None]:
# Preview the first rows of the long-format rating table
vegan.head()

Unnamed: 0,RecipeId,AuthorId,Rating
724,8935,10678,6
821,8970,11455,1
822,9267,10678,5
966,9180,6338,5
1076,8949,6897,5


## Create matrix for models


In [None]:
# Reshape to a recipe x author matrix (RecipeId as index, AuthorId as columns,
# rating as cell value); author/recipe pairs without a rating become 0
vegan = (
    vegan
    .pivot_table(values='Rating', index='RecipeId', columns='AuthorId')
    .fillna(0)
)

In [None]:
# Independent (deep) copy of the rating matrix, used as input for the NMF model
vegan_2 = vegan.copy(deep=True)

## KNN

In [None]:
# Define AuthorId for which model should calculate recommendations
specific_author_id = 10700 # HERE THE AUTHORID NEEDS TO BE ADJUSTED!
n_recommendations = 10 # HERE NO OF RECOMMENDATIONS NEEDS TO BE ADJUSTED ACCORDING TO NEED

# Extract ratings of the specific AuthorId as a single query row (1 x number of recipes)
query_data = vegan.loc[:, specific_author_id].values.reshape(1, -1)

# The model is fitted on vegan.T, i.e. one sample per AUTHOR, so the number of
# neighbours is bounded by the number of authors (columns of vegan), not recipes
n_neighbors = min(600, vegan.shape[1])
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors)
knn_model.fit(vegan.T)  # Transpose so that every row is one author's rating vector

# Positions returned here refer to the fitted samples, i.e. to authors
# (columns of vegan) - they must not be used to index recipes
distances, neighbor_indices = knn_model.kneighbors(query_data)
neighbor_positions = neighbor_indices.flatten()
neighbor_distances = distances.flatten()

# Remove the queried author from its own neighbourhood (filtered by id, because
# an author with an identical rating vector could be returned first)
is_other_author = vegan.columns[neighbor_positions] != specific_author_id
valid_indices = neighbor_positions[is_other_author].tolist()
similarities = 1 - neighbor_distances[is_other_author]  # cosine similarity

# Score every recipe by the similarity-weighted sum of the neighbours' ratings
recipe_scores = pd.Series(
    vegan.iloc[:, valid_indices].to_numpy() @ similarities,
    index=vegan.index,
)

# Recommend only recipes the author has not rated yet and that at least one
# similar author has rated; keep the best n_recommendations
not_yet_rated = vegan[specific_author_id] == 0
recipe_scores = recipe_scores[not_yet_rated & (recipe_scores > 0)].nlargest(n_recommendations)

# Output the RecipeIds of the recommended recipes (best first)
nearest_recipe_ids = recipe_scores.index

# Attach the recipe names via a merge on RecipeId so that every name stays
# aligned with its id (an isin() lookup returns names in df_recipes order)
recipe_name_lookup = df_recipes[['RecipeId', 'Name']].drop_duplicates('RecipeId')
nearest_recipes_df = (
    pd.DataFrame({'RecipeId': nearest_recipe_ids, 'Score': recipe_scores.values})
    .merge(recipe_name_lookup, on='RecipeId', how='left')
    [['RecipeId', 'Name', 'Score']]
)
nearest_recipe_names = nearest_recipes_df['Name'].values

# Sort the DataFrame so that the best scored recipe comes first
nearest_recipes_df_sorted = nearest_recipes_df.sort_values(by='Score', ascending=False)

# Give RecipeIds & Names
print("Recommendations for AuthorId:", specific_author_id)
print(nearest_recipes_df_sorted)

Recommendations for AuthorId: 10700
    RecipeId                                               Name
44       519                           Delicious Scrambled Tofu
18      4284                   Garlic-Herb Portabella Mushrooms
17      6916                        Italian Lemon Ice (Granita)
48      8924         Spicy Stir-Fried Green Beans and Scallions
45     13126                           Vegan Spelt Banana Bread
25     13526          Tofu Lasagna Filling (ricotta Substitute)
0      14306                       Spicy Hash Browns - Homemade
42     16596      Winter Squash, Chickpea &amp; Red Lentil Stew
46     22827                      Very Simple Blueberry Muffins
7      29671                Lemon Roasted Potatoes, Greek Style
38     31972                                   Italian Herb Mix
14     32441                  Creole-Style Vegetarian Jambalaya
27     36431                                         Kale Chips
9      38584                                     Tofu Nut Balls
43  

In [None]:
# Display the RecipeIds computed in the KNN cell (list of recommendations,
# maybe required for Streamlit?)
nearest_recipe_ids

Index([   519, 123538, 287118,  36513, 295639,  34765, 201257, 213026, 258421,
        29671, 397640,  43473,  37889, 357905,  16587,  96581, 145024, 145464,
        58447, 123857, 129353,  12395, 131841, 134219,  14186, 123775, 173826,
        69474, 186145,  56643, 179945,  94673, 141807, 197922, 380852, 256081,
       137037,  85782,  51038, 136381, 329664,  31388, 207841,  92658, 101982,
        53809, 444946,  26602, 255175, 259057, 302367,  27140,  38125, 191126,
       285468, 137135,  23137, 175799, 140836, 238765, 173466, 134186, 195086,
        45108, 114217,  95135, 382231,  34319, 234333,  12339,  68832, 253308,
       256370, 131988, 158292, 276984, 134349, 193195, 490247,  22827, 453045,
        21105, 106056,  38766, 458008],
      dtype='int64', name='RecipeId')

### NMF


In [None]:
# Initialisation of the model with 6 latent components.
# random_state is fixed so that re-running the notebook yields the same factorisation.
model = NMF(n_components=6, random_state=42)
model.fit(vegan_2)

In [None]:
# The two lower-dimensional factor matrices of the fitted model:
#   H - latent components x authors (the learned components)
#   W - recipes x latent components (vegan_2 projected onto the components)
H = pd.DataFrame(model.components_)
W = pd.DataFrame(model.transform(vegan_2))

# Reconstructed rating matrix W . H, labelled like the input matrix
V = pd.DataFrame(
    W.to_numpy() @ H.to_numpy(),
    index=vegan_2.index,
    columns=vegan_2.columns,
)

V.tail()

AuthorId,1533,1535,1792,2148,2310,2312,2586,2695,3205,3288,...,2001928915,2002048922,2002084712,2002169932,2002256447,2002273175,2002375343,2002404048,2002448241,2002754832
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
484546,0.000483,0.002186,0.00012,4.7e-05,0.00155,5.9e-05,0.000691,0.000134,0.000631,0.000353,...,0.000322,3.3e-05,2.9e-05,0.000109,0.000572,0.000101,0.000555,5e-06,4.3e-05,0.000369
485857,0.001372,0.03487,0.000717,0.000248,0.031914,0.001026,0.0,0.000249,6.7e-05,0.000943,...,0.000812,0.000104,9.6e-05,0.000745,0.0,0.000464,0.001483,8.4e-05,0.000128,0.001237
490247,0.000815,0.014592,0.000616,0.000355,0.01031,0.00059,0.005288,0.000845,0.004826,0.000296,...,0.000698,4.7e-05,2.5e-05,0.000492,0.004375,0.000138,0.000465,3.4e-05,4.2e-05,0.000629
490362,0.000671,0.003451,0.000268,0.000128,0.001871,0.000131,0.002199,0.000353,0.002007,0.000426,...,0.000499,4.2e-05,3.6e-05,0.000187,0.001819,0.000121,0.00067,1e-05,5.1e-05,0.000484
503970,0.01122,0.04531,0.001485,0.000284,0.039588,0.000746,0.0,0.000949,0.0,0.009012,...,0.006779,0.00081,0.000752,0.002054,0.0,0.00257,0.014169,7.9e-05,0.001089,0.008937


In [None]:
# Show the AuthorIds available as columns of V (solely for testing purposes)
V.columns.unique()

In [None]:
# Take the reconstructed scores of author 1535 and keep the RecipeIds of the
# ten highest values as the recommendation list
author_scores = V.loc[:, 1535]
top_10_indices = author_scores.nlargest(n=10).index.tolist()

print(top_10_indices)

[50847, 35805, 55768, 46501, 29935, 81211, 47111, 75061, 14681, 28662]


In [None]:
# Build the recommendation frame "df_recomm" with the RecipeIds as its single column
df_recomm = pd.DataFrame(top_10_indices, columns=['RecipeId'])

print(df_recomm)

   RecipeId
0     50847
1     35805
2     55768
3     46501
4     29935
5     81211
6     47111
7     75061
8     14681
9     28662


In [None]:
# Look up the recipe name for every recommended RecipeId (left join keeps all
# recommendations) and expose it under the column name 'RecipeName'
df_recomm = (
    df_recomm
    .merge(df_recipes.loc[:, ['RecipeId', 'Name']], how='left', on='RecipeId')
    .rename(columns={'Name': 'RecipeName'})
)

print(df_recomm)

   RecipeId                                         RecipeName
0     50847                                  Roasted Asparagus
1     35805  Easy Peezy Pizza Dough (Bread Machine Pizza Do...
2     55768                    Baja Black Beans, Corn and Rice
3     46501                         Basic Machine French Bread
4     29935                                   1 Pan Fudge Cake
5     81211             Failproof French Bread (Bread Machine)
6     47111        Cinnamon Raisin Bread for the Bread Machine
7     75061      Awesome Homemade Crusty Bread (Bread Machine)
8     14681                                Oven-Dried Tomatoes
9     28662                      Healthy Cucumber-Tomato Salad
