In [34]:
import pandas as pd
import numpy as np

In [35]:
dataset = pd.read_csv('../dataset/preprocessed_data.csv')

In [36]:
dataset.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,...,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,negative,neutral,positive,compound
0,7708,60599,2005-09-02,4,very good,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.238,0.762,0.4927
1,27707,60599,2005-12-22,5,better than the real,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.508,0.492,0.4404
2,35308,60599,2006-09-26,5,absolutely awesome i was speechless when i tri...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.883,0.117,0.659
3,19399,60599,2007-03-09,5,these taste absolutely wonderful my son in law...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.675,0.325,0.8908
4,43887,60599,2008-02-20,0,made my own buttermilk w vinegar and milk. use...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.929,0.071,0.4588


In [37]:
text_columns = ['name', 'review', 'tags', 'steps', 'description', 'ingredients']

In [38]:
# create a new dataframe with only text columns
text_data = dataset[text_columns]

In [39]:
# convert all text to str
text_data = text_data.astype(str)

In [40]:
# merge all text columns into one column
text_data['text'] = text_data[text_columns].apply(lambda x: ' '.join(x), axis=1)

In [41]:
text_data['text'][0]

"kfc honey bbq strips very good 60-minutes-or-less, time-to-make, main-ingredient, preparation, healthy, poultry, low-fat, chicken, dietary, low-saturated-fat, high-protein, high-in-something, low-in-something, meat mix flour, salt and pepper in bowl\nset aside\nplace buttermilk in seperate bowl\nheat oil for frying and dips chicken strips in flour then in buttermilk and then in flour again\nplace in hot oil and fry until lightly golden brown\nremove strips and drain\nplace all bbq sauce ingredients in a small sauce pan and simmer on low stirring frequently for 20 minutes\nlet sauce cool\ndip one chicken strip in sauce at a time and place in a baking pan that has been sprayed lightly with a non stick spray\nplace in a preheated oven at 350 and bake for 20 minutes these are so yummy and they do taste just like kfc's honey bbq strips. enjoy!! chicken tenders, flour, garlic powder, salt, ground pepper, buttermilk, vegetable oil, hickory flavored barbecue sauce, water, honey, ketchup, liqu

In [42]:
# remove everything except alphanumeric and spaces
text_data['text'] = text_data['text'].str.replace('[^A-Za-z0-9\s]+', '')

In [43]:
text_data['text'][0]

"kfc honey bbq strips very good 60-minutes-or-less, time-to-make, main-ingredient, preparation, healthy, poultry, low-fat, chicken, dietary, low-saturated-fat, high-protein, high-in-something, low-in-something, meat mix flour, salt and pepper in bowl\nset aside\nplace buttermilk in seperate bowl\nheat oil for frying and dips chicken strips in flour then in buttermilk and then in flour again\nplace in hot oil and fry until lightly golden brown\nremove strips and drain\nplace all bbq sauce ingredients in a small sauce pan and simmer on low stirring frequently for 20 minutes\nlet sauce cool\ndip one chicken strip in sauce at a time and place in a baking pan that has been sprayed lightly with a non stick spray\nplace in a preheated oven at 350 and bake for 20 minutes these are so yummy and they do taste just like kfc's honey bbq strips. enjoy!! chicken tenders, flour, garlic powder, salt, ground pepper, buttermilk, vegetable oil, hickory flavored barbecue sauce, water, honey, ketchup, liqu

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = 200
# create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)

In [45]:
# fit the vectorizer to text data
vectorizer.fit(text_data['text'])

In [46]:
# transform the text data using TF-IDF vectorizer
text_data_transformed = vectorizer.transform(text_data['text'])

In [47]:
# convert the transformed text data into an array
text_array = text_data_transformed.toarray()

In [48]:
text_array.shape

(942368, 200)

In [49]:
dataset.shape

(942368, 27)

In [50]:
from sklearn.decomposition import NMF

# create an instance of the NMF model
nmf_model = NMF(n_components=50, max_iter=200, solver='mu')
nmf_model

In [51]:
# fit the NMF model to the text data
nmf_model.fit(text_array)

In [52]:
# extract the factor matrices from the NMF model
W = nmf_model.transform(text_array)
H = nmf_model.components_

In [53]:
# print the shapes of the factor matrices
print("Shape of W matrix: ", W.shape)
print("Shape of H matrix: ", H.shape)

Shape of W matrix:  (942368, 50)
Shape of H matrix:  (50, 200)


In [54]:
# Evaluate the score of NMF
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

print("MSE: ", mean_squared_error(text_array, np.dot(W, H)))
print("MAE: ", mean_absolute_error(text_array, np.dot(W, H)))
print("R2: ", r2_score(text_array, np.dot(W, H)))

MSE:  0.00145695224551483
MAE:  0.022880258083344783
R2:  0.39836355190812367


In [57]:
# recommend similar recipes given a recipe id
recipe_id = 60599

# get the index of the recipe id
recipe_index = dataset[dataset['recipe_id'] == recipe_id].index[0]

# get the factor values of the recipe
recipe_factors = W[recipe_index]

# calculate the similarity of this recipe with others
similarity = np.dot(W, recipe_factors)

# get the indices of the top 10 most similar recipes (excluding itself)
top10_indices = np.argsort(similarity)[-101:-1]

# print the top 10 most similar recipes
x = dataset.iloc[top10_indices]

# remove the recipe itself and repeated recipe_ids
x = x[x['recipe_id'] != recipe_id]
x = x.drop_duplicates(subset=['recipe_id'])

# print the top 10 most similar recipes
x.head(10)
# print json format
x.to_json(orient='records')

'[{"user_id":132284,"recipe_id":178297,"date":"2013-06-17","rating":0,"review":"this. is soooooo good didn039t have any buttermilk around so i just add a tbsp. of white vinegar to for each cup of milk wait10mins. amp voila you have butter milk. ","name":"kentucky fried chicken honey bbq chicken strips","minutes":30,"contributor_id":1761123,"submitted":"2011-01-07","tags":"weeknight, 30-minutes-or-less, time-to-make, course, main-ingredient, cuisine, preparation, occasion, north-american, healthy, main-dish, poultry, american, southern-united-states, low-fat, chicken, dietary, copycat, low-saturated-fat, low-in-something, meat","n_steps":11,"steps":"mix flour, salt, and pepper in a bowl\\nplace buttermilk in a seperate bowl\\nheat oil for frying\\ndip chicken into flour, then buttermilk, then flour again\\nfry until golden brown\\nfor sauce\\nplace all sauce ingredients in a small sauce pan\\nmix thouroughly and simmer on low heat, stirring frequently for about 20 minutes\\nlet cool sli

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,...,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,negative,neutral,positive,compound
0,7708,60599,2005-09-02,4,very good,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.238,0.762,0.4927
1,27707,60599,2005-12-22,5,better than the real,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.508,0.492,0.4404
2,35308,60599,2006-09-26,5,absolutely awesome i was speechless when i tri...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.883,0.117,0.659


In [58]:
# save the model
import pickle

with open('../backend/saved_models/nmf_model.pkl', 'wb') as f:
    pickle.dump(nmf_model, f)

In [59]:
# save the weights
np.save('../backend/saved_models/nmf_W.npy', W)
np.save('../backend/saved_models/nmf_H.npy', H)

In [None]:
W = np.load('../backend/saved_models/nmf_W.npy')