# Part 4: Modeling

In [1]:
import pandas as pd

from gensim.models.word2vec import Word2Vec

import numpy as np

from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

from scipy import sparse

import re

import pickle

In [1]:
#Class inspired by Jack Leitch
class MeanEmbeddingVectorizer(object):


    def __init__(self, model):

        self.model = model
        self.vector_size = model.vector_size

    def fit(self):
        return self

    def transform(self,docs):

        doc_word_vector = self.doc_average_list(docs)
        return doc_word_vector

    def doc_average(self, doc):

        mean = []

        for word in doc:
            if word in self.model.wv.index_to_key:
                mean.append(self.model.wv.get_vector(word))

        if not mean:
                return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis = 0)
            return mean

    def doc_average_list(self,docs):

        return np.vstack([self.doc_average(doc) for doc in docs])


In [3]:
df = pd.read_csv('recipe_cleaned_spacy2.csv',index_col=0)

  mask |= (ar1 == a)


In [4]:
df.dropna(inplace = True)
df.reset_index(inplace = True)
df.drop(columns = ['index'], inplace = True)

In [5]:
df.drop(columns = ['level_0'],inplace = True)

##### Annotation:
This model above trains over 1 million recipes. It is the main model utilized to address the problem statement of this investigation. With that said, because the objective is to design a web app, I want to give the option of having faster results, therefore I made 3 models. The first model trains over 1 million recipes and gives the best recommendations. The second model trains on 500,00 recipes, and the third model trains on 100,000 recipes. It was seen on the web app that the first model takes over 3 minutes to output results. The second model takes about 1 minutes 45 seconds to output results. Finally the third model take about 30 seconds to output results. I believe that giving these options to users will facilitate the use of the app and make the overall experience more enjoyable. 

##### Model Hyperparameters: 
 - SG: It was observed that using skip-gram (sg = 1) was much more optimal than using a c-bow model (sg = 0)
 - Window: A value of 8 was chosen for this hyper-parameter because it was observed that the recommendations outputted were much more appropriate than using a smaller or bigger value. It was observed that using a larger value made recommendation too broad and a smaller value made recommendations too specific. After training the model several times, it was observed that a value of 8 was the most optimal hyper-parameter.
 - Min Count: It was intuitive to ignore all words that were not presently frequent more than once.
 - Workers: A value of 8 was used to train the model more rapidly. 

### Model 1


In [50]:
corpus1 = [i.split(', ') for i in df['raw_ingredients']]

In [51]:
model1 = Word2Vec(sentences=corpus1, sg = 1, window=8, min_count=1, workers=8, seed = 42)

In [104]:
# saving model
#pickle.dump(model1, open('Trained_Word2Vec_Model.pkl', 'wb'))

In [93]:
mev_model = MeanEmbeddingVectorizer(model1)

In [94]:
recipes = mev_model.transform(corpus1)
recipes = [recipe.reshape(1, -1) for recipe in recipes]
len(recipes)

1095610

In [105]:
#Saving trained recipes
#pickle.dump(recipes, open('Trained_Recipes.pkl', 'wb'))

In [106]:
# Input ingredients here:
ingredients = ['garlic']

In [107]:
ingredients_vec = mev_model.doc_average(ingredients)
ingredients_vec = ingredients_vec.reshape(1,-1)
ingredients_vec

array([[ 0.5838497 , -0.13494323, -0.3777168 ,  0.22974521, -0.17235155,
         0.612447  ,  0.1079702 ,  0.31237742,  0.1376416 , -0.23954038,
        -0.7165192 ,  0.3355758 , -0.44837242,  0.39962763,  0.05851945,
        -0.2554324 ,  0.36359704, -0.16568245, -0.59909034, -0.2211798 ,
        -0.23983523,  0.20817852,  0.6178698 ,  0.36052775,  0.32834893,
         0.15039438, -0.08724194,  0.5564775 , -0.11186665, -0.25103796,
        -0.32387325,  0.015442  ,  0.11832143,  0.11494556, -0.25509137,
        -0.40199208,  0.59099567,  0.04025366, -0.03242267, -0.01610409,
         0.37565672,  0.12011106,  0.6788713 , -0.00665618, -0.09390499,
         0.1276626 , -0.09634619,  0.17009056,  0.35540503, -0.3561705 ,
        -0.2783841 , -0.54009384, -0.32537788,  0.16948396,  0.20173383,
         0.0307413 , -0.16080798,  0.3946266 , -0.2023036 ,  0.03321686,
         0.11943647,  0.01040315, -0.0026666 ,  0.14515261, -0.3206344 ,
         0.20001465, -0.20309588, -0.28382602, -0.3

In [100]:
similarities = list(map(lambda x: cosine_similarity(ingredients_vec, x)[0][0],recipes))


df_recommendation  = df
df_recommendation['Cosine Similarity'] = similarities
df_recommendation= df_recommendation[['Cosine Similarity', 'title','raw_ingredients','directions']]
df_recommendation.rename(columns = {'title':'Recipe Name', 'raw_ingredients': 'Ingredients','directions':'Directions'})
df_recommendation.sort_values(by = 'Cosine Similarity', ascending = False)[:20]

Unnamed: 0,Cosine Similarity,title,raw_ingredients,directions
79093,0.944944,Fish Stew,"fish, bacon, onion, tomato, potato, , salt, sh...","Cook diced bacon and onion until sauteed., Add..."
594551,0.939401,Fisherman'S Catch Chowder,"fish, onion, celery, carrot, parsley, tomato, ...","Cut cleaned fish into 1-inch pieces., Add all ..."
391224,0.9386,Fish Stew,"fish, bacon, onion, potato, tomato, salt, pepp...","Parboil fish in water; lift fish out to cool.,..."
1021029,0.937461,Fish Hash,"fish, potato, onion, egg, salt, pepper",Combine all the ingredients and fry in bacon f...
687029,0.937461,Old Fashion Fish Cakes,"fish, potato, pepper, salt, egg, onion","Mix all ingredients., Shape into cakes., Bake ..."
596712,0.937304,Joe'S Fish Chowder,"fish fillet, water, potato, onion, potato, tom...","Boil fish fillet in water for 5 minutes., Then..."
258725,0.936167,Catfish Chowder,"fish, potato, onion, pepper, salt, butter, tomato","Boil catfish until tender;, take, out all bone..."
30060,0.933504,Fish Cakes Oven Baked Or Fried,"fish, onion, bacon fat, salt, egg, potato, , b...","Separate fish into small flakes., Cook onion i..."
693500,0.932722,Baked Fish,"fish, salt, oil, tomato, celery, onion, garlic...","Wash fish, add salt and pepper and rub with le..."
761532,0.932653,Fish Cakes,"potato, fish, italian good season, egg, flour","Mix, first, 4 ingredients well., Pat out 3-oun..."


### Model 2
Making a model that train over a smaller data frame. As a result, the model will get faster results. This will be practical for the web app.

In [113]:
df2 = df[:100000]

In [114]:
corpus2 = [i.split(', ') for i in df2['raw_ingredients']]

In [115]:
model2 = Word2Vec(sentences=corpus2, sg = 1, window=8, min_count=1, workers=8, seed = 42)

In [118]:
# saving model
#pickle.dump(model2, open('Trained_Word2Vec_Model2.pkl', 'wb'))

In [116]:
mev_model2 = MeanEmbeddingVectorizer(model2)

In [117]:
recipes2 = mev_model2.transform(corpus2)
recipes2 = [recipe.reshape(1, -1) for recipe in recipes2]
len(recipes2)

100000

In [119]:
#Saving trained recipes
#pickle.dump(recipes2, open('Trained_Recipes2.pkl', 'wb'))

### Model 3

In [120]:
df3 = df[:500000]

In [121]:
corpus3 = [i.split(', ') for i in df3['raw_ingredients']]

In [123]:
model3 = Word2Vec(sentences=corpus3, sg = 1, window=8, min_count=1, workers=8, seed = 42)

In [124]:
# saving model
#pickle.dump(model3, open('Trained_Word2Vec_Model3.pkl', 'wb'))

In [125]:
mev_model3 = MeanEmbeddingVectorizer(model3)

In [126]:
recipes3 = mev_model3.transform(corpus3)
recipes3 = [recipe.reshape(1, -1) for recipe in recipes3]
len(recipes3)

500000

In [127]:
#Saving trained recipes
#pickle.dump(recipes3, open('Trained_Recipes3.pkl', 'wb'))