Import dependencies

In [1]:
import pandas as pd
import nltk
from pandas.core.common import flatten
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import SnowballStemmer as snow


Load file as a dataframe

In [2]:
file = '/Users/karenwarmbein/ds/capstone/data/RAW_recipes.csv'
df = pd.read_csv(file)

Explore the data

In [3]:
print(df.shape)
print(df.columns)
df.head(1)

(231637, 12)
Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [35]:
df.shape

(231637, 12)

In [25]:
df_gf = df[(df['description'].str.contains('gluten free') |
           df['description'].str.contains('gluten intolerant') |
           df['ingredients'].str.contains('gluten free') |
           df['ingredients'].str.contains('gluten intolerant') |
           df['name'].str.contains('gluten free') |
           df['name'].str.contains('gluten intolerant') |
           ~ df['ingredients'].str.contains('oat|rye|barley|wheat') 
           )]
df_gf.shape

(217325, 12)

Take a subset of the data (only 100 rows & the relevant features); set the index as the rexipe uniqe id

In [26]:
df_recipes = df_gf.loc[0:100, ['id', 'name', 'ingredients']]
df_recipes = df_recipes.set_index('id')
df_recipes.head()

Unnamed: 0_level_0,name,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
137739,arriba baked winter squash mexican style,"['winter squash', 'mexican seasoning', 'mixed ..."
31490,a bit different breakfast pizza,"['prepared pizza crust', 'sausage patty', 'egg..."
112140,all in the kitchen chili,"['ground beef', 'yellow onions', 'diced tomato..."
59389,alouette potatoes,"['spreadable cheese with garlic and herbs', 'n..."
44061,amish tomato ketchup for canning,"['tomato juice', 'apple cider vinegar', 'sugar..."


Vectorize the ingredients

In [27]:
vec = CountVectorizer()

In [28]:
# fit transform the ingredients
X = vec.fit_transform(df_recipes.ingredients)

In [29]:
df_ingredients = pd.DataFrame(X.toarray(), columns=vec.get_feature_names(), index=df_recipes.index)

#view the first three vectors
df_ingredients.head()

Unnamed: 0_level_0,added,albacore,all,almonds,american,and,angel,apple,apples,avocado,...,winter,with,wood,worcestershire,wrap,yeast,yellow,yogurt,yolks,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
31490,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
59389,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
44061,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Create a cosine similarity table

In [30]:
cosine_similarity_array = sklearn.metrics.pairwise.cosine_similarity(df_ingredients)
df_similarity = pd.DataFrame(cosine_similarity_array, columns=df_ingredients.index).set_index(df_ingredients.index)
df_similarity.head()

id,137739,31490,112140,59389,44061,5289,25274,67888,70971,75452,...,98930,83133,39947,44895,39363,62368,111875,42522,34930,63793
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,1.0,0.090909,0.055989,0.155126,0.226134,0.0,0.128565,0.073671,0.0,0.1557,...,0.241747,0.150756,0.0,0.083624,0.150756,0.06742,0.075378,0.080582,0.13484,0.095346
31490,0.090909,1.0,0.111979,0.310253,0.150756,0.090909,0.0,0.110506,0.0,0.1557,...,0.241747,0.075378,0.0,0.167248,0.150756,0.20226,0.075378,0.161165,0.13484,0.095346
112140,0.055989,0.111979,1.0,0.095539,0.139272,0.0,0.0,0.226863,0.0,0.047946,...,0.148888,0.139272,0.0,0.051503,0.139272,0.041523,0.0,0.148888,0.290659,0.176166
59389,0.155126,0.310253,0.095539,1.0,0.300123,0.0,0.182818,0.209519,0.0,0.044281,...,0.320844,0.085749,0.0,0.047565,0.042875,0.038348,0.0,0.09167,0.076696,0.054233
44061,0.226134,0.150756,0.139272,0.300123,1.0,0.226134,0.159901,0.305424,0.0,0.193649,...,0.334077,0.375,0.111803,0.138675,0.125,0.055902,0.0625,0.0,0.055902,0.158114


Look up primary recipes

In [31]:
#test case #1: apple
#---- TO DO ---- let user enter a comma separated list of ingredients 

user_response = 'apple'

In [32]:
#--TO DO--- combine apple and apples column

df_primary_recipes = df_ingredients[df_ingredients.get(user_response) > 0]
df_primary_recipes

Unnamed: 0_level_0,added,albacore,all,almonds,american,and,angel,apple,apples,avocado,...,winter,with,wood,worcestershire,wrap,yeast,yellow,yogurt,yolks,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44061,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5289,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
63593,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
71457,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Make a list of the primary recipe id's (the index)

In [33]:
list_primary_recipes = df_primary_recipes.index.tolist()
list_primary_recipes

[44061, 5289, 63593, 71457]

In [34]:
for id_ in list_primary_recipes:
    # lookup each primary recipe against all other recipes in the similarity matrix
    for column_id in df_similarity.columns:
        
        # check if the similarity is between values    
        if df_similarity.loc[id_,  column_id] > .4 \
            and df_similarity.loc[id_, column_id] < .98:

            #for the print statement
            primary = df_recipes.loc[id_, 'name']
            similar = df_recipes.loc[column_id, 'name']
        
            print(f"Has {user_response}: {primary} {id_} \nSimilar recipie: {similar} {column_id}\n")

Has apple: apple a day  milk shake 5289 
Similar recipie: healthy for them  yogurt popsicles 67664

Has apple: apple a day  milk shake 5289 
Similar recipie: more  more    apple pear jigglers 63593

Has apple: more  more    apple pear jigglers 63593 
Similar recipie: apple a day  milk shake 5289

Has apple: rise and shine  german fruit pancake 71457 
Similar recipie: deep fried dessert thingys 107699

Has apple: rise and shine  german fruit pancake 71457 
Similar recipie: jeanne s style  birthday cake 83025

Has apple: rise and shine  german fruit pancake 71457 
Similar recipie: make it your way  shortcakes 35653



## TF-IDF Work

Use a TFIDF vectorizer

In [36]:
stemmer = nltk.SnowballStemmer("english")

def tokenize(text):
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokens = nltk.regexp_tokenize(text, pattern)
    stemmer = snow("english")
    stems = []
    for item in tokens:
        stems.append(stemmer.stem(item))
    return stems

In [37]:

# remove stop words: ex: 'and'
# tokenize function
# Use a snowball stemmer

tfidf = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english', tokenizer=tokenize)

response = tfidf.fit_transform(df_recipes.ingredients)

df_tfidf = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names(), index=df_recipes.index)



df_tfidf.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0_level_0,ad,albacor,almond,american,angel,appl,avocado,bacon,bake,banana,...,wine,winter,wood,worcestershir,wrap,yeast,yellow,yogurt,yolk,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.399415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31490,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.214982,0.0,0.0,0.0
59389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186813,0.0,0.0,0.0,0.0,0.0,0.209172,0.0,0.0,0.0
44061,0.0,0.0,0.0,0.0,0.0,0.319487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Create a cosine similarity table

In [38]:
cosine_similarity_array_tfidf = sklearn.metrics.pairwise.cosine_similarity(df_tfidf)
df_similarity_tfidf = pd.DataFrame(cosine_similarity_array_tfidf, columns=df_ingredients.index).set_index(df_ingredients.index)
df_similarity_tfidf.head()

id,137739,31490,112140,59389,44061,5289,25274,67888,70971,75452,...,98930,83133,39947,44895,39363,62368,111875,42522,34930,63793
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,1.0,0.018825,0.011787,0.072172,0.080791,0.0,0.138922,0.054928,0.0,0.052545,...,0.114344,0.04708,0.0,0.08855,0.053954,0.0294,0.029204,0.07896,0.065529,0.016874
31490,0.018825,1.0,0.040842,0.09218,0.048072,0.05509,0.0,0.034342,0.0,0.054619,...,0.047838,0.019242,0.062883,0.077118,0.056084,0.10417,0.041691,0.082591,0.04124,0.018706
112140,0.011787,0.040842,1.0,0.06985,0.124099,0.0,0.0,0.202878,0.0,0.011669,...,0.123495,0.10177,0.0,0.026209,0.068675,0.020408,0.0,0.158907,0.203367,0.114684
59389,0.072172,0.09218,0.06985,1.0,0.157588,0.0,0.162783,0.11784,0.0,0.011354,...,0.127842,0.031796,0.0,0.017234,0.011658,0.019856,0.0,0.050316,0.025124,0.062743
44061,0.080791,0.048072,0.124099,0.157588,1.0,0.25641,0.073907,0.234682,0.0,0.09763,...,0.270398,0.243664,0.052556,0.054875,0.049441,0.024744,0.024578,0.0,0.040925,0.058021


In [39]:
# lemmatize the users input
user_response_tfidf = stemmer.stem(user_response)
user_response_tfidf

'appl'

In [40]:
df_primary_recipes_tfidf = df_tfidf[df_tfidf.get(user_response_tfidf) > 0]
df_primary_recipes_tfidf

Unnamed: 0_level_0,ad,albacor,almond,american,angel,appl,avocado,bacon,bake,banana,...,wine,winter,wood,worcestershir,wrap,yeast,yellow,yogurt,yolk,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44061,0.0,0.0,0.0,0.0,0.0,0.319487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5289,0.0,0.0,0.0,0.0,0.0,0.635317,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63593,0.0,0.0,0.0,0.0,0.0,0.341483,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.341483,0.0,0.0
5060,0.0,0.0,0.0,0.0,0.0,0.27818,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19208,0.0,0.0,0.0,0.0,0.0,0.395208,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71457,0.0,0.0,0.450538,0.0,0.0,0.335158,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
list_primary_recipes_tfidf = df_primary_recipes_tfidf.index.tolist()
list_primary_recipes_tfidf

[44061, 5289, 63593, 5060, 19208, 71457]

In [42]:
for id_ in list_primary_recipes_tfidf:
    # lookup each primary recipe against all other recipes in the similarity matrix
    for column_id in df_similarity_tfidf.columns:
        
        # check if the similarity is between values    
        if df_similarity_tfidf.loc[id_,  column_id] > .4 \
            and df_similarity_tfidf.loc[id_, column_id] < .99:

            #for the print statement
            primary = df_recipes.loc[id_, 'name']
            similar = df_recipes.loc[column_id, 'name']
            value = df_similarity_tfidf.loc[id_,  column_id]
            
            print(f"Has {user_response}: {primary} id: {id_} \nSimilar recipie: {similar} id: {column_id} value: {value}\n")

Has apple: apple a day  milk shake id: 5289 
Similar recipie: healthy for them  yogurt popsicles id: 67664 value: 0.4604935674159842

Has apple: rise and shine  german fruit pancake id: 71457 
Similar recipie: make it your way  shortcakes id: 35653 value: 0.5014459887636382

