Import dependencies

In [1]:
import pandas as pd
import nltk
from pandas.core.common import flatten
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import SnowballStemmer as snow


Load file as a dataframe

In [2]:
file = '/Users/karenwarmbein/ds/capstone/data/RAW_recipes.csv'
df = pd.read_csv(file)

Explore the data

In [3]:
print(df.shape)
print(df.columns)
df.head(1)

(231637, 12)
Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [4]:
df.shape

(231637, 12)

In [5]:
df_gf = df[(df['description'].str.contains('gluten free') |
           df['description'].str.contains('gluten intolerant') |
           df['ingredients'].str.contains('gluten free') |
           df['ingredients'].str.contains('gluten intolerant') |
           df['name'].str.contains('gluten free') |
           df['name'].str.contains('gluten intolerant') |
           ~ df['ingredients'].str.contains('oat|rye|barley|wheat') 
           )]
df_gf.shape

(217325, 12)

Take a subset of the data (only 100 rows & the relevant features); set the index as the rexipe uniqe id

In [6]:
df_recipes = df_gf.loc[0:100, ['id', 'name', 'ingredients']]
df_recipes = df_recipes.set_index('id')
df_recipes.head()

Unnamed: 0_level_0,name,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
137739,arriba baked winter squash mexican style,"['winter squash', 'mexican seasoning', 'mixed ..."
31490,a bit different breakfast pizza,"['prepared pizza crust', 'sausage patty', 'egg..."
112140,all in the kitchen chili,"['ground beef', 'yellow onions', 'diced tomato..."
59389,alouette potatoes,"['spreadable cheese with garlic and herbs', 'n..."
44061,amish tomato ketchup for canning,"['tomato juice', 'apple cider vinegar', 'sugar..."


## TF-IDF Work

Use a TFIDF vectorizer

In [7]:
stemmer = nltk.SnowballStemmer("english")

def tokenize(text):
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokens = nltk.regexp_tokenize(text, pattern)
    stemmer = snow("english")
    stems = []
    for item in tokens:
        stems.append(stemmer.stem(item))
    return stems

In [8]:

# remove stop words: ex: 'and'
# tokenize function
# Use a snowball stemmer

tfidf = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english', tokenizer=tokenize)

response = tfidf.fit_transform(df_recipes.ingredients)

df_tfidf = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names(), index=df_recipes.index)



df_tfidf.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0_level_0,ad,albacor,almond,american,angel,appl,avocado,bacon,bake,banana,...,wine,winter,wood,worcestershir,wrap,yeast,yellow,yogurt,yolk,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.399415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31490,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.214982,0.0,0.0,0.0
59389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186813,0.0,0.0,0.0,0.0,0.0,0.209172,0.0,0.0,0.0
44061,0.0,0.0,0.0,0.0,0.0,0.319487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Create a cosine similarity table

In [12]:
cosine_similarity_array_tfidf = sklearn.metrics.pairwise.cosine_similarity(df_tfidf)
df_similarity_tfidf = pd.DataFrame(cosine_similarity_array_tfidf, columns=df_tfidf.index).set_index(df_tfidf.index)
df_similarity_tfidf.head()

id,137739,31490,112140,59389,44061,5289,25274,67888,70971,75452,...,98930,83133,39947,44895,39363,62368,111875,42522,34930,63793
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,1.0,0.018825,0.011787,0.072172,0.080791,0.0,0.138922,0.054928,0.0,0.052545,...,0.114344,0.04708,0.0,0.08855,0.053954,0.0294,0.029204,0.07896,0.065529,0.016874
31490,0.018825,1.0,0.040842,0.09218,0.048072,0.05509,0.0,0.034342,0.0,0.054619,...,0.047838,0.019242,0.062883,0.077118,0.056084,0.10417,0.041691,0.082591,0.04124,0.018706
112140,0.011787,0.040842,1.0,0.06985,0.124099,0.0,0.0,0.202878,0.0,0.011669,...,0.123495,0.10177,0.0,0.026209,0.068675,0.020408,0.0,0.158907,0.203367,0.114684
59389,0.072172,0.09218,0.06985,1.0,0.157588,0.0,0.162783,0.11784,0.0,0.011354,...,0.127842,0.031796,0.0,0.017234,0.011658,0.019856,0.0,0.050316,0.025124,0.062743
44061,0.080791,0.048072,0.124099,0.157588,1.0,0.25641,0.073907,0.234682,0.0,0.09763,...,0.270398,0.243664,0.052556,0.054875,0.049441,0.024744,0.024578,0.0,0.040925,0.058021


In [16]:
# lemmatize the users input

user_response = 'banana'
user_response_tfidf = stemmer.stem(user_response)
user_response_tfidf

'banana'

In [17]:
df_primary_recipes_tfidf = df_tfidf[df_tfidf.get(user_response_tfidf) > 0]
df_primary_recipes_tfidf

Unnamed: 0_level_0,ad,albacor,almond,american,angel,appl,avocado,bacon,bake,banana,...,wine,winter,wood,worcestershir,wrap,yeast,yellow,yogurt,yolk,zucchini
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238122,0.29415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.479401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238935,0.295154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.489014,0.302037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240398,0.0,0.0


In [18]:
list_primary_recipes_tfidf = df_primary_recipes_tfidf.index.tolist()
list_primary_recipes_tfidf

[70971, 75452, 95926, 83062, 39363, 111875]

In [19]:
for id_ in list_primary_recipes_tfidf:
    # lookup each primary recipe against all other recipes in the similarity matrix
    for column_id in df_similarity_tfidf.columns:
        
        # check if the similarity is between values    
        if df_similarity_tfidf.loc[id_,  column_id] > .4 \
            and df_similarity_tfidf.loc[id_, column_id] < .99:

            #for the print statement
            primary = df_recipes.loc[id_, 'name']
            similar = df_recipes.loc[column_id, 'name']
            value = df_similarity_tfidf.loc[id_,  column_id]
            
            print(f"Has {user_response}: {primary} id: {id_} \nSimilar recipie: {similar} id: {column_id} value: {value}\n")

Has banana: beat this  banana bread id: 75452 
Similar recipie: one bowl  perfect pound cake id: 26835 value: 0.5706388461228459

Has banana: beat this  banana bread id: 75452 
Similar recipie: the best  banana bread  or muffins id: 39363 value: 0.46343641373013666

Has banana: spicy  banana bread id: 83062 
Similar recipie: the best  banana bread  or muffins id: 39363 value: 0.4285704334369126

Has banana: the best  banana bread  or muffins id: 39363 
Similar recipie: beat this  banana bread id: 75452 value: 0.46343641373013666

Has banana: the best  banana bread  or muffins id: 39363 
Similar recipie: get the sensation  brownies id: 27087 value: 0.41169276205289995

Has banana: the best  banana bread  or muffins id: 39363 
Similar recipie: jeanne s style  birthday cake id: 83025 value: 0.4001067957864519

Has banana: the best  banana bread  or muffins id: 39363 
Similar recipie: keep it going  german friendship cake id: 26995 value: 0.5588144306100108

Has banana: the best  banana br