In [1]:
import pandas as pd
from pandas.core.common import flatten
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
file = '/Users/karenwarmbein/ds/capstone/data/RAW_recipes.csv'
df = pd.read_csv(file)
df.shape

(231637, 12)

In [3]:
df.head(1)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [4]:
df.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [24]:
df_test_subset = df.loc[0:200, ['id', 'name', 'ingredients', 'nutrition']]
df_test_subset.head(1)

Unnamed: 0,id,name,ingredients,nutrition
0,137739,arriba baked winter squash mexican style,"['winter squash', 'mexican seasoning', 'mixed ...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]"


In [25]:
# preprocessing_ingredients 
df_test_subset['ingredients'] = (df_test_subset.ingredients
 .str.replace('[', '')
 .str.replace(']', '')
 .str.replace("'", '')
 .str.replace(' ', '_')
 .str.replace(',_', ', '))

# preprocessing_nutrition 
df_test_subset['nutrition'] = (df_test_subset.nutrition
 .str.replace('[', '')
 .str.replace(']', '')
 .str.replace("'", '')
 .str.replace(",", '')
 .str.split())

In [26]:
#ck
df_test_subset.head(3)

Unnamed: 0,id,name,ingredients,nutrition
0,137739,arriba baked winter squash mexican style,"winter_squash, mexican_seasoning, mixed_spice,...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]"
1,31490,a bit different breakfast pizza,"prepared_pizza_crust, sausage_patty, eggs, mil...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]"
2,112140,all in the kitchen chili,"ground_beef, yellow_onions, diced_tomatoes, to...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]"


In [27]:
#create new nutrition columns
df_test_subset['calories'] = df_test_subset.apply(lambda row: float(row.nutrition[0]), axis=1)
df_test_subset['carbohydrates'] = df_test_subset.apply(lambda row: float(row.nutrition[2]), axis=1)

In [28]:
#Vectorize categories of columns
df_test_subset['Calories_less_or_equal_to_100'] = df_test_subset['calories'].apply(lambda x: 1 if x <= 100 else 0)
df_test_subset['Calories_greater_than_100'] = df_test_subset['calories'].apply(lambda x: 1 if x > 100 else 0)

In [29]:
df_test_subset['carbohydrates_less_or_equal_to_15'] = df_test_subset['carbohydrates'].apply(lambda x: 1 if x <= 15 else 0)
df_test_subset['carbohydrates_greater_than_15'] = df_test_subset['carbohydrates'].apply(lambda x: 1 if x > 15 else 0)

In [30]:
#ck
df_test_subset.head(3)

Unnamed: 0,id,name,ingredients,nutrition,calories,carbohydrates,Calories_less_or_equal_to_100,Calories_greater_than_100,carbohydrates_less_or_equal_to_15,carbohydrates_greater_than_15
0,137739,arriba baked winter squash mexican style,"winter_squash, mexican_seasoning, mixed_spice,...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",51.5,13.0,1,0,1,0
1,31490,a bit different breakfast pizza,"prepared_pizza_crust, sausage_patty, eggs, mil...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",173.4,0.0,0,1,1,0
2,112140,all in the kitchen chili,"ground_beef, yellow_onions, diced_tomatoes, to...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",269.8,32.0,0,1,0,1


In [31]:
#vectorize the ingredients

vec = CountVectorizer()
X = vec.fit_transform(df_test_subset.ingredients)
df_test_new = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
df_test_new.columns = df_test_new.columns.str.replace('_',' ')
df_test_new = df_test_new.join(df_test_subset)

In [32]:
# drop unnecessary columns
df_test_new = df_test_new.drop(['ingredients', 'calories', 'carbohydrates','nutrition'], axis=1)
df_test_new = df_test_new.set_index('id')

In [33]:
df_test_new.head(3)

Unnamed: 0_level_0,fresh ground pepper,freshly ground black pepper,garlic soup,low,original sauce,pepper,pepper cola,active dry yeast,added ketchup,adobo seasoning,...,yellow onion,yellow onions,yoghurt,yogurt,zucchini,name,Calories_less_or_equal_to_100,Calories_greater_than_100,carbohydrates_less_or_equal_to_15,carbohydrates_greater_than_15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,arriba baked winter squash mexican style,1,0,1,0
31490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,a bit different breakfast pizza,0,1,1,0
112140,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,all in the kitchen chili,0,1,0,1


In [34]:
#create a cosine similarity array
cosine_similarity_array = sklearn.metrics.pairwise.cosine_similarity(df_test_new.drop('name', axis=1))
df_new_similarity = pd.DataFrame(cosine_similarity_array, columns=df_test_new.index).set_index(df_test_new.index)
# sorted(set(list(flatten(df_new_similarity.values.tolist()))))

In [35]:
df_new_similarity.head(3)

id,137739,31490,112140,59389,44061,5289,25274,67888,70971,75452,...,39226,216030,268169,98357,150365,243251,20186,109818,243785,207525
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,1.0,0.117851,0.086066,0.27735,0.105409,0.0,0.100504,0.133333,0.0,0.100504,...,0.178174,0.166667,0.333333,0.0,0.222222,0.316228,0.0,0.0,0.117851,0.105409
31490,0.117851,1.0,0.091287,0.196116,0.111803,0.288675,0.213201,0.070711,0.125,0.213201,...,0.0,0.176777,0.235702,0.125,0.235702,0.223607,0.133631,0.144338,0.375,0.223607
112140,0.086066,0.091287,1.0,0.143223,0.244949,0.210819,0.07785,0.258199,0.182574,0.23355,...,0.345033,0.193649,0.086066,0.273861,0.172133,0.08165,0.19518,0.316228,0.182574,0.08165


In [44]:
#look up primary recipes

#test case #1: tomato
user_response = 'tomato'

# find the input in the column names
# array_found_tag = df_test_new.columns.str.find(user_response)
# array_found_tag
df_test_new.get(user_response)

In [36]:
#look up primary recipes

#test case #1: tomato
# user_response = 'tomato chocolate'
user_response_list = user_response.split()
# find the input in the column names
list_of_arrays_for_ingredients_matched = []
for _ in user_response_list:
    array_found_tag = df_test_new.columns.str.find(_)
    list_of_arrays_for_ingredients_matched.append(array_found_tag)

In [37]:
# choose the series that matches
df_found = pd.DataFrame()

for array in list_of_arrays_for_ingredients_matched:
    cnt = -1
    for i in array:
        cnt=cnt+1
        if i >= 0:
            df_found = df_found.join(df_test_new.iloc[:, cnt], how='outer')
    print(df_found)

        chopped tomato  crushed tomatoes  diced tomatoes  fresh tomato  \
id                                                                       
137739               0                 0               0             0   
31490                0                 0               0             0   
112140               0                 0               1             0   
59389                0                 0               0             0   
44061                0                 0               0             0   
5289                 0                 0               0             0   
25274                0                 0               0             0   
67888                0                 0               0             0   
70971                0                 0               0             0   
75452                0                 0               0             0   
109439               0                 0               0             0   
42198                0                

In [38]:
list_m = []
cnt2 = -1
for i in  df_found:
    cnt2 = cnt2 + 1
    m = df_found[df_found.keys()[cnt2]] == 1
    list_m.append(df_found.index[m].tolist())
print(list_m)

[[63793, 59534], [83950], [112140, 54272, 87098], [64302], [190], [205331], [336744], [112140], [100870], [44061], [495314], [112140, 53402, 501028, 523359, 39226], [35595], [67888, 59952, 52443, 24137, 197013, 311991], [112140, 41756, 59534, 109818], [47366, 33606, 32169, 81185, 30300, 78655, 38276, 64045, 98930, 34930, 500166], [53402], [333281], [39363, 62368, 453467, 118843], [70971], [70971], [501028], [177187], [124286], [44895], [71635], [118843], [58651, 177187], [32271]]


In [39]:
list_m = list(flatten(list_m))
list_m

[63793,
 59534,
 83950,
 112140,
 54272,
 87098,
 64302,
 190,
 205331,
 336744,
 112140,
 100870,
 44061,
 495314,
 112140,
 53402,
 501028,
 523359,
 39226,
 35595,
 67888,
 59952,
 52443,
 24137,
 197013,
 311991,
 112140,
 41756,
 59534,
 109818,
 47366,
 33606,
 32169,
 81185,
 30300,
 78655,
 38276,
 64045,
 98930,
 34930,
 500166,
 53402,
 333281,
 39363,
 62368,
 453467,
 118843,
 70971,
 70971,
 501028,
 177187,
 124286,
 44895,
 71635,
 118843,
 58651,
 177187,
 32271]

In [40]:
# look-up common recipies

list_n = []
for i in list_m:
    n=((df_new_similarity[i] > .4) & (df_new_similarity[i] < .99))
    list_n.append(df_new_similarity.columns[n].tolist())
    list_n = list(set(list(flatten(list_n))))

In [41]:
set_primary = set(list_m)    
print('primary recipies', set_primary)

 
set_similar = set([elem for elem in list_n if elem not in list_m])
print('similar recipies', set_similar)

primary recipies {54272, 38276, 100870, 47366, 35595, 112140, 59534, 32271, 205331, 197013, 53402, 58651, 41756, 44061, 62368, 81185, 177187, 501028, 32169, 64045, 64302, 67888, 63793, 59952, 311991, 39226, 87098, 118843, 70971, 190, 78655, 39363, 33606, 500166, 24137, 495314, 71635, 52443, 30300, 453467, 523359, 44895, 333281, 336744, 83950, 98930, 34930, 109818, 124286}
similar recipies {93959, 19208, 39947, 42508, 268942, 61973, 236184, 136602, 42522, 269984, 71457, 5289, 343338, 40237, 80050, 208179, 209203, 98357, 107699, 503475, 95926, 286009, 144952, 75452, 83133, 5060, 35653, 52804, 27087, 83025, 70611, 26835, 227924, 83542, 42198, 149593, 20186, 67547, 216030, 18537, 522861, 176110, 216945, 26995, 92533, 83062, 35964, 22526, 149887}


In [42]:
# print results
print('These are recipies for: ', user_response)

print('\nRecipies with', user_response)
for i in set_primary:
    print('--', df_test_new.loc[i, 'name'])

print('\nYou may also like...')
for i in set_similar:
    print('--', df_test_new.loc[i, 'name'])

These are recipies for:  tomato chocolate

Recipies with tomato chocolate
-- fool the meat eaters  chili
-- now and later  vegetarian empanadas
-- leftovers  spaghetti sauce
-- forgotten  minestrone
-- pour over anything  mushroom gravy
-- all in the kitchen  chili
-- twisted american chop suey
-- one pot  brownies
-- chic  greek salad
-- 10 can spaghetti sauce   oamc
-- killer  lasagna
-- turtle  squares
-- souper  easy sweet   sour meatballs
-- amish  tomato ketchup  for canning
-- the best  chocolate chip cheesecake ever
-- mock a mole   low fat guacamole
-- 250 chocolate chip cookies recipe
-- 50 chili   for the crockpot
-- make that chicken dance  salsa pasta
-- some like it hot
-- red  macaroni salad
-- backyard style  barbecued ribs
-- tide me over   indian chaat  simple veggie salad
-- global gourmet  taco casserole
-- don t bother with the canned stuff    sloppy joes
-- fire  sauce
-- homemade  vegetable soup from a can
-- 250 00 chocolate chip cookies
-- bananas 4 ice cream  