In [1]:
import pandas as pd
from pandas.core.common import flatten
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
file = '/Users/karenwarmbein/ds/capstone/data/RAW_recipes.csv'
df = pd.read_csv(file)
df.shape

(231637, 12)

In [3]:
df.head(1)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [4]:
df.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [5]:
df_test_subset = df.loc[0:50, ['id', 'name', 'ingredients', 'nutrition']]
df_test_subset.head(1)

Unnamed: 0,id,name,ingredients,nutrition
0,137739,arriba baked winter squash mexican style,"['winter squash', 'mexican seasoning', 'mixed ...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]"


In [6]:
# preprocessing_ingredients 
df_test_subset['ingredients'] = (df_test_subset.ingredients
 .str.replace('[', '')
 .str.replace(']', '')
 .str.replace("'", '')
 .str.replace(' ', '_')
 .str.replace(',_', ', '))

# preprocessing_nutrition 
df_test_subset['nutrition'] = (df_test_subset.nutrition
 .str.replace('[', '')
 .str.replace(']', '')
 .str.replace("'", '')
 .str.replace(",", '')
 .str.split())

In [7]:
#ck
df_test_subset.head(3)

Unnamed: 0,id,name,ingredients,nutrition
0,137739,arriba baked winter squash mexican style,"winter_squash, mexican_seasoning, mixed_spice,...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]"
1,31490,a bit different breakfast pizza,"prepared_pizza_crust, sausage_patty, eggs, mil...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]"
2,112140,all in the kitchen chili,"ground_beef, yellow_onions, diced_tomatoes, to...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]"


In [8]:
#create new nutrition columns
df_test_subset['calories'] = df_test_subset.apply(lambda row: float(row.nutrition[0]), axis=1)
df_test_subset['carbohydrates'] = df_test_subset.apply(lambda row: float(row.nutrition[2]), axis=1)

In [9]:
#Vectorize categories of columns
df_test_subset['Calories_less_or_equal_to_100'] = df_test_subset['calories'].apply(lambda x: 1 if x <= 100 else 0)
df_test_subset['Calories_greater_than_100'] = df_test_subset['calories'].apply(lambda x: 1 if x > 100 else 0)

In [10]:
df_test_subset['carbohydrates_less_or_equal_to_15'] = df_test_subset['carbohydrates'].apply(lambda x: 1 if x <= 15 else 0)
df_test_subset['carbohydrates_greater_than_15'] = df_test_subset['carbohydrates'].apply(lambda x: 1 if x > 15 else 0)

In [11]:
#ck
df_test_subset.head(3)

Unnamed: 0,id,name,ingredients,nutrition,calories,carbohydrates,Calories_less_or_equal_to_100,Calories_greater_than_100,carbohydrates_less_or_equal_to_15,carbohydrates_greater_than_15
0,137739,arriba baked winter squash mexican style,"winter_squash, mexican_seasoning, mixed_spice,...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",51.5,13.0,1,0,1,0
1,31490,a bit different breakfast pizza,"prepared_pizza_crust, sausage_patty, eggs, mil...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",173.4,0.0,0,1,1,0
2,112140,all in the kitchen chili,"ground_beef, yellow_onions, diced_tomatoes, to...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",269.8,32.0,0,1,0,1


In [12]:
#vectorize the ingredients

vec = CountVectorizer()
X = vec.fit_transform(df_test_subset.ingredients)
df_test_new = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
df_test_new.columns = df_test_new.columns.str.replace('_',' ')
df_test_new = df_test_new.join(df_test_subset)

In [13]:
# drop unnecessary columns
df_test_new = df_test_new.drop(['ingredients', 'calories', 'carbohydrates','nutrition'], axis=1)
df_test_new = df_test_new.set_index('id')

In [14]:
df_test_new.head(3)

Unnamed: 0_level_0,fresh ground pepper,freshly ground black pepper,garlic soup,original sauce,pepper,all,american cheese,angel hair pasta,apple,apple cider vinegar,...,yellow bell pepper,yellow cake mix,yellow onion,yellow onions,zucchini,name,Calories_less_or_equal_to_100,Calories_greater_than_100,carbohydrates_less_or_equal_to_15,carbohydrates_greater_than_15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,arriba baked winter squash mexican style,1,0,1,0
31490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,a bit different breakfast pizza,0,1,1,0
112140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,all in the kitchen chili,0,1,0,1


In [15]:
#create a cosine similarity array
cosine_similarity_array = sklearn.metrics.pairwise.cosine_similarity(df_test_new.drop('name', axis=1))
df_new_similarity = pd.DataFrame(cosine_similarity_array, columns=df_test_new.index).set_index(df_test_new.index)
# sorted(set(list(flatten(df_new_similarity.values.tolist()))))

In [16]:
df_new_similarity.head(3)

id,137739,31490,112140,59389,44061,5289,25274,67888,70971,75452,...,93959,93958,58224,33606,94710,35173,83025,52804,108414,26995
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
137739,1.0,0.117851,0.086066,0.27735,0.105409,0.0,0.100504,0.133333,0.0,0.100504,...,0.2,0.071067,0.100504,0.152944,0.105409,0.201008,0.1849,0.0,0.19245,0.152944
31490,0.117851,1.0,0.091287,0.196116,0.111803,0.288675,0.213201,0.070711,0.125,0.213201,...,0.141421,0.150756,0.213201,0.324443,0.223607,0.213201,0.294174,0.196116,0.102062,0.243332
112140,0.086066,0.091287,1.0,0.143223,0.244949,0.210819,0.07785,0.258199,0.182574,0.23355,...,0.154919,0.165145,0.1557,0.059235,0.08165,0.1557,0.214834,0.214834,0.149071,0.23694


In [17]:
#look up primary recipes

#test case #1: tomato
user_response = 'tomato'

# find the input in the column names
array_found_tag = df_test_new.columns.str.find(user_response)

#look up primary recipes

#test case #1: tomato
user_response = 'tomato chocolate'
user_response_list = user_response.split()
# find the input in the column names
list_of_arrays_for_ingredients_matched = []
for _ in user_response_list:
    array_found_tag = df_test_new.columns.str.find(_)
    list_of_arrays_for_ingredients_matched.append(array_found_tag)

In [18]:
# choose the series that matches
df_found = pd.DataFrame()

for array in list_of_arrays_for_ingredients_matched:
    cnt = -1
    for i in array:
        cnt=cnt+1
        if i >= 0:
            df_found = df_found.join(df_test_new.iloc[:, cnt], how='outer')
    print(df_found)

        diced tomatoes  rotel tomatoes  tomato juice  tomato paste  \
id                                                                   
137739               0               0             0             0   
31490                0               0             0             0   
112140               1               1             0             1   
59389                0               0             0             0   
44061                0               0             1             0   
5289                 0               0             0             0   
25274                0               0             0             0   
67888                0               0             0             0   
70971                0               0             0             0   
75452                0               0             0             0   
109439               0               0             0             0   
42198                0               0             0             0   
67547               

In [19]:
list_m = []
cnt2 = -1
for i in  df_found:
    cnt2 = cnt2 + 1
    m = df_found[df_found.keys()[cnt2]] == 1
    list_m.append(df_found.index[m].tolist())
print(list_m)

[[112140, 54272, 87098], [112140], [44061], [112140], [67888, 59952], [112140], [47366, 33606], [70971], [70971]]


In [20]:
list_m = list(flatten(list_m))
list_m

[112140,
 54272,
 87098,
 112140,
 44061,
 112140,
 67888,
 59952,
 112140,
 47366,
 33606,
 70971,
 70971]

In [21]:
# look-up common recipies

list_n = []
for i in list_m:
    n=((df_new_similarity[i] > .4) & (df_new_similarity[i] < .99))
    list_n.append(df_new_similarity.columns[n].tolist())
    list_n = list(set(list(flatten(list_n))))

In [22]:
set_primary = set(list_m)    
print('primary recipies', set_primary)

 
set_similar = set([elem for elem in list_n if elem not in list_m])
print('similar recipies', set_similar)

primary recipies {54272, 47366, 33606, 112140, 67888, 59952, 87098, 70971, 44061}
similar recipies {5289, 67547}


In [23]:
# print results
print('These are recipies for: ', user_response)

print('\nRecipies with', user_response)
for i in set_primary:
    print('--', df_test_new.loc[i, 'name'])

print('\nYou may also like...')
for i in set_similar:
    print('--', df_test_new.loc[i, 'name'])

These are recipies for:  tomato chocolate

Recipies with tomato chocolate
-- fool the meat eaters  chili
-- forgotten  minestrone
-- italian sandwich  pasta salad
-- all in the kitchen  chili
-- backyard style  barbecued ribs
-- global gourmet  taco casserole
-- homemade  vegetable soup from a can
-- bananas 4 ice cream  pie
-- amish  tomato ketchup  for canning

You may also like...
-- apple a day  milk shake
-- better then bush s  baked beans
