In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
from nltk.tokenize import wordpunct_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer

#### Load in data

In [40]:
# Load the preprocessed recipe table; the first CSV column is used as the row index.
data = pd.read_csv('allerhande_preprocessed.csv', index_col=0)
print(data.head())
# Strip soft hyphens (U+00AD) from the string cells. regex=True turns this into
# a substring replacement; without it, replace() only matches cells whose
# entire value equals '\xad', so hyphens embedded in words were left in place.
data = data.replace('\xad', '', regex=True)

        id                                title  \
0   861106              Vegetarische bonenstoof   
1   680104                    Frisse limoenroom   
2   683858  Kalkoenfilet met champignonroomsaus   
3  1187074       Andijvie met rauwe ham en prei   
4   660152              Spruiten-rozijnensalade   

                                         description        course  \
0  ['Stoofgerecht', 'vegetarische', 'balletjes,',...  hoofdgerecht   
1  ['lekker', 'recept.', 'nagerecht', 'bevat', 'v...     nagerecht   
2  ['Kidsproof', 'winters', 'recept', 'voor', 'pa...  hoofdgerecht   
3  ['bijgerecht', 'andijvie,', 'prei,', 'knoflook...    bijgerecht   
4  ['lekker', 'hollands', 'recept.', 'vegetarisch...    bijgerecht   

   recipe_yield                                        ingredients  calories  \
0           4.0  ['olijfolie', 'tomatenblokjes', 'bruine bonen'...     410.0   
1           4.0  ['limoenen', 'slagroom', 'vloeibare honing', '...     290.0   
2           4.0  ['olijfolie', '

#### Load in inverted columns and convert to dict

In [41]:
# Read the ingredient inverted index: the CSV has no header row, so the two
# columns are named here.
inverted_ingredients_pd = pd.read_csv(
    'inverted_list_ingredients.csv',
    header=None,
    names=['Word', 'Index'],
)
print(inverted_ingredients_pd.head())
# Build a plain dict keyed by word; the values stay exactly as read from the CSV.
inverted_ingredients_dict = inverted_ingredients_pd.set_index('Word')['Index'].to_dict()

                Word             Index
0            rosties     [4181, 13447]
1    kikkererwtenmel            [2405]
2         kipshoarma     [6778, 12197]
3  paddenstoelentrio  [82, 8643, 9195]
4   knoflookbrooddeg            [2205]


In [42]:
inverted_tags_pd = pd.read_csv('inverted_list_tags.csv', header=None, names=['Word', 'Index'])
# Remove rows with a missing (NaN) word or index list. Filtering on the value
# instead of dropping the hard-coded row label 0 keeps working when the CSV is
# regenerated and the missing entry moves, and it avoids in-place mutation.
inverted_tags_pd = inverted_tags_pd.dropna(subset=['Word', 'Index'])
print(inverted_tags_pd.head())
# Map each tag to its index list (values stay exactly as read from the CSV).
inverted_tags_dict = pd.Series(inverted_tags_pd.Index.values,index=inverted_tags_pd.Word).to_dict()

                Word                                              Index
1   zonder vlees/vis  [148, 166, 194, 204, 221, 332, 358, 360, 418, ...
2     midden-oosters  [173, 452, 505, 660, 1093, 1294, 1566, 1622, 2...
3              pasen  [109, 634, 801, 1045, 1293, 1305, 1559, 1574, ...
4              kerst  [1, 5, 11, 32, 34, 38, 41, 53, 77, 81, 82, 90,...
5      scandinavisch  [76, 251, 752, 1435, 2113, 3140, 3885, 4739, 5...


In [43]:
# Read the recipe-type inverted index: the CSV has no header row, so the two
# columns are named here.
inverted_types_pd = pd.read_csv(
    'inverted_list_types.csv',
    header=None,
    names=['Word', 'Index'],
)
print(inverted_types_pd.head())
# Build a plain dict keyed by word; the values stay exactly as read from the CSV.
inverted_types_dict = inverted_types_pd.set_index('Word')['Index'].to_dict()

     Word                                              Index
0     vis  [6, 8, 25, 42, 72, 131, 151, 180, 187, 188, 21...
1  souflé                                     [10029, 15780]
2   chili  [250, 262, 270, 313, 521, 580, 727, 761, 763, ...
3     wok  [77, 127, 197, 227, 253, 479, 527, 557, 578, 6...
4   taart  [12, 31, 43, 49, 59, 67, 120, 185, 201, 241, 2...


In [44]:
inverted_title_pd = pd.read_csv('inverted_list_title.csv', header=None, names=['Word', 'Index'])
print(inverted_title_pd.head())
# Row count as read from the CSV, before any rows are removed.
print(len(inverted_title_pd['Word']))
# Remove rows with a missing (NaN) word or index list. This replaces the
# hard-coded drop of row label 6662, which depended on one particular export
# of the CSV.
# NOTE(review): row 6662 was presumably the non-string word found by the
# earlier manual scan -- confirm no other kind of row was meant.
inverted_title_pd = inverted_title_pd.dropna(subset=['Word', 'Index'])
# Map each title word to its index list (values stay exactly as read from the CSV).
inverted_title_dict = pd.Series(inverted_title_pd.Index.values,index=inverted_title_pd.Word).to_dict()


                  Word                Index
0              rosties        [4181, 13562]
1  hazelnootcantuccini               [9227]
2                  lax               [1331]
3           kipshoarma  [7540, 9256, 14042]
4        kokosspinazie              [15543]
8691


#### Retrieval for a single word in a single inverted column

In [45]:
def retrieve_inverted_column(word, inverted_column):
    """Collect the documents of every index term that contains ``word``.

    Parameters
    ----------
    word : str
        Search term; matched as a substring of each key in ``inverted_column``.
    inverted_column : dict
        Maps a term to its posting list, given either as a Python list
        literal in a string (e.g. ``"[4181, 13447]"``) or as an
        already-parsed iterable of document indices.

    Returns
    -------
    list
        The unique document indices found, in no particular order.
    """
    retrieved_docs = set()
    for key, postings in inverted_column.items():
        # Keys can be non-strings (e.g. a float NaN for a missing word);
        # `word in key` would raise TypeError on those, so skip them.
        if not isinstance(key, str) or word not in key:
            continue
        # Posting lists stored as text are parsed with literal_eval, which
        # only accepts Python literals and never executes code.
        if isinstance(postings, str):
            postings = ast.literal_eval(postings)
        retrieved_docs.update(postings)
    return list(retrieved_docs)

## Retrieval system
- tokenize the query and remove all tokens of length 1, since these are not words
- stem the remaining tokens with the Dutch Snowball stemmer
- for each word in the query, create the set of documents containing this word
    - look through each inverted column to retrieve possible documents
- take the intersection of the per-word document sets, so that only documents matching all query words are retrieved
- print all retrieved documents

In [46]:
def retrieve(query, inverted_columns, dataset):
    """Find the rows of ``dataset`` that match every word of ``query``.

    The query is tokenized, tokens of length 1 are discarded, and the rest are
    stemmed with the Dutch Snowball stemmer. For each stemmed word the matching
    documents are gathered from all inverted columns; the result is the
    intersection of those per-word sets.

    Parameters
    ----------
    query : str
        Free-text search query.
    inverted_columns : list of dict
        Inverted indexes in the form accepted by ``retrieve_inverted_column``.
    dataset : pandas.DataFrame
        Table the document indices refer to; indexed by position (``iloc``).

    Returns
    -------
    list
        Positional row indices of the matching documents, in ascending order.
        Empty when the query has no usable word or nothing matches.

    Side effects
    ------------
    Prints every matching row of ``dataset``.
    """
    tokens = word_tokenize(query)

    # Single-character tokens are not treated as words; stem the rest.
    stemmer = SnowballStemmer("dutch")
    words = [stemmer.stem(token) for token in tokens if len(token) > 1]

    # An empty query (or one made only of 1-character tokens) has nothing to
    # search for; previously this raised IndexError on set_per_word[0].
    if not words:
        return []

    # One set of candidate documents per query word, pooled over all columns.
    set_per_word = []
    for word in words:
        docs_for_word = set()
        for column in inverted_columns:
            docs_for_word.update(retrieve_inverted_column(word, column))
        set_per_word.append(docs_for_word)

    # Keep only the documents that contain every query word. Sorting makes the
    # printed and returned order reproducible between runs.
    retrieved = sorted(set.intersection(*set_per_word))

    # Print all retrieved recipes.
    for index in retrieved:
        print(dataset.iloc[index,:])
        print('\n')

    return retrieved
    
    

In [48]:
# Smoke test: look up "ijskoffie" in all four inverted indexes, then print
# column 1 of the first returned row.
test1 = retrieve(
    "ijskoffie",
    [
        inverted_ingredients_dict,
        inverted_tags_dict,
        inverted_types_dict,
        inverted_title_dict,
    ],
    data,
)
first_hit = test1[0]
print(data.iloc[first_hit, 1])

id                                                               549844
title                                           IJskoffie met chocorasp
description           ['lekker', 'recept.', 'vegetarische', 'nagerec...
course                                                        nagerecht
recipe_yield                                                          4
ingredients                  ['sterke koffie', 'pure chocolade', 'ijs']
calories                                                            105
protein                                                               2
carbohydrates                                                        12
fat                                                                   6
saturated_fat                                                       NaN
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    

## User interface test

In [50]:
from tkinter import *
from PIL import Image, ImageTk

In [None]:
def click():
    """Button callback: search for the entered text and list the matching titles.

    Reads the query from the module-level ``textentry`` widget, clears the
    module-level ``output`` text widget, and writes one recipe title per line.
    """
    entered_text = textentry.get()
    # Tk text indices are "line.column" strings with lines counted from 1, so
    # '1.0' is the start of the widget (the float 0.0 only worked by accident).
    output.delete('1.0', END)
    # A blank query has no search terms; leave the result box empty.
    if not entered_text.strip():
        return
    recipes = retrieve(entered_text, [inverted_ingredients_dict, inverted_tags_dict, inverted_types_dict, inverted_title_dict],data)
    for element in recipes:
        # Look the title up by column name rather than by magic position 1.
        output.insert(END, data['title'].iloc[element])
        output.insert(END, '\n')
    
# Build the main window of the recipe search GUI.
window = Tk()
window.title('Recepten zoeker')
window.configure(background='white')

# Header image. The file is opened once and reused for the Tk photo object;
# previously it was opened twice and the first handle was never used.
# `photo1` stays referenced at module level for as long as the window exists.
image = Image.open("bestek.png")
photo1 = ImageTk.PhotoImage(image)
Label(window, image=photo1).grid(row=0, column=0, sticky=E)

Label(window, text="Wat wil je eten vanavond?", font='none 24 bold').grid(row=1,column=0,sticky=E+W)

# Query entry (left) and search button (right) share grid row 3.
textentry = Entry(window, width = 35)
textentry.grid(row=3,column=0,sticky=W)

Button(window, text='Zoek recepten', width=13, command=click, bg='red', fg='black', font='none 18 bold').grid(row=3,column=0,sticky=E)

Label(window, text='Gevonden recepten',font='none 18 bold').grid(row=4,column=0,sticky=E+W)

# Result box that click() fills with the titles of the retrieved recipes.
output = Text(window, width=75, height=6, wrap=WORD)
output.grid(row=5,column=0,columnspan=2,sticky=E+W)

# Blocks until the window is closed.
window.mainloop()

id                                                               549844
title                                           IJskoffie met chocorasp
description           ['lekker', 'recept.', 'vegetarische', 'nagerec...
course                                                        nagerecht
recipe_yield                                                          4
ingredients                  ['sterke koffie', 'pure chocolade', 'ijs']
calories                                                            105
protein                                                               2
carbohydrates                                                        12
fat                                                                   6
saturated_fat                                                       NaN
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    

id                                                               197388
title                                                    Rijstkroketjes
description           ['lekker', 'italiaans', 'recept.', 'vegetarisc...
course                                                      voorgerecht
recipe_yield                                                         12
ingredients           ['witte wijn', 'saffraandraadjes', 'zout', 'ri...
calories                                                            190
protein                                                               4
carbohydrates                                                        17
fat                                                                  12
saturated_fat                                                       NaN
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    

id                                                               389204
title                                       Mexicaanse stamppot met jus
description           ['lekker', 'recept.', 'hoofdgerecht', 'bevat',...
course                                                     hoofdgerecht
recipe_yield                                                          4
ingredients           ['vloeibare margarine', 'aardappelpuree voor s...
calories                                                            590
protein                                                              26
carbohydrates                                                        55
fat                                                                  23
saturated_fat                                                         0
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    

id                                                               406158
title                                Karbonade en herfstsalade met peer
description           ['lekker', 'hollands', 'recept.', 'hoofdgerech...
course                                                     hoofdgerecht
recipe_yield                                                          4
ingredients           ['olijfolie', 'ribkarbonades', 'champignons', ...
calories                                                            575
protein                                                              36
carbohydrates                                                        38
fat                                                                  31
saturated_fat                                                       NaN
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    

id                                                              1188520
title                 Hollandse maaltijdsoep met aardappel en rookwo...
description           ['Soep', 'van', 'verspakket', 'groentesoep', '...
course                                                     hoofdgerecht
recipe_yield                                                          4
ingredients           ['verspakket groentesoep', 'roodschillige aard...
calories                                                            410
protein                                                              14
carbohydrates                                                        27
fat                                                                  26
saturated_fat                                                        10
sodium                                                             1240
fiber                                                                 6
cooking_time                                                    

id                                                               921614
title                  Zalm met groene thee, kokosrijst en misogroenten
description           ['lekker', 'aziatisch', 'recept.', 'hoofdgerec...
course                                                     hoofdgerecht
recipe_yield                                                          4
ingredients           ['basmatirijst', 'kokend water', 'vloeibare ho...
calories                                                            715
protein                                                              36
carbohydrates                                                        76
fat                                                                  28
saturated_fat                                                         0
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    