In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
from nltk.tokenize import wordpunct_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer

#### Load the data

In [3]:
# Load the preprocessed recipes; the first CSV column holds the row index.
data = pd.read_csv('allerhande_preprocessed.csv', index_col=0)
print(data.head())
# Strip soft hyphens (U+00AD). Without regex=True, replace() only matches cells
# whose entire value equals '\xad', so soft hyphens embedded in longer strings
# were left untouched.
data = data.replace('\xad', '', regex=True)

        id                                   title  \
0   861106            ['vegetarisch', 'bonenstof']   
1   680104                   ['fris', 'limoenrom']   
2   683858  ['kalkoenfilet', 'champignonroomsaus']   
3  1187074     ['andijvie', 'rauw', 'ham', 'prei']   
4   660152        ['spruit', '-', 'rozijnensalad']   

                                         description        course  \
0  ['stoofgerecht', 'vegetarisch', 'balletjes', '...  hoofdgerecht   
1  ['recept', '.', 'nagerecht', 'bevat', 'volgend...     nagerecht   
2  ['kidsprof', 'winter', 'recept', 'pasta', 'kal...  hoofdgerecht   
3  ['bijgerecht', 'andijvie', ',', 'prei', ',', '...    bijgerecht   
4  ['holland', 'recept', '.', 'vegetarisch', 'bij...    bijgerecht   

   recipe_yield                                        ingredients  calories  \
0           4.0  ['olijfolie', 'tomatenblokjes', 'bruine bon', ...     410.0   
1           4.0  ['limoen', 'slagrom', 'vloeibare hon', 'grieks...     290.0   
2           4.

In [4]:
# Raw, human-readable recipes; the first CSV column holds the row index.
view_data = pd.read_csv(
    'allerhande_raw.csv',
    index_col=0,
)
print(view_data.head())

        id                                title  \
0   861106              Vegetarische bonenstoof   
1   680104                    Frisse limoenroom   
2   683858  Kalkoenfilet met champignonroomsaus   
3  1187074       Andijvie met rauwe ham en prei   
4   660152              Spruiten-rozijnensalade   

                                         description        course  \
0  Stoofgerecht met vegetarische balletjes, aarda...  hoofdgerecht   
1  Een lekker recept. Het nagerecht bevat de volg...     nagerecht   
2  Kidsproof winters recept voor pasta met kalkoe...  hoofdgerecht   
3  bijgerecht met andijvie, prei, knoflook, tijm ...    bijgerecht   
4  Een lekker hollands recept. Het vegetarische b...    bijgerecht   

  recipe_yield                                        ingredients calories  \
0   4 personen  {'olijfolie': '2 el()', 'tomatenblokjes': '800...  410kcal   
1   4 personen  {'limoenen': ' 2(schoongeboend)', 'slagroom': ...  290kcal   
2   4 personen  {'olijfolie': '2 el()'

#### Load in inverted columns and convert to dict

In [5]:
# Inverted index for ingredients: one row per term ('Word') with its posting
# list ('Index'), which is still a string such as "[967, 8407]" at this point.
inverted_ingredients_pd = pd.read_csv(
    'inverted_list_ingredients.csv',
    header=None,
    names=['Word', 'Index'],
)
print(inverted_ingredients_pd.head())
# Map term -> posting-list string.
inverted_ingredients_dict = inverted_ingredients_pd.set_index('Word')['Index'].to_dict()

                               Word                     Index
0                chinese-groentemix  [967, 8407, 8856, 10451]
1  paddenstoelenmix paprika bieslok                   [13788]
2                           bananas                    [7345]
3              aziatische wokgroent             [7595, 15247]
4              kruidenmix voor nasi                   [11139]


In [6]:
# Inverted index for tags: one row per term ('Word') with its posting list ('Index').
inverted_tags_pd = pd.read_csv('inverted_list_tags.csv', header=None, names=['Word', 'Index'])
# Remove rows whose term is not a string (e.g. NaN from an empty CSV field).
# The rows are selected by index *label*, which is what drop() expects; the
# previous enumerate() positions only matched the labels by coincidence of the
# default RangeIndex.
is_word = inverted_tags_pd['Word'].map(lambda word: type(word) is str).astype(bool)
index_nr = inverted_tags_pd.index[~is_word].tolist()
inverted_tags_pd = inverted_tags_pd.drop(index=index_nr)
print(inverted_tags_pd.head())
# Map term -> posting-list string.
inverted_tags_dict = pd.Series(inverted_tags_pd.Index.values,index=inverted_tags_pd.Word).to_dict()

               Word                                              Index
1  zonder vlees/vis  [14, 52, 59, 74, 174, 185, 245, 296, 311, 385,...
2         caribisch                                 [649, 4952, 10112]
3             frans  [44, 93, 107, 233, 246, 312, 325, 413, 430, 44...
4             pasen  [109, 634, 801, 1045, 1293, 1305, 1559, 1574, ...
5         traktatie  [2848, 4303, 6495, 7928, 8161, 9488, 14460, 16...


In [7]:
# Inverted index for dish types: one row per term ('Word') with its posting
# list ('Index'), kept as a string.
inverted_types_pd = pd.read_csv(
    'inverted_list_types.csv',
    header=None,
    names=['Word', 'Index'],
)
print(inverted_types_pd.head())
# Map term -> posting-list string.
inverted_types_dict = inverted_types_pd.set_index('Word')['Index'].to_dict()

             Word                                              Index
0         hutspot  [766, 1082, 2267, 2794, 3092, 4189, 4266, 4617...
1      pannenkoek  [204, 248, 368, 414, 497, 572, 586, 786, 917, ...
2           fruit  [282, 877, 2094, 5986, 6117, 6609, 8763, 12188...
3  Type not found  [1, 17, 41, 51, 98, 101, 104, 121, 124, 126, 1...
4        stoofpot  [0, 62, 63, 167, 183, 314, 325, 327, 389, 537,...


In [8]:
# Inverted index for title words: one row per term ('Word') with its posting list ('Index').
inverted_title_pd = pd.read_csv('inverted_list_title.csv', header=None, names=['Word', 'Index'])
print(inverted_title_pd.head())
# Find rows whose term is not a string (e.g. NaN from an empty CSV field).
# They are identified by index *label*, which is what drop() expects; the
# previous enumerate() positions only matched the labels by coincidence of the
# default RangeIndex.
is_word = inverted_title_pd['Word'].map(lambda word: type(word) is str).astype(bool)
index_nr = inverted_title_pd.index[~is_word].tolist()
print(index_nr)
# Row count before the non-string rows are removed.
print(len(inverted_title_pd['Word']))
inverted_title_pd = inverted_title_pd.drop(index=index_nr)
# Map term -> posting-list string.
inverted_title_dict = pd.Series(inverted_title_pd.Index.values,index=inverted_title_pd.Word).to_dict()


             Word    Index
0        taglioni   [7690]
1         bananas  [16662]
2     koffiewafel    [618]
3  rodekooltaartj   [6256]
4        crawfish   [8587]
[5065]
8691


#### First: retrieval for a single word in a single inverted column

In [9]:
def retrieve_inverted_column(word, inverted_column):
    """Return the ids of all documents whose index term contains ``word``.

    Matching is by substring: every key of ``inverted_column`` that contains
    ``word`` contributes its posting list to the result.

    Parameters
    ----------
    word : str
        Search term.
    inverted_column : dict
        Maps an index term to its posting list. The posting list may be the
        string form of a Python list (e.g. ``"[1, 2]"``, as read from CSV) or
        an already parsed iterable of document ids.

    Returns
    -------
    list
        Unique document ids, in no particular order.
    """
    retrieved_docs = set()
    for key, postings in inverted_column.items():
        # Keys built from a pandas column can be float NaN (empty CSV field);
        # `word in nan` would raise TypeError, so non-string keys are skipped.
        if not isinstance(key, str) or word not in key:
            continue
        if isinstance(postings, str):
            # literal_eval only parses Python literals; it does not execute code.
            postings = ast.literal_eval(postings)
        retrieved_docs.update(postings)
    return list(retrieved_docs)

## Retrieval system
- tokenize the query and drop all tokens of length 1, since these are not words; lowercase and stem the remaining tokens
- for each word in query, create set of documents containing this word
    - look through each inverted column to retrieve possible documents
- intersect the per-word document sets so that only documents matching every query word are retrieved
- print all retrieved documents

In [10]:
def retrieve(query, inverted_columns, dataset):
    """Find and print the recipes that match every word of ``query``.

    The query is tokenised, single-character tokens are dropped, and the
    remaining tokens are lowercased and stemmed with the Dutch Snowball
    stemmer. A document matches a word when any of the inverted columns lists
    it for that word; the result is the intersection over all words.

    Parameters
    ----------
    query : str
        Free-text search query.
    inverted_columns : list of dict
        Inverted indexes as accepted by ``retrieve_inverted_column``.
    dataset : pandas.DataFrame
        Frame whose rows are printed for each hit; rows are looked up by
        position (``iloc``).

    Returns
    -------
    list
        Positions of the matching rows, in no particular order. Empty when the
        query contains no usable word.
    """
    stemmer = SnowballStemmer("dutch")
    # Single-character tokens (punctuation, stray letters) are not treated as words.
    words = [
        stemmer.stem(token.lower())
        for token in word_tokenize(query)
        if len(token) > 1
    ]

    # An empty or punctuation-only query has nothing to intersect; previously
    # this crashed with IndexError on set_per_word[0].
    if not words:
        return []

    # One set of candidate documents per query word, merged over all columns.
    set_per_word = []
    for word in words:
        docs_for_word = set()
        for column in inverted_columns:
            docs_for_word.update(retrieve_inverted_column(word, column))
        set_per_word.append(docs_for_word)

    # Keep only the documents that contain every query word.
    retrieved = set_per_word[0]
    for element in set_per_word[1:]:
        retrieved = retrieved.intersection(element)

    # Print all retrieved recipes.
    for index in retrieved:
        print(dataset.iloc[index, :])
        print('\n')

    return list(retrieved)
    
    

In [11]:
# Smoke test: run a one-word query against all four inverted indexes.
test1 = retrieve(
    "ijskoffie",
    [
        inverted_ingredients_dict,
        inverted_tags_dict,
        inverted_types_dict,
        inverted_title_dict,
    ],
    data,
)
# Show column 1 (the title) of the first hit.
print(data.iloc[test1[0], 1])

id                                                               549844
title                                        ['ijskoffie', 'chocorasp']
description           ['recept', '.', 'vegetarisch', 'nagerecht', 'b...
course                                                        nagerecht
recipe_yield                                                          4
ingredients                   ['sterke koffie', 'pure chocolad', 'ijs']
calories                                                            105
protein                                                               2
carbohydrates                                                        12
fat                                                                   6
saturated_fat                                                       NaN
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    

## User interface test

In [12]:
from tkinter import *
from PIL import Image, ImageTk

In [19]:
# Buttons created for the most recent search, kept so the next search can remove them.
result_buttons = []

def click():
    """Run a search for the text in the entry box and list the hits as buttons.

    Reads the query from ``textentry``, looks it up in all four inverted
    indexes via ``retrieve`` and adds one button per hit to ``window``,
    starting at grid row 5. Each button is labelled with column 1 of
    ``view_data`` for that row.
    """
    # Remove the buttons of the previous search first; otherwise they pile up
    # and stale results stay visible when the new search has fewer hits.
    for old_button in result_buttons:
        old_button.destroy()
    result_buttons.clear()

    entered_text = textentry.get()
    # Nothing to search for: leave the result list empty.
    if not entered_text.strip():
        return

    recipes = retrieve(entered_text, [inverted_ingredients_dict, inverted_tags_dict, inverted_types_dict, inverted_title_dict],data)
    for i, element in enumerate(recipes):
        button = Button(window, text=view_data.iloc[element,1], command=show_descr)
        button.grid(row=5+i)
        result_buttons.append(button)
    
def show_descr():
    """Placeholder callback for the recipe buttons; it does nothing yet."""
    return None

# Build the main window of the recipe search GUI.
window = Tk()
window.title('Recepten zoeker')
window.configure(background='white')

# Header image. The file is opened once and reused; it used to be opened twice,
# with the first handle never used.
image = Image.open("bestek.png")
# Keep a module-level reference to the PhotoImage: tkinter does not hold one
# itself, and the picture disappears if the object is garbage-collected.
photo1 = ImageTk.PhotoImage(image)
Label(window, image=photo1).grid(row=0,column = 0, columnspan = 4, sticky=W+E+N+S)

Label(window, text="Wat wil je eten vanavond?", font='none 24 bold').grid(row=1,column=0,columnspan = 4,sticky=E+W)

# Query box (left-aligned) and search button (right-aligned) share grid row 3.
textentry = Entry(window, width = 55)
textentry.grid(row=3,column=0,columnspan = 4,sticky=W)

Button(window, text='Zoek recepten', width=13, command=click, bg='red', fg='black', font='none 18 bold').grid(row=3,column=0, columnspan=4,sticky=E)

# Heading for the result list; click() places the result buttons from row 5 on.
Label(window, text='Gevonden recepten',font='none 18 bold').grid(row=4,column=0,sticky=E+W)

# Blocks until the window is closed.
window.mainloop()

id                                                               549844
title                                        ['ijskoffie', 'chocorasp']
description           ['recept', '.', 'vegetarisch', 'nagerecht', 'b...
course                                                        nagerecht
recipe_yield                                                          4
ingredients                   ['sterke koffie', 'pure chocolad', 'ijs']
calories                                                            105
protein                                                               2
carbohydrates                                                        12
fat                                                                   6
saturated_fat                                                       NaN
sodium                                                              NaN
fiber                                                               NaN
cooking_time                                                    