# Import the Allerhande dataset and explore the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import re
import csv
import pickle

In [2]:
# Read allerhande_raw.csv (path is relative to the notebook's working directory)
# into a DataFrame and display its first rows.
data = pd.read_csv('allerhande_raw.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,description,course,recipe_yield,ingredients,calories,protein,carbohydrates,...,saturated_fat,sodium,fiber,cooking_time,rating,review_count,recipe_instruction,source,tags,appliances
0,0,861106,Vegetarische bonenstoof,"Stoofgerecht met vegetarische balletjes, aarda...",hoofdgerecht,4 personen,"{'olijfolie': '2 el()', 'tomatenblokjes': '800...",410kcal,21g,52g,...,2g,1.040mg,16g,15 min. bereiden,3.0,483.0,Verwarm de oven voor op 200 ºC.,Allerhande magazine 2012 nr. 11 - Alles uit de...,"['vegetarisch', 'slank', 'oven']",['ovenschaal (20 x 30 cm)']
1,1,680104,Frisse limoenroom,Een lekker recept. Het nagerecht bevat de volg...,nagerecht,4 personen,"{'limoenen': ' 2(schoongeboend)', 'slagroom': ...",290kcal,8g,14g,...,,,,10 min. bereiden,3.0,161.0,,Allerhande magazine 2010 nr. 12 - Kerst nieuw-...,['kerst'],['hoge glazen']
2,2,683858,Kalkoenfilet met champignonroomsaus,Kidsproof winters recept voor pasta met kalkoe...,hoofdgerecht,4 personen,"{'olijfolie': '2 el()', 'gedroogde rozemarijn'...",585kcal,41g,58g,...,0g,,,25 min. bereiden,3.0,112.0,Kook de spaghetti volgens de aanwijzingen op d...,Receptkaart 2010 week 50-51,"['italiaans', 'wat eten we vandaag', 'bakken']",[]
3,3,1187074,Andijvie met rauwe ham en prei,"bijgerecht met andijvie, prei, knoflook, tijm ...",bijgerecht,4 personen,"{'olijfolie': '2 el()', 'prei': '2 ()', 'knofl...",175kcal,9g,7g,...,2g,475mg,6g,15 min. bereiden,3.0,8.0,Was de preien en snijd ze in ringen. Snijd de ...,Allerhande magazine 2016 nr. 05 - Eet de lente,"['glutenvrij', 'lactosevrij', 'hollands']",[]
4,4,660152,Spruiten-rozijnensalade,Een lekker hollands recept. Het vegetarische b...,bijgerecht,4 personen,"{'sladressing bieslook': '6 el(fles 270 ml)', ...",115kcal,1g,19g,...,0g,,,15 min. bereiden,3.0,58.0,Verwijder de buitenste bladeren van de spruite...,Allerhande magazine 2010 nr. 09 - Groente volo...,"['vegetarisch', 'hollands', 'koken']",[]


In [3]:
# Print the column names, non-null counts and dtypes of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20123 entries, 0 to 20122
Data columns (total 21 columns):
Unnamed: 0            20123 non-null int64
id                    20123 non-null int64
title                 20123 non-null object
description           20123 non-null object
course                20067 non-null object
recipe_yield          20122 non-null object
ingredients           20123 non-null object
calories              19864 non-null object
protein               19738 non-null object
carbohydrates         19762 non-null object
fat                   19609 non-null object
saturated_fat         12473 non-null object
sodium                7879 non-null object
fiber                 7674 non-null object
cooking_time          20121 non-null object
rating                20122 non-null float64
review_count          20122 non-null float64
recipe_instruction    13127 non-null object
source                19424 non-null object
tags                  20123 non-null object
appliances   

# Preprocessing

In this section of the notebook we will perform the following actions to preprocess our data:


*   Remove duplicate recipes from our dataset by deduplicating the dataframe based on the value in the column *id*
*   Remove the columns *Unnamed: 0* and *source*: we do not use these values and identify recipes by the pandas index of the dataframe instead
* Convert strings in the tags column to lists of tags
*   From the string of ingredients and their quantities in the column *ingredients*, transform only the ingredients into a list
*   Extract the main ingredient of a recipe from the *title* of the recipe
*   Extract the recipe type from the *title* of the recipe
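*   Translate the English recipe titles into Dutch so that the entire dataset will be in Dutch (**TODO**: not implemented yet; the googletrans experiment further down is commented out)
*   Stem the string values in the columns *title*, *description* and *ingredients*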
*   Remove non-informative words from the columns *title* and *description*
*   Convert the string values into floats for the columns *recipe_yield*, *calories*, *protein*, *carbohydrates*, *fat*, *saturated_fat*, *sodium*, *fiber*, and *cooking_time*
* Create inverted document lists that map each ingredient, tag, recipe type and title token to the recipes it appears in



In [4]:
# Remove duplicate recipes (rows sharing the same 'id'), keeping the first occurrence.
# The index is renumbered afterwards so that index labels equal row positions:
# create_inverted_document_list (further down) stores row positions as recipe ids,
# and without the reset those positions would not match the gapped index labels.
data = data.drop_duplicates(subset='id').reset_index(drop=True)

In [5]:
# Drop the 'Unnamed: 0' and 'source' columns in a single call
data.drop(columns=["Unnamed: 0", "source"], inplace=True)

In [6]:
# Convert strings in the tags column to lists of tags
def tags_to_list(row):
    """
    Given a row whose 'tags' value is the string form of a Python list
    (e.g. "['vegetarisch', 'slank']"), returns the tags as a list of strings.

    Tags are returned without surrounding whitespace and empty tags are dropped,
    so "[]" yields an empty list. A value that already is a list/tuple is returned
    as a list, which makes re-running the cell harmless.
    """
    raw_tags = row['tags']

    # Already converted (e.g. the cell was run twice): nothing to parse
    if isinstance(raw_tags, (list, tuple)):
        return list(raw_tags)

    # Preferred path: the string is a valid Python list literal
    try:
        parsed = ast.literal_eval(raw_tags)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        cleaned_tags = (str(tag).strip() for tag in parsed)
        return [tag for tag in cleaned_tags if tag]

    # Fallback for strings that are not valid literals: strip the list syntax by hand
    string_clean = raw_tags.replace('[', '').replace(']', '').replace('\'', '')
    cleaned_tags = (tag.strip() for tag in string_clean.split(','))
    return [tag for tag in cleaned_tags if tag]

# Overwrite the 'tags' column with the result of tags_to_list for every row
data['tags'] = data.apply(tags_to_list, axis=1)

In [7]:
# Define function to get the ingredients from a given row in the dataframe
def get_ingredients(row):
    """
    Parses the dictionary-formatted string in row["ingredients"]
    (ingredient -> quantity) and returns only the ingredient names as a list.
    """
    quantity_by_ingredient = ast.literal_eval(row["ingredients"])
    return [*quantity_by_ingredient.keys()]
  
# Overwrite the 'ingredients' column: each dictionary-formatted string becomes
# the list of ingredient names returned by get_ingredients
data["ingredients"] = data.apply(get_ingredients, axis=1)

In [8]:
# Define function to extract the main ingredient from the title
def get_main_ingredient_from_title(row):
    """
    Given a row with the recipe's title and its list of ingredients, returns the
    main ingredient of that recipe.

    The last word of the cleaned-up title is taken as the main word. If a word of
    one of the entries in row["ingredients"] occurs inside that main word, the
    matching ingredient is returned; otherwise the lowercased main word itself.
    """

    # Extract the main word of the title
    title = row["title"]

    # For salads and snack boards, look at what follows "met" instead of at 'Salade' itself
    candidate = re.sub(r'(Salade|Borrelplank)\s(met)(.*)', r'\3', title)

    # Cut off a trailing phrase that starts with a connecting word. The \b ensures only
    # whole words match, so "vanillepudding" or "voorjaarssalade" are not truncated
    # by 'van' / 'voor'.
    candidate = re.sub(r'\s(met|pret|voor|XL|in|van)\b.*', '', candidate)

    # Keep what follows the first hyphen, e.g. "Spruiten-rozijnensalade" -> "rozijnensalade"
    candidate = re.sub(r'(.*?)-(.*)', r'\2', candidate)

    # Remove surrounding whitespace and apostrophes (e.g. "gamba's" -> "gambas")
    candidate = candidate.strip().replace('\'', '')

    # If several words are left, take the last one
    last_word = re.findall(r'(\w+)$', candidate)
    main = ''.join(last_word).lower()

    # Check if the main word corresponds with an ingredient in the ingredients list
    for ingredient in row["ingredients"]:
        for word in ingredient.split():
            if word in main:
                return ingredient

    # Return main word if no corresponding ingredient is found
    return main
  
# Store the result of get_main_ingredient_from_title for every row
# in a new 'main_ingredient' column
data["main_ingredient"] = data.apply(get_main_ingredient_from_title, axis=1)

In [32]:
# Define function to extract the recipe type from the title

# Keyword -> recipe type lookup used by get_recipe_type.
# The ORDER of the entries matters: keywords are tried from top to bottom as substrings
# and the first hit wins, so specific keywords must stay above the generic ones
# they contain (e.g. 'kalkoenfilet' above 'kalkoen', 'broodje' above 'brood').
# Some keys look like misspellings (e.g. 'stampot', 'quesedilla', 'sobert', 'museli');
# they are kept as-is because they may be matching misspelled titles in the data.
# The table is built once at module level instead of once per row.
_RECIPE_TYPE_BY_KEYWORD = {
    'salade': 'salade',
    'penne': 'pasta',
    'pasta': 'pasta',
    'stoof': 'stoofpot',
    'kalkoenfilet': 'vlees',
    'couscous': 'couscous',
    'curry': 'curry',
    'mousse': 'mousse',
    'gnocchi': 'pasta',
    'spinazietaart': 'hartige taart',
    'carpaccio': 'carpaccio',
    'soep': 'soep',
    'kipfilet': 'vlees',
    'zalmfilet': 'vis',
    'tilapia': 'vis',
    'schotel': 'ovenschotel',
    'gratin': 'ovenschotel',
    'pizza': 'pizza',
    'lamslende': 'vlees',
    'lamsrack': 'vlees',
    'tosti': 'tosti',
    'cake': 'taart',
    'spaghetti': 'pasta',
    'burger': 'burger',
    'hutspot': 'hutspot',
    'stampot': 'stamppot',
    'stamppot': 'stamppot',
    'wrap': 'wraps',
    'tortilla': 'wraps',
    'tagliatelle': 'pasta',
    'biefstuk': 'vlees',
    'tortellini': 'pasta',
    'steak': 'vlees',
    'lamskotelet': 'vlees',
    'gazpacho': 'soep',
    'fondue': 'fondue',
    'hartige taart': 'hartige taart',
    'muffin': 'muffin',
    'sandwich': 'sandwich',
    'smoothie': 'smoothie',
    'ceviche': 'vis',
    # 'pudding' was listed three times; the last value ('gebak') was the one in effect
    'pudding': 'gebak',
    'broodje': 'sandwich',
    'brood': 'brood',
    'dip': 'saus',
    'vegetarisch bijgerecht': 'groente',
    'vegetarische bijgerecht': 'groente',
    'omelet': 'omelet',
    'sate': 'vlees',
    'saté': 'vlees',
    'vlees': 'vlees',
    'plaattaart': 'hartige taart',
    'risotto': 'risotto',
    'rigatoni': 'pasta',
    'quesedilla': 'wraps',
    'chili': 'chili',
    'quiche': 'hartige taart',
    'côte de boeuf': 'vlees',
    'varkensfilet': 'vlees',
    'lasagne': 'lasagne',
    'karbonade': 'vlees',
    'parelhoenfilet': 'vlees',
    'monchoutaart': 'taart',
    'bruschetta': 'brood',
    'mueslirepen': 'mueslireep',
    'lekkerbekjes': 'vis',
    'toast': 'brood',
    'mosselen': 'zeevruchten',
    'soufflé': 'soufflé',
    'runderballetjes': 'vlees',
    'kruidkoek': 'koek',
    'baguette': 'brood',
    'visbakkie': 'vis',
    'drumsticks': 'vlees',
    'rollade': 'vlees',
    'visfilet': 'vis',
    'surf en turf': 'vis',
    'gehaktbal': 'vlees',
    'braadworst': 'vlees',
    'petitfour': 'gebak',
    'kabeljauw': 'vis',
    'zuurkool': 'stamppot',
    'lentestamp': 'stamppot',
    'tortelloni': 'pasta',
    'balinese kip': 'vlees',
    'speklap': 'vlees',
    'oranjekoek': 'koek',
    'nasi goreng': 'vlees',
    'wortel-kaastaart': 'hartige taart',
    'roergebakken': 'roerbak',
    'crostini': 'brood',
    'quesadilla': 'wraps',
    'milkshake': 'milkshake',
    'wok': 'wok',
    'taco': 'wraps',
    'pecankoek': 'koek',
    'tiramisu': 'gebak',
    'baklava': 'gebak',
    'wentelteefje': 'brood',
    'zalm': 'vis',
    'kip': 'vlees',
    'kibbeling': 'vis',
    'zeewolf': 'zeevruchten',
    'ciabatta': 'brood',
    'worstjes': 'vlees',
    'meloenkrullen': 'fruit',
    'garnalen': 'zeevruchten',
    'oester': 'zeevruchten',
    'tonijn': 'vis',
    'mie': 'wok',
    'appeltaart': 'taart',
    'croissant': 'brood',
    'ravioli': 'pasta',
    'makreeltaart': 'hartige taart',
    'ijstaart': 'taart',
    'worteltaart': 'taart',
    'eend': 'vlees',
    'calzone': 'pasta',
    'panna cotta': 'gebak',
    'mangotaart': 'taart',
    'vanilletaart': 'taart',
    'sobert': 'ijs',
    'walnut pie': 'taart',
    'coq au vin': 'vlees',
    'koolvis': 'vis',
    'kaastaart': 'hartige taart',
    'ribeye': 'vlees',
    'pannenkoek': 'pannenkoek',
    'scones': 'gebak',
    'notentaart': 'taart',
    'crumble': 'gebak',
    'macaroni': 'pasta',
    'brownie': 'gebak',
    'aardappeltaart': 'hartige taart',
    'mihoen': 'wok',
    'gamba': 'zeevruchten',
    'kalkoen': 'vlees',
    'burrito': 'wraps',
    'vispakketje': 'vis',
    'frites': 'patat',
    'flensje': 'pannenkoek',
    'zeevrucht': 'zeevruchten',
    'chocoladetaart': 'taart',
    'panini': 'brood',
    'boeuf': 'vlees',
    'garnaal': 'zeevruchten',
    'aspergetaart': 'hartige taart',
    'cannelloni': 'pasta',
    'farfalle': 'pasta',
    'wafel': 'wafel',
    'scholfilet': 'vis',
    'rivierkreeft': 'vis',
    'brulée': 'gebak',
    'desserttaart': 'taart',
    'vla': 'vla',
    'kalf': 'vlees',
    'waterijs': 'ijs',
    'meatball': 'vlees',
    'fusilli': 'pasta',
    'bloemkooltaart': 'hartige taart',
    'varkenshaas': 'vlees',
    'gadogado': 'groente',
    'croque madame': 'brood',
    'tartaar': 'vlees',
    'kwarktaart': 'taart',
    'crèmetaart': 'taart',
    'schelvis': 'vis',
    'gender reveal-taart': 'taart',
    'filetlap': 'vlees',
    'roerbak': 'roerbak',
    'gehaktspies': 'vlees',
    'kebab': 'vlees',
    'focaccia': 'brood',
    'eclair': 'gebak',
    'sinterklaastaart': 'taart',
    'visspies': 'vis',
    'speculaas': 'gebak',
    'stracciatellataart': 'taart',
    'andijviestamp': 'stamppot',
    'bladerdeegtaart': 'taart',
    'cocktail': 'cocktail',
    'boterham': 'brood',
    'wortel met': 'groente',
    'vissticks': 'vis',
    'gado-gado': 'groente',
    'schol': 'vis',
    'perziken met': 'fruit',
    'kokoskoek': 'koek',
    'martini': 'cocktail',
    'preitaart': 'hartige taart',
    'soufflee': 'soufflé',
    'gevulde aubergines': 'groente',
    'sorbet': 'ijs',
    'sangria': 'cocktail',
    'chocolademelk': 'chocolademelk',
    'bosvruchtentaart': 'taart',
    'piña colada': 'cocktail',
    'makreel': 'vis',
    'beignets': 'gebak',
    'meringue': 'gebak',
    'forel': 'vis',
    'perentaart': 'taart',
    'vlaai': 'taart',
    'taart': 'taart',
    'ossenhaas': 'vlees',
    'rund': 'vlees',
    'tarte tatin': 'taart',
    'wellington': 'vlees',
    'zandkoek': 'koek',
    'ijs met': 'ijs',
    'bladerdeeghapje': 'gebak',
    'trifle': 'gebak',
    'chocoladekoek': 'koek',
    'brioche': 'brood',
    'banketstaaf': 'gebak',
    'hot chocolate': 'chocolademelk',
    'pepernoten': 'snoep',
    'shake': 'milkshake',
    'haring': 'vis',
    'ham': 'vlees',
    'zeetong': 'vis',
    'cappuccino': 'koffie',
    'rosbief': 'vlees',
    'spare ribs': 'vlees',
    'truffels': 'chocolade',
    'espresso': 'koffie',
    'coffee': 'koffie',
    'kerstkoek': 'koek',
    'zeebaars': 'vis',
    'meatloaf': 'vlees',
    'pie': 'taart',
    'roodbaars': 'vis',
    'tompouce': 'gebak',
    'crema catalana': 'gebak',
    'bloody mary': 'cocktail',
    'churros': 'gebak',
    'frozen bananas': 'fruit',
    'gemberbier': 'drank',
    'gegrilde maiskolven': 'groente',
    'tomaten': 'groente',
    'witlofhapje': 'groente',
    'postelein': 'groente',
    'nachos': 'nachos',
    'spinazie': 'groente',
    'asperges': 'groente',
    'smoorballetjes': 'vlees',
    'puddinkje': 'gebak',
    'spareribs': 'vlees',
    'crêpes': 'pannenkoek',
    'ijs': 'ijs',
    'paella': 'paella',
    'koek': 'koek',
    'taaitaai': 'koek',
    'entrecote': 'vlees',
    'coupe': 'ijs',
    'tompouzen': 'gebak',
    'witlof': 'groente',
    'crème brûlée': 'gebak',
    'kletskop': 'koek',
    'parelhoen': 'vlees',
    'appel': 'fruit',
    'peer': 'fruit',
    'varkensschnitzel': 'vlees',
    'appelflap': 'gebak',
    'aardappel': 'aardappel',
    'pompoen': 'groente',
    'peen': 'groente',
    'krieltjes': 'aardappel',
    'knakworst': 'vlees',
    'oliebol': 'gebak',
    'kreeft': 'vis',
    'paprika': 'groente',
    'kwark': 'kwark',
    'pruim': 'fruit',
    'mojito': 'cocktail',
    'ei': 'ei',
    'worst': 'vlees',
    'glühwein': 'cocktail',
    'tofu': 'groente',
    'courgette': 'groente',
    'wienerschnitzel': 'vlees',
    'peren': 'fruit',
    'fruit': 'fruit',
    'lollies': 'snoep',
    'marshmallow': 'snoep',
    'vis': 'vis',
    'pindakaas': 'broodbeleg',
    'nutella': 'broodbeleg',
    'boerenkool': 'groente',
    'aubergine': 'groente',
    'artisjok': 'groente',
    'egg': 'ei',
    'limonade': 'cocktail',
    'erwt': 'groente',
    'sperziebonen': 'groente',
    'calamaris': 'zeevruchten',
    'bonen': 'groente',
    'gebak': 'gebak',
    'mango': 'fruit',
    'snijbonen': 'groente',
    'spruit': 'groente',
    'pavlova': 'gebak',
    'soesje': 'gebak',
    'yoghurt': 'yoghurt',
    'bosvrucht': 'fruit',
    'tomaat': 'groente',
    'bloemkool': 'groente',
    'broccoli': 'groente',
    'dame blanche': 'ijs',
    'wodka': 'cocktail',
    'biet': 'groente',
    'wortel': 'groente',
    'donut': 'gebak',
    'groente': 'groente',
    'paddenstoel': 'groente',
    'venkel': 'groente',
    'frambozen': 'fruit',
    'poffertje': 'pannenkoek',
    'kool': 'groente',
    'olie': 'olie',
    'fazant': 'vlees',
    'chorizo': 'vlees',
    'ananas': 'fruit',
    'bananen': 'fruit',
    'limoncello': 'cocktail',
    'haricots': 'groente',
    'watermeloen': 'fruit',
    'zeeuwse bolus': 'gebak',
    'macarons': 'gebak',
    'hachee': 'vlees',
    'colada': 'cocktail',
    'pretzel': 'gebak',
    'meloen': 'fruit',
    'latte': 'koffie',
    'nectarine': 'fruit',
    'chicken': 'vlees',
    'cranberry': 'fruit',
    'limoen': 'fruit',
    'kruidnoot': 'snoep',
    'chimichurri': 'groente',
    'chocola': 'snoep',
    'vijg': 'fruit',
    'halloumi': 'kaas',
    'abrikozen': 'fruit',
    'vruchten': 'fruit',
    'camembert': 'kaas',
    'radicchio': 'groente',
    'bonbon': 'gebak',
    'karamel': 'snoep',
    'hangop': 'kwark',
    'marinade': 'saus',
    'saus': 'saus',
    'selderij': 'groente',
    'bes': 'fruit',
    'tapenade': 'broodbeleg',
    'lolly': 'snoep',
    'frikandel': 'vlees',
    'babi pangang': 'vlees',
    'stol': 'gebak',
    'noedels': 'roerbak',
    'boontjes': 'groente',
    'dessert': 'gebak',
    'boter': 'boter',
    'polpette': 'groente',
    'vega': 'groente',
    'mayonaise': 'saus',
    'reep': 'koek',
    'salsa': 'saus',
    'friet': 'aardappel',
    'jan in de zak': 'gebak',
    'nonnevot': 'gebak',
    'polenta': 'pap',
    'abrikoos': 'fruit',
    'pap': 'pap',
    'komkommer': 'groente',
    'ketchup': 'saus',
    'kaas': 'kaas',
    'bami': 'vlees',
    'bellini': 'gebak',
    'loempia': 'groente',
    'advocaat': 'cocktail',
    'moccaccino': 'koffie',
    'boregi': 'gebak',
    'filo': 'gebak',
    'borstplaat': 'vlees',
    'granola': 'granola',
    'museli': 'granola',
    'druiven': 'fruit',
    'room': 'room',
    'granita': 'ijs',
    'haas': 'vlees',
    'melk': 'melk',
    'koffie': 'koffie',
    'saliedriehoek': 'gebak',
    'popcorn': 'snoep',
    'tempeh': 'groente',
    'hummus': 'broodbeleg',
    'nasi rames': 'vlees',
    'bavarois': 'gebak',
    'bluf': 'gebak',
    'legumes': 'groente',
    'glazuur': 'glazuur',
    'bladerdeeg': 'hartige taart',
    'kwartel': 'vlees',
    'maanzaadbolletjes': 'brood',
    'andijvie': 'groente',
    'piccalilly': 'saus',
    'pastinaak': 'groente',
    'sukadelapje': 'vlees',
    'ribollita': 'soep',
    'heksenvinger': 'gebak',
    'kerstkrans': 'gebak',
    'flottantes': 'gebak',
    'heek': 'vis',
    'frutti': 'fruit',
    'frisdrank': 'cocktail',
    'groppino': 'cocktail',
    'amandelen': 'snoep',
    'schuimspookje': 'snoep',
    'citroen': 'fruit',
}


def get_recipe_type(row):
    """
    Extracts the type of the recipe from the title of the recipe, falling back
    to the description when the title contains no known keyword.

    Expects row["title"] and row["description"] to be strings. The keywords of
    _RECIPE_TYPE_BY_KEYWORD are tried in order as case-insensitive substrings and
    the type of the first matching keyword is returned.

    The type is returned as a list of its whitespace-separated words so that the
    inverted document list can be computed from it; note that a multi-word type
    such as 'hartige taart' therefore becomes ['hartige', 'taart'].
    Returns ["Type not found"] when neither title nor description matches.
    """
    # Check title for recipe type
    title = row["title"].lower()
    for keyword, recipe_type in _RECIPE_TYPE_BY_KEYWORD.items():
        if keyword in title:
            return recipe_type.split()

    # Check description for recipe type
    description = row["description"].lower()
    for keyword, recipe_type in _RECIPE_TYPE_BY_KEYWORD.items():
        if keyword in description:
            return recipe_type.split()

    # If no type is found, return that no type is found
    return ["Type not found"]

# Get the recipe type from the title (or, failing that, the description)
# and store it in a new 'type' column
data["type"] = data.apply(get_recipe_type, axis=1)

In [34]:
# List the recipes for which no type could be determined, then report the total
untyped = data[data["type"].map(lambda types: "Type not found" in types)]
for index, title in untyped["title"].items():
    print(index, ': ', title)

print("Total not found: " + str(len(untyped)) + " of " + str(data.shape[0]))

Total not found: 0 of 16691


In [None]:
# Split the title and description strings into lists of word/punctuation tokens
from nltk.tokenize import wordpunct_tokenize

for text_column in ("title", "description"):
    data[text_column] = data[text_column].apply(wordpunct_tokenize)

In [None]:
# Lowercase every token in the title and description token lists
for token_column in ('title', 'description'):
    data[token_column] = data[token_column].apply(
        lambda tokens: [token.lower() for token in tokens]
    )

In [None]:
# Stem the tokens of the title and description columns with the Dutch Snowball
# stemmer (the columns are overwritten with the stemmed token lists)
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("dutch")
for token_column in ('title', 'description'):
    data[token_column] = data[token_column].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )

In [None]:
# Remove non-informative words from title and description
def remove_words(words):
    """
    Returns a new list with the tokens of `words` that are not in the
    list of non-informative words; the order of the remaining tokens is kept.
    """
    non_informative = ('de', 'met', 'en', 'een', 'het', 'voor', 'vor', 'lekker')
    kept = []
    for token in words:
        if token in non_informative:
            continue
        kept.append(token)
    return kept
      
# Filter the non-informative words out of the title and description token lists
for token_column in ('title', 'description'):
    data[token_column] = data[token_column].apply(remove_words)

In [None]:
# Stem every ingredient string (each list entry is passed to the stemmer as a whole)
data['ingredients'] = data['ingredients'].map(
    lambda names: [stemmer.stem(name) for name in names]
)
data.head()

In [None]:
# !pip install googletrans

In [None]:
# from googletrans import Translator
# translator = Translator()
# # translator.translate('안녕하세요.', dest='ja')
# translations = translator.translate(['potatoes', 'strawberry', 'dressing'], dest='nl')
# for translation in translations:
#   print(translation.origin, ' -> ', translation.text)

In [None]:
# Define function to convert recipe yield to float numbers
def recipe_yield_numerical(dataset, column_name):
    """
    Converts the values of dataset[column_name] (e.g. "4 personen") to floats.

    For every item the first number found in its string form is used, so
    "4 personen" -> 4.0 and a range such as "4-6 personen" -> 4.0. A decimal
    comma is read as a decimal point. Items without any number (including
    missing values) become NaN, so the returned list always has one entry
    per item.

    Returns a list of floats in the order of the column.
    """
    numerical_col = []
    for item in dataset[column_name]:
        first_number = re.search(r'\d+(?:[.,]\d+)?', str(item))
        if first_number is None:
            numerical_col.append(float('nan'))
        else:
            numerical_col.append(float(first_number.group().replace(',', '.')))
    return numerical_col

In [None]:
# Define function to convert weights to grams as float numbers
def weights_to_numerical(dataset, column_name):
    """
    Converts the string values of dataset[column_name] (e.g. "1.040mg", "0,5g",
    "410kcal") to floats; the unit is dropped, not converted.

    Periods are treated as thousands separators and removed, commas as decimal
    separators. Items that are not strings (e.g. missing values) are passed
    through unchanged. Returns a list in the order of the column.
    """
    converted = []
    for raw in dataset[column_name]:
        if type(raw) is not str:
            # Not a string: keep the value as it is
            converted.append(raw)
            continue

        # Drop the thousands separators, keep everything in front of the first letter
        without_thousands = raw.replace('.', '')
        number_text = re.split(r'[a-zA-Z]', without_thousands)[0]

        # A comma indicates decimals
        converted.append(float(number_text.replace(',', '.')))
    return converted

In [None]:
# Define function to convert times to minutes as float numbers
def time_to_numerical(data, column_name):
    """
    Converts the time strings of data[column_name] (e.g. "15 min. bereiden",
    "1 uur 30 min. bereiden") to a number of minutes as floats.

    Every "<number> <unit>" pair in the string is converted and the results are
    summed: a unit starting with 'u' (uur) counts as hours, any other unit as
    minutes. Strings without such a pair become NaN and items that are not
    strings (e.g. missing values) are passed through unchanged, so the returned
    list always has exactly one entry per item.
    """
    numerical_times = []
    for item in data[column_name]:
        if type(item) is not str:
            numerical_times.append(item)
            continue

        # Each pair is (amount, unit), e.g. ('1', 'uur') or ('30', 'min')
        pairs = re.findall(r'(\d+(?:[.,]\d+)?)\s*([^\W\d_]+)', item)
        if not pairs:
            numerical_times.append(float('nan'))
            continue

        minutes = 0.0
        for amount, unit in pairs:
            value = float(amount.replace(',', '.'))
            if unit.lower().startswith('u'):
                value *= 60
            minutes += value
        numerical_times.append(minutes)

    return numerical_times

In [None]:
# Rating and review_count to floats
for count_column in ('rating', 'review_count'):
    data[count_column] = data[count_column].astype(float)

# Convert the numeric values in the remaining columns to float numbers
data['recipe_yield'] = recipe_yield_numerical(data, 'recipe_yield')
for nutrient_column in ('calories', 'protein', 'carbohydrates', 'fat',
                        'saturated_fat', 'sodium', 'fiber'):
    data[nutrient_column] = weights_to_numerical(data, nutrient_column)
data['cooking_time'] = time_to_numerical(data, 'cooking_time')

In [None]:
# Define function to create an inverted document list for for a column and its corresponding recipe IDs

def create_inverted_document_list(dataframe, column_name):
    """
    Builds an inverted document list for a column whose cells are lists of values.

    Returns a dictionary that maps every distinct value occurring in
    dataframe[column_name] to the list of recipes containing it. Recipes are
    identified by their row position (0, 1, 2, ...) in the column, not by the
    dataframe index labels or the 'id' column. Positions are listed in ascending
    order and a recipe is listed once per value, even if the value is repeated
    within that recipe.
    """
    inverted_list = {}

    # Single pass over the column instead of rescanning all recipes per unique value
    for recipe_id, recipe in enumerate(dataframe[column_name]):
        # dict.fromkeys removes repeated values within one recipe while keeping their order
        for value in dict.fromkeys(recipe):
            inverted_list.setdefault(value, []).append(recipe_id)

    return inverted_list

# Create inverted document lists for recipe ingredients, tags, types, and title tokens
(
    inverted_list_ingredients,
    inverted_list_tags,
    inverted_list_types,
    inverted_list_title,
) = (
    create_inverted_document_list(data, column)
    for column in ('ingredients', 'tags', 'type', 'title')
)

# For report: using dict instead of list because it is faster for lookup: https://stackoverflow.com/questions/513882/python-list-vs-dict-for-look-up-table

In [None]:
# Show the first rows of the frame after the conversions above
data.head()

In [None]:
# Print the column names, non-null counts and dtypes after the conversions above
data.info()

# Save preprocessed dataset and inverted document lists

In [None]:
def dict_to_csv(output_document, dictionary):
    """
    Writes a dictionary to the CSV file "<output_document>.csv", one row per
    entry with the key in the first column and the value in the second.

    The file is opened with newline="" as the csv module requires (this avoids
    blank lines between rows on Windows), written as UTF-8 so that accented
    keys do not depend on the platform's default encoding, and is always closed
    when writing finishes.
    """
    with open(output_document + ".csv", "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(dictionary.items())

In [None]:
# Save allerhande preprocessed dataset to CSV
data.to_csv(r'allerhande_preprocessed.csv')

# Save inverted document lists to CSV, one file per list
lists_by_file_name = {
    'inverted_list_ingredients': inverted_list_ingredients,
    'inverted_list_tags': inverted_list_tags,
    'inverted_list_types': inverted_list_types,
    'inverted_list_title': inverted_list_title,
}
for file_name, inverted_list in lists_by_file_name.items():
    dict_to_csv(file_name, inverted_list)

In [None]:
# Spot check: show the row(s) whose 'id' equals 1088596
data.loc[data['id'] == 1088596]