# Prepare (nutrient vector + ingredient vector) for training set

In [1]:
import pandas as pd

train_data = pd.read_csv('./data/train.csv')
print(len(train_data))

4855
856


### Ingredient vector for training set

In [2]:
# parse ingredients

def clean_recipe(ingred_list):
    cleanedtext = []
    ingred_list = ast.literal_eval(ingred_list)
    
    for matchtext in ingred_list:
        
        # Obtain all before first comma
        if re.compile('^(.+?)(?=,)').search(matchtext) is not None:
            matchtext = re.compile('^(.+?)(?=,)').search(matchtext).group(1)
         
        # Tokenize ingredient list
        tokenized = word_tokenize(matchtext)
        
        # Remove words likely to be stop words or measurements
        removed_stop = [w for w in tokenized if not w in measure_corpus]
        removed_stop = [w for w in removed_stop if not w in stop_words]
        
        # Filter adjectives and nouns
        ingred_words = lambda pos: pos[:2] in ['JJ','NN','NNS']
        ingreds = [word.lower() for (word, pos) in pos_tag(removed_stop) if ingred_words(pos)]
        
        # Convert to singular
        ingreds = [p.singular_noun(word) if p.singular_noun(word) else word for word in ingreds]
        
        # remove special characters (including numbers!)怎么去除 ½, ¼, %等
        #cleanedtext = re.sub('[^a-zA-Z]', '', cleanedtext)
        ingreds = [re.sub('[^ a-zA-Z]', '', i) for i in ingreds]
        #print(ingreds)
        
        # Remove common ingredients 
        common = []
        cleanedtext.append(ingreds)
        cleanedtext = [[ing for ing in ingreds if not any(word in common for word in ingreds)] for ingreds in cleanedtext]
        
        # Remove additional descriptors for long ingredient names
        cleanedtext = [ingreds[-2:] if len(ingreds) > 2 else ingreds for ingreds in cleanedtext]
        
    return [(' ').join(item) for item in cleanedtext if len(item)>0]
    #return [(' ').join(item) for item in cleanedtext if len(item)>0 and item!=['']]

In [3]:
# extract ingredients from each recipe

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
import re
import requests
from bs4 import BeautifulSoup
import ast
import inflect

p = inflect.engine()
stop_words = set(stopwords.words('english'))

df = train_data[['ingredients']]

page = requests.get('https://www.enchantedlearning.com/wordlist/measurement.shtml')
soup = BeautifulSoup(page.content, "html.parser")
measure_corpus = [tag.text for tag in soup.find_all('div',attrs={'class':'wordlist-item'})]
measure_corpus = measure_corpus + [text+'s' for text in measure_corpus] + \
            ['taste','strip', 'strips', 'package', 'packages', 'satchet', \
             'satchets', 'sprigs', 'head', 'bunch', 'small', 'large', 'big', 'medium', 'tbsp', 'g']

#clean_recipe(ast.literal_eval(data.astype(str)['ingredients'][0]))
df['ingredients'] = df['ingredients'].apply(clean_recipe)

df = df.rename(columns={"ingredients": "parsed_ingredients"})
train_data = train_data.join(df)
train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ingredients'] = df['ingredients'].apply(clean_recipe)


Unnamed: 0,title,image,ingredients,nutrients,file_name,parsed_ingredients
0,Easy bread rolls,https://images.immediate.co.uk/production/vola...,"['500g strong white bread flour , plus extra f...","{'calories': '246 calories', 'fatContent': '2 ...",bread0.jpg,"[bread flour, action yeast, caster sugar, fine..."
1,Vegan banana bread,https://images.immediate.co.uk/production/vola...,"['3 large black bananas', '75ml vegetable oil ...","{'calories': '218 calories', 'fatContent': '8 ...",bread1.jpg,"[black banana, sunflower oil, brown sugar, pow..."
2,Meatball & garlic bread traybake,https://images.immediate.co.uk/production/vola...,"['350g turkey thigh mince', '1 tsp dried orega...","{'calories': '565 calories', 'fatContent': '28...",bread2.jpg,"[thigh mince, tsp oregano, fennel seed, olive ..."
3,Naan bread,https://images.immediate.co.uk/production/vola...,"['1x 7g sachet dried yeast', '2 tsp golden cas...","{'calories': '224 calories', 'fatContent': '8 ...",bread3.jpg,"[sachet yeast, caster sugar, bread flour, tsp ..."
4,Pitta bread,https://images.immediate.co.uk/production/vola...,"['2 tsp fast-action dried yeast', '500g strong...","{'calories': '246 calories', 'fatContent': '2 ...",bread4.jpg,"[fastaction yeast, bread flour, tsp salt, oliv..."


In [5]:
# count the frequency of each ingredient

dic = {}
top_ingredients = 100

for ingredient in train_data['parsed_ingredients']:
    for item in ingredient:
        try:
            dic[str(item)] += 1
        except:
            dic[str(item)] = 1

dic_sorted = sorted(dic.items(),key=lambda item:item[1], reverse=True)

# find most common top 10 ingredients
top_ingredient = []

for (i,j) in dic_sorted[:top_ingredients]:
    top_ingredient.append(i)

# store top 100 ingredients as csv file 
pd.DataFrame(dic_sorted[:top_ingredients], columns=['ingredient', 'recipe number']).to_csv('./data/top_100_ingre_in_training_set.csv', index=False)    
dic_sorted[:5]

[('olive oil', 1952),
 ('garlic clove', 1427),
 ('egg', 1298),
 ('butter', 1216),
 ('caster sugar', 1104)]

In [6]:
# turn ingredients of each recipe into vector
import numpy as np

ingre_vectors = []

for parsed_ingre in train_data['parsed_ingredients']:
    
    ingres = parsed_ingre
    ingre_vector = top_ingredients * [0]
    
    for ingre in ingres:
        if ingre in top_ingredient:
            ingre_vector[top_ingredient.index(ingre)] = 1
    
    ingre_vectors.append(ingre_vector)
    
train_data['ingredient_vector'] = ingre_vectors
train_data.head()

Unnamed: 0,title,image,ingredients,nutrients,file_name,parsed_ingredients,ingredient_vector
0,Easy bread rolls,https://images.immediate.co.uk/production/vola...,"['500g strong white bread flour , plus extra f...","{'calories': '246 calories', 'fatContent': '2 ...",bread0.jpg,"[bread flour, action yeast, caster sugar, fine...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Vegan banana bread,https://images.immediate.co.uk/production/vola...,"['3 large black bananas', '75ml vegetable oil ...","{'calories': '218 calories', 'fatContent': '8 ...",bread1.jpg,"[black banana, sunflower oil, brown sugar, pow...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Meatball & garlic bread traybake,https://images.immediate.co.uk/production/vola...,"['350g turkey thigh mince', '1 tsp dried orega...","{'calories': '565 calories', 'fatContent': '28...",bread2.jpg,"[thigh mince, tsp oregano, fennel seed, olive ...","[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Naan bread,https://images.immediate.co.uk/production/vola...,"['1x 7g sachet dried yeast', '2 tsp golden cas...","{'calories': '224 calories', 'fatContent': '8 ...",bread3.jpg,"[sachet yeast, caster sugar, bread flour, tsp ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Pitta bread,https://images.immediate.co.uk/production/vola...,"['2 tsp fast-action dried yeast', '500g strong...","{'calories': '246 calories', 'fatContent': '2 ...",bread4.jpg,"[fastaction yeast, bread flour, tsp salt, oliv...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Nutrient vector for training set

{'calories': '737 calories', 
 'fatContent': '42 grams fat', 
 'saturatedFatContent': '10 grams saturated fat', 
 'carbohydrateContent': '49 grams carbohydrates', 
 'sugarContent': '6 grams sugar', 
 'fiberContent': '4 grams fiber', 
 'proteinContent': '39 grams protein', 
 'sodiumContent': '4.1 milligram of sodium'}

In [9]:
# extract nutrient information as target for training

import ast
import re
import numpy as np

nutrient_information = []

for nutrients in train_data['nutrients']:
    # convert string to dictionary
    data_dict = ast.literal_eval(nutrients)
    # data_dict.values()
    # find numerical values of each nutrients
    nutrient = re.findall('[\d+\.\d+]+', str(data_dict.values()))
    nutrient = [float(num) for num in nutrient]
    nutrient_information.append(nutrient)
    
    
train_data['nutrient_vector'] = nutrient_information
train_data.to_csv('./data/train_data_vector.csv', index=False)
print(len(train_data))


# remove those with incomplete nutients

j = 0
drop_index = []

for i in nutrient_information:
    if len(i) < 8:
        drop_index.append(j)
    j+=1


nutrient_information = [i for j, i in enumerate(nutrient_information) if j not in drop_index]
nutrient_information = np.asarray(nutrient_information)
print(nutrient_information.shape)

test = train_data.drop(drop_index)
test.to_csv('./data/train_data_vector_complete.csv', index=False)
#test.reindex(np.arange(len(test))+1)

4855
(4218, 8)


# Prepare (nutrient vector + ingredient vector) for test set

In [10]:
test_data = pd.read_csv('./data/test.csv')
print(len(test_data))

df = test_data[['ingredients']]

#clean_recipe(ast.literal_eval(data.astype(str)['ingredients'][0]))
df['ingredients'] = df['ingredients'].apply(clean_recipe)

df = df.rename(columns={"ingredients": "parsed_ingredients"})
test_data = test_data.join(df)
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ingredients'] = df['ingredients'].apply(clean_recipe)


Unnamed: 0,title,image,ingredients,nutrients,file_name,parsed_ingredients
0,Cheesy chard gratin,https://images.immediate.co.uk/production/vola...,"['bunch chard , about 340g', '150ml double cre...","{'calories': '391 calories', 'fatContent': '36...",bread383.jpg,"[chard, double cream, glutenfree alternative, ..."
1,Buttery chilli prawns,https://images.immediate.co.uk/production/vola...,"['25g butter', '2 tbsp olive oil', '3 garlic c...","{'calories': '237 calories', 'fatContent': '22...",bread384.jpg,"[butter, olive oil, garlic clove, red chilli, ..."
2,Broad beans with tomatoes & anchovies,https://images.immediate.co.uk/production/vola...,['1.3kg young broad beans in the pod (to give ...,"{'calories': '161 calories', 'fatContent': '10...",bread385.jpg,"[shelled bean, olive oil, cherry tomato, sprin..."
3,Sardine & asparagus traybake,https://images.immediate.co.uk/production/vola...,"['large bunch of asparagus', '1 olive & basil ...","{'calories': '700 calories', 'fatContent': '36...",bread386.jpg,"[asparagu, roll choice, olive oil, choose leaf]"
4,"Baked feta with peas, chilli & preserved lemon",https://images.immediate.co.uk/production/vola...,"['250g block of feta', '3 tbsp extra virgin ol...","{'calories': '541 calories', 'fatContent': '43...",bread387.jpg,"[block feta, olive oil, red chilli, lemon, fre..."


In [11]:
# turn ingredients of each recipe into vector

ingre_vectors = []

for parsed_ingre in test_data['parsed_ingredients']:
    
    ingres = parsed_ingre
    ingre_vector = top_ingredients * [0]
    
    for ingre in ingres:
        if ingre in top_ingredient:
            ingre_vector[top_ingredient.index(ingre)] = 1
    
    ingre_vectors.append(ingre_vector)
    
test_data['ingredient_vector'] = ingre_vectors
test_data.head()

Unnamed: 0,title,image,ingredients,nutrients,file_name,parsed_ingredients,ingredient_vector
0,Cheesy chard gratin,https://images.immediate.co.uk/production/vola...,"['bunch chard , about 340g', '150ml double cre...","{'calories': '391 calories', 'fatContent': '36...",bread383.jpg,"[chard, double cream, glutenfree alternative, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
1,Buttery chilli prawns,https://images.immediate.co.uk/production/vola...,"['25g butter', '2 tbsp olive oil', '3 garlic c...","{'calories': '237 calories', 'fatContent': '22...",bread384.jpg,"[butter, olive oil, garlic clove, red chilli, ...","[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Broad beans with tomatoes & anchovies,https://images.immediate.co.uk/production/vola...,['1.3kg young broad beans in the pod (to give ...,"{'calories': '161 calories', 'fatContent': '10...",bread385.jpg,"[shelled bean, olive oil, cherry tomato, sprin...","[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,Sardine & asparagus traybake,https://images.immediate.co.uk/production/vola...,"['large bunch of asparagus', '1 olive & basil ...","{'calories': '700 calories', 'fatContent': '36...",bread386.jpg,"[asparagu, roll choice, olive oil, choose leaf]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"Baked feta with peas, chilli & preserved lemon",https://images.immediate.co.uk/production/vola...,"['250g block of feta', '3 tbsp extra virgin ol...","{'calories': '541 calories', 'fatContent': '43...",bread387.jpg,"[block feta, olive oil, red chilli, lemon, fre...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# extract nutrient information as target for training

nutrient_information = []

for nutrients in test_data['nutrients']:
    # convert string to dictionary
    data_dict = ast.literal_eval(nutrients)
    # data_dict.values()
    # find numerical values of each nutrients
    nutrient = re.findall('[\d+\.\d+]+', str(data_dict.values()))
    nutrient = [float(num) for num in nutrient]
    nutrient_information.append(nutrient)
    
    
test_data['nutrient_vector'] = nutrient_information
test_data.to_csv('./data/test_data_vector.csv', index=False)
print(len(test_data))


# remove those with incomplete nutients

j = 0
drop_index = []

for i in nutrient_information:
    if len(i) < 8:
        drop_index.append(j)
    j+=1


nutrient_information = [i for j, i in enumerate(nutrient_information) if j not in drop_index]
nutrient_information = np.asarray(nutrient_information)
print(nutrient_information.shape)

test = test_data.drop(drop_index)
test.to_csv('./data/test_data_vector_complete.csv', index=False)

856
(696, 8)
