In [787]:
import numpy as np
import pandas as pd
import re
import unicodedata

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from itertools import chain

In [788]:
train_set = pd.read_json('whats-cooking/train.json/train.json')
test_set = pd.read_json('whats-cooking/test.json/test.json')
train_set

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [789]:
test_set

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."
...,...,...
9939,30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
9940,36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
9941,22339,"[black pepper, salt, parmigiano reggiano chees..."
9942,42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [790]:
def findStrangeIngredients(ingredients):
    strangeIngredients = set()
    for ingredientList in ingredients:
        for ingredient in ingredientList:
            if not re.findall(r'^[a-zA-Z ]+$', ingredient):
                strangeIngredients.add(ingredient)
    return strangeIngredients

strangeIngredients = findStrangeIngredients(train_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 455


{'(    oz.) tomato sauce',
 '(   oz.) tomato paste',
 '(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 '(14 oz.) sweetened condensed milk',
 '(14.5 oz.) diced tomatoes',
 '(15 oz.) refried beans',
 '1% low-fat buttermilk',
 '1% low-fat chocolate milk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2 1/2 to 3 lb. chicken, cut into serving pieces',
 '2% low fat cheddar chees',
 '2% low-fat cottage cheese',
 '2% lowfat greek yogurt',
 '2% milk shredded mozzarella cheese',
 '2% reduced-fat milk',
 '25% less sodium chicken broth',
 '33% less sodium cooked deli ham',
 '33% less sodium cooked ham',
 '33% less sodium ham',
 '33% less sodium smoked fully cooked ham',
 '40% less sodium taco seasoning',
 '40% less sodium taco seasoning mix',
 '7 Up',
 '8 ounc ziti pasta, cook and drain',
 '95% lean ground beef',
 'BACARDI® Mixers Margarita Mix',
 'BACARDI® Superior',
 "BREAKSTONE'S Sour Cream",
 'Barilla Oven-Ready Lasagne',
 'Bengali 5 Spice'

In [791]:
def findStrangeCuisines(cuisines):
    strangeCuisines = set()
    for cuisine in cuisines:
        if not re.findall(r'^[a-z ]+$', cuisine):
            strangeCuisines.add(cuisine)
    return strangeCuisines

strangeCuisines = findStrangeCuisines(train_set['cuisine'])
strangeCuisines

{'cajun_creole', 'southern_us'}

In [792]:
def lemmatizeIngredients(ingredients):
    return [" ".join([WordNetLemmatizer().lemmatize(word).lower() for word in ingredient.split()]) for ingredient in ingredients]
    


In [793]:
def convertUnicodeCharacters(ingredients):
    return [''.join(c for c in unicodedata.normalize('NFD', ingredient)
                  if unicodedata.category(c) != 'Mn') for ingredient in ingredients]

In [794]:
def removeAdditionalInfoFromIngredients(ingredients):
    ingredients = [re.sub(r'\(.*oz.*\)( )?|chopped|crushed|diced|frozen|ground|minced|peeled|powder|sliced|whole', '', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r'feta cheese crumbles', 'feta cheese', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r',(.+?)$', '', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r'[^a-z ]', '', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r' +', ' ', ingredient.strip()) for ingredient in ingredients]
    return ingredients

In [795]:
def transformData(ingredients):
    ingredients = lemmatizeIngredients(ingredients)
    ingredients = convertUnicodeCharacters(ingredients)
    ingredients = removeAdditionalInfoFromIngredients(ingredients)
    return ingredients


In [796]:
train_set['ingredients'] = train_set['ingredients'].apply(transformData)
train_set

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olive, grape tomato, g..."
1,25693,southern_us,"[plain flour, pepper, salt, tomato, black pepp..."
2,20130,filipino,"[egg, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallot, cornflour, cayenne pep..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[kraft zesty italian dressing, purple onion, b..."
39771,2238,irish,"[egg, citrus fruit, raisin, sourdough starter,..."
39772,41882,chinese,"[boneless chicken skinless thigh, garlic, stea..."


In [797]:
strangeIngredients = findStrangeIngredients(train_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 0


set()

In [798]:
strangeIngredients = findStrangeIngredients(test_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 258


{'(    oz.) tomato sauce',
 '(14.5 oz.) diced tomatoes',
 '1% low-fat buttermilk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2% low-fat cottage cheese',
 '2% lowfat greek yogurt',
 '2% reduced fat chocolate milk',
 '2% reduced-fat milk',
 '33% less sodium cooked deli ham',
 '33% less sodium smoked ham',
 '40% less sodium taco seasoning',
 '50% less sodium black beans',
 '7 Up',
 '8 ounc ziti pasta, cook and drain',
 '95% lean ground beef',
 'Bengali 5 Spice',
 'Bertolli Tomato & Basil Sauce',
 'Bertolli® Alfredo Sauce',
 'Bertolli® Classico Olive Oil',
 'Bisquick Original All-Purpose Baking Mix',
 "Campbell's Condensed Cheddar Cheese Soup",
 "Campbell's Condensed Cream of Chicken Soup",
 'Country Crock® Spread',
 'Diamond Crystal® Kosher Salt',
 'Dutch-processed cocoa powder',
 "Frank's® RedHot® Original Cayenne Pepper Sauce",
 "French's Spicy Brown Mustard",
 'Green Giant™ Steamers™ Niblets® frozen corn',
 'Green Giant™ frozen chopped spinach',
 "Hellmann''s Light Mayonnaise

In [799]:
test_set['ingredients'] = test_set['ingredients'].apply(transformData)
test_set

Unnamed: 0,id,ingredients
0,18009,"[baking, egg, allpurpose flour, raisin, milk, ..."
1,28583,"[sugar, egg yolk, corn starch, cream of tartar..."
2,41580,"[sausage link, fennel bulb, frond, olive oil, ..."
3,29752,"[meat cut, file, smoked sausage, okra, shrimp,..."
4,35687,"[black pepper, salt, sausage casing, leek, par..."
...,...,...
9939,30246,"[large egg yolk, fresh lemon juice, sugar, bou..."
9940,36028,"[hot sauce, butter, sweet potato, adobo sauce,..."
9941,22339,"[black pepper, salt, parmigiano reggiano chees..."
9942,42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [800]:
strangeIngredients = findStrangeIngredients(test_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 0


set()

In [801]:
def getDistinctIngredients(ingredients):
    return {x for l in ingredients for x in l}



In [802]:
dist_ingredients = getDistinctIngredients(train_set['ingredients'])
dist_ingredients_test = getDistinctIngredients(test_set['ingredients'])

print(len(dist_ingredients))

for ingredient in dist_ingredients_test:
    if ingredient not in dist_ingredients:
        dist_ingredients.add(ingredient)
        print(ingredient)

print(len(dist_ingredients))

6378
flax
pompeian extra virgin olive oil
malbec
licorice
streusel topping
molasses sugar
sargento artisan blends shredded milk mozzarella cheese
low fat small curd cottag chees
elderflower syrup
banana pudding
farfallini
braised seitan
mint jelly
reduced sodium tomato juice
johnsonville italian all natural hot sausage
lowfat wheat cracker
recaito
candied citron
pinipig
pork strip
knox gelatin
kettle chips
stir fry oil
pumpkin butter
herb mix
loose black tea
lemon flavor instant pudding mix
zabaglione
rock candy syrup
taco rub
wheat white flour
butter cake mix
vietnamese spinach
asian herb
gumdrop
special k cereal
chobani yogurt
macadamia
new mexican chile
sandwich cooky
boiler
cepe
lowfat creamy peanut butter
baby octopus
nonfat thousand island dressing
lovage
surimi
brain
thai kitchen red curry paste
quick rolled oat
liquid stevia
splenda granulated
shredded lowfat cheese
white lentil
skinless boneless duck breast half
pumpkin pie filling
dress russian
crumble topping
ortega taco sea

In [803]:
extra_data = pd.DataFrame(0, index=np.arange(len(train_set)), columns=dist_ingredients)
train_set = train_set.join(extra_data)

In [811]:
def fillOnes(row):
    print(row)
    # row[row['ingredients']] = 1
    return row

row = train_set.loc[0]
row.apply(fillOnes)
# train_set = train_set.apply(fillOnes)
# train_set

10259
greek
['romaine lettuce', 'black olive', 'grape tomato', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo bean', 'feta cheese']
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


id                                                                  10259
cuisine                                                             greek
ingredients             [romaine lettuce, black olive, grape tomato, g...
caciocavallo                                                            0
mung bean vermicelli                                                    0
                                              ...                        
condensed soup                                                          0
yellow miso                                                             0
buckwheat groat                                                         0
ginger puree                                                            0
biga                                                                    0
Name: 0, Length: 6795, dtype: object

In [805]:
def wordsPopularity(ingredients):
    words = []
    
    def forEach(ingredientsList):
        words.extend([([WordNetLemmatizer().lemmatize(word) for word in word_tokenize(ingredient)]) for ingredient in ingredientsList])
    # for ingredientsList in ingredients:
    ingredients.apply(forEach)
    # words = []#[(word_tokenize(ingredient) for ingredient in listIngredients) for listIngredients in ingredients]
    # words = [[word for word in word_tokenize(ingredient)] for ingredient in ingredients]
    words = [x for l in words for x in l]
    words = dict((word, words.count(word)) for word in set(words))
    
    return words

# words = wordsPopularity(train_set['ingredients'])

In [806]:
# words2 = sorted(words.items(), key=lambda x: x[1], reverse=True)
# words2[:10]

In [807]:
# dist_ingredients = list(set(chain(*train_set['ingredients']))).sort()
# dist_ingredients
dist_ingredients = list({x for l in train_set['ingredients'] for x in l})
dist_ingredients.sort()
dist_ingredients[:10]
# [i for i in dist_ingredients if 'oz.' in i]
# result.sort()
# result

['a taste of thai rice noodles',
 'abalone',
 'abbamele',
 'absinthe',
 'abura age',
 'acai',
 'acai juice',
 'accent',
 'accent seasoning',
 'accompaniment']

In [808]:
dist_ingredients = list({x for l in train_set['ingredients'] for x in l})
dist_ingredients.sort()
dist_ingredients[:10]

['a taste of thai rice noodles',
 'abalone',
 'abbamele',
 'absinthe',
 'abura age',
 'acai',
 'acai juice',
 'accent',
 'accent seasoning',
 'accompaniment']

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olive, grape tomato, g..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomato, gro..."
2,20130,filipino,"[egg, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallot, cornflour, cayenne pep..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[kraft zesty italian dressing, purple onion, b..."
39771,2238,irish,"[egg, citrus fruit, raisin, sourdough starter,..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [809]:
def fun(ingredients):
    counter = 0
    for ingredientsList in ingredients:
        for ingredient in ingredientsList:
            if 'potato puree' in ingredient:
                counter += 1
                print(ingredient)
    print(counter)

fun(train_set['ingredients'])

0


In [810]:
def fun(cuisines):
    for cuisine in cuisines:
        if 'southern_us' not in cuisine and 'us' in cuisine:
            print(cuisine)

fun(train_set['cuisine'])

russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
russian
