In [43]:
import numpy as np
import pandas as pd
import re
import unicodedata
import time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from itertools import chain

In [44]:
train_set = pd.read_json('whats-cooking/train.json/train.json')
test_set = pd.read_json('whats-cooking/test.json/test.json')
train_set

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [45]:
test_set

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."
...,...,...
9939,30246,"[large egg yolks, fresh lemon juice, sugar, bo..."
9940,36028,"[hot sauce, butter, sweet potatoes, adobo sauc..."
9941,22339,"[black pepper, salt, parmigiano reggiano chees..."
9942,42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [46]:
def findStrangeIngredients(ingredients):
    strangeIngredients = set()
    for ingredientList in ingredients:
        for ingredient in ingredientList:
            if not re.findall(r'^[a-zA-Z ]+$', ingredient):
                strangeIngredients.add(ingredient)
    return strangeIngredients

strangeIngredients = findStrangeIngredients(train_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 455


{'(    oz.) tomato sauce',
 '(   oz.) tomato paste',
 '(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 '(14 oz.) sweetened condensed milk',
 '(14.5 oz.) diced tomatoes',
 '(15 oz.) refried beans',
 '1% low-fat buttermilk',
 '1% low-fat chocolate milk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2 1/2 to 3 lb. chicken, cut into serving pieces',
 '2% low fat cheddar chees',
 '2% low-fat cottage cheese',
 '2% lowfat greek yogurt',
 '2% milk shredded mozzarella cheese',
 '2% reduced-fat milk',
 '25% less sodium chicken broth',
 '33% less sodium cooked deli ham',
 '33% less sodium cooked ham',
 '33% less sodium ham',
 '33% less sodium smoked fully cooked ham',
 '40% less sodium taco seasoning',
 '40% less sodium taco seasoning mix',
 '7 Up',
 '8 ounc ziti pasta, cook and drain',
 '95% lean ground beef',
 'BACARDI® Mixers Margarita Mix',
 'BACARDI® Superior',
 "BREAKSTONE'S Sour Cream",
 'Barilla Oven-Ready Lasagne',
 'Bengali 5 Spice'

In [47]:
def findStrangeCuisines(cuisines):
    strangeCuisines = set()
    for cuisine in cuisines:
        if not re.findall(r'^[a-z ]+$', cuisine):
            strangeCuisines.add(cuisine)
    return strangeCuisines

strangeCuisines = findStrangeCuisines(train_set['cuisine'])
strangeCuisines

{'cajun_creole', 'southern_us'}

In [48]:
def lemmatizeIngredients(ingredients):
    return [" ".join([WordNetLemmatizer().lemmatize(word).lower() for word in ingredient.split()]) for ingredient in ingredients]
    


In [49]:
def convertUnicodeCharacters(ingredients):
    return [''.join(c for c in unicodedata.normalize('NFD', ingredient)
                  if unicodedata.category(c) != 'Mn') for ingredient in ingredients]

In [50]:
def removeAdditionalInfoFromIngredients(ingredients):
    ingredients = [re.sub(r'\(.*oz.*\)( )?|chopped|crushed|diced|frozen|ground|minced|peeled|powder|sliced|whole', '', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r'feta cheese crumbles', 'feta cheese', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r',(.+?)$', '', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r'[^a-z ]', '', ingredient) for ingredient in ingredients]
    ingredients = [re.sub(r' +', ' ', ingredient.strip()) for ingredient in ingredients]
    return ingredients

In [51]:
def transformData(ingredients):
    ingredients = lemmatizeIngredients(ingredients)
    ingredients = convertUnicodeCharacters(ingredients)
    ingredients = removeAdditionalInfoFromIngredients(ingredients)
    return ingredients


In [52]:
train_set['ingredients'] = train_set['ingredients'].apply(transformData)
train_set

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olive, grape tomato, g..."
1,25693,southern_us,"[plain flour, pepper, salt, tomato, black pepp..."
2,20130,filipino,"[egg, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallot, cornflour, cayenne pep..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[kraft zesty italian dressing, purple onion, b..."
39771,2238,irish,"[egg, citrus fruit, raisin, sourdough starter,..."
39772,41882,chinese,"[boneless chicken skinless thigh, garlic, stea..."


In [53]:
strangeIngredients = findStrangeIngredients(train_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 0


set()

In [54]:
strangeIngredients = findStrangeIngredients(test_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 258


{'(    oz.) tomato sauce',
 '(14.5 oz.) diced tomatoes',
 '1% low-fat buttermilk',
 '1% low-fat cottage cheese',
 '1% low-fat milk',
 '2% low-fat cottage cheese',
 '2% lowfat greek yogurt',
 '2% reduced fat chocolate milk',
 '2% reduced-fat milk',
 '33% less sodium cooked deli ham',
 '33% less sodium smoked ham',
 '40% less sodium taco seasoning',
 '50% less sodium black beans',
 '7 Up',
 '8 ounc ziti pasta, cook and drain',
 '95% lean ground beef',
 'Bengali 5 Spice',
 'Bertolli Tomato & Basil Sauce',
 'Bertolli® Alfredo Sauce',
 'Bertolli® Classico Olive Oil',
 'Bisquick Original All-Purpose Baking Mix',
 "Campbell's Condensed Cheddar Cheese Soup",
 "Campbell's Condensed Cream of Chicken Soup",
 'Country Crock® Spread',
 'Diamond Crystal® Kosher Salt',
 'Dutch-processed cocoa powder',
 "Frank's® RedHot® Original Cayenne Pepper Sauce",
 "French's Spicy Brown Mustard",
 'Green Giant™ Steamers™ Niblets® frozen corn',
 'Green Giant™ frozen chopped spinach',
 "Hellmann''s Light Mayonnaise

In [55]:
test_set['ingredients'] = test_set['ingredients'].apply(transformData)
test_set

Unnamed: 0,id,ingredients
0,18009,"[baking, egg, allpurpose flour, raisin, milk, ..."
1,28583,"[sugar, egg yolk, corn starch, cream of tartar..."
2,41580,"[sausage link, fennel bulb, frond, olive oil, ..."
3,29752,"[meat cut, file, smoked sausage, okra, shrimp,..."
4,35687,"[black pepper, salt, sausage casing, leek, par..."
...,...,...
9939,30246,"[large egg yolk, fresh lemon juice, sugar, bou..."
9940,36028,"[hot sauce, butter, sweet potato, adobo sauce,..."
9941,22339,"[black pepper, salt, parmigiano reggiano chees..."
9942,42525,"[cheddar cheese, cayenne, paprika, plum tomato..."


In [56]:
strangeIngredients = findStrangeIngredients(test_set['ingredients'])
print(f'Number of strange ingredients: {len(strangeIngredients)}')
strangeIngredients

Number of strange ingredients: 0


set()

In [57]:
def getDistinctIngredients(ingredients):
    return {x for l in ingredients for x in l}



In [58]:
dist_ingredients = getDistinctIngredients(train_set['ingredients'])
dist_ingredients_test = getDistinctIngredients(test_set['ingredients'])

print(len(dist_ingredients))

for ingredient in dist_ingredients_test:
    if ingredient not in dist_ingredients:
        dist_ingredients.add(ingredient)
        print(ingredient)

print(len(dist_ingredients))

6378
knorr beef flavored bouillon cube
low sodium pasta sauce
chobani yogurt
hazelnut paste
bramley apple
blended whiskey
spring chicken
vegan milk substitute
emerils essence
candied flower
low sodium turkey breast
unsalted creamy peanut butter
cheese crouton
italian moscato
glutenfree pizza crust
red horseradish
elderflower syrup
wheat spiral pasta
ragu old world style smooth pasta sauc
reduced fat reduced sodium condensed cream of chicken soup
passover cake meal
black sticky rice
candied mixed citrus peel
sargento artisan blends shredded milk mozzarella cheese
lowfat chocolate ice cream
large potato
bluefish
whisky
dill pickle spear
citrus rind
chablis
surimi
tabbouleh
butter cooky
spanish tuna
steamed bun flour
nochicken broth
tat soi
banh trang
fresh
pastrami
braised seitan
diet lemon lime soda
taco bell thick chunky medium salsa
special k cereal
goya seasoning
cooked cut green bean
fat free instant chocolate pudding mix
pace salsa
golden cake mix
low sodium vegetable juice cocktai

In [59]:
extra_data = pd.DataFrame(0, index=np.arange(len(train_set)), columns=dist_ingredients)
train_set = train_set.join(extra_data)
train_set

Unnamed: 0,id,cuisine,ingredients,au jus gravy,low sodium pasta sauce,sesame seed,spice,burger roll,boneless chicken cutlet,eggnog,...,orange slice,meat marinade,tbone steak,waffle,black onion seed,pure acai puree,mild salsa,centercut salmon fillet,pickle relish,cubed bread
0,10259,greek,"[romaine lettuce, black olive, grape tomato, g...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25693,southern_us,"[plain flour, pepper, salt, tomato, black pepp...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20130,filipino,"[egg, pepper, salt, mayonaise, cooking oil, gr...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22213,indian,"[water, vegetable oil, wheat, salt]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13162,indian,"[black pepper, shallot, cornflour, cayenne pep...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39770,11462,italian,"[kraft zesty italian dressing, purple onion, b...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39771,2238,irish,"[egg, citrus fruit, raisin, sourdough starter,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39772,41882,chinese,"[boneless chicken skinless thigh, garlic, stea...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
def fillOnes(data):
    for row in range(len(data)):
        for ingredient in data['ingredients'][row]:
            data[ingredient][row] = 1
    return data


train_set = fillOnes(train_set)
train_set

Unnamed: 0,id,cuisine,ingredients,au jus gravy,low sodium pasta sauce,sesame seed,spice,burger roll,boneless chicken cutlet,eggnog,...,orange slice,meat marinade,tbone steak,waffle,black onion seed,pure acai puree,mild salsa,centercut salmon fillet,pickle relish,cubed bread
0,10259,greek,"[romaine lettuce, black olive, grape tomato, g...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25693,southern_us,"[plain flour, pepper, salt, tomato, black pepp...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20130,filipino,"[egg, pepper, salt, mayonaise, cooking oil, gr...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22213,indian,"[water, vegetable oil, wheat, salt]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13162,indian,"[black pepper, shallot, cornflour, cayenne pep...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39770,11462,italian,"[kraft zesty italian dressing, purple onion, b...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39771,2238,irish,"[egg, citrus fruit, raisin, sourdough starter,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39772,41882,chinese,"[boneless chicken skinless thigh, garlic, stea...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
X = train_set.drop(['id', 'cuisine', 'ingredients'],axis='columns')
y = train_set['cuisine']
trainX, valX, trainY, valY = train_test_split(X, y, random_state = 12)

In [68]:
trainX.reset_index(drop=True)
valX.reset_index(drop=True)
trainY.reset_index(drop=True)
valY.reset_index(drop=True)
start = time.time()
model = LogisticRegression()

model = model.fit(trainX, trainY)
print(f'Total time: {time.time()-start}')

Total time: 133.73711276054382


In [69]:
model.score(valX, valY)

0.7791633145615446

In [71]:
extra_data = pd.DataFrame(0, index=np.arange(len(test_set)), columns=dist_ingredients)
test_set = test_set.join(extra_data)
test_set = fillOnes(test_set)
test_set

Unnamed: 0,id,ingredients,au jus gravy,low sodium pasta sauce,sesame seed,spice,burger roll,boneless chicken cutlet,eggnog,toasted slivered almond,...,orange slice,meat marinade,tbone steak,waffle,black onion seed,pure acai puree,mild salsa,centercut salmon fillet,pickle relish,cubed bread
0,18009,"[baking, egg, allpurpose flour, raisin, milk, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28583,"[sugar, egg yolk, corn starch, cream of tartar...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,41580,"[sausage link, fennel bulb, frond, olive oil, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,29752,"[meat cut, file, smoked sausage, okra, shrimp,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35687,"[black pepper, salt, sausage casing, leek, par...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9939,30246,"[large egg yolk, fresh lemon juice, sugar, bou...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9940,36028,"[hot sauce, butter, sweet potato, adobo sauce,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9941,22339,"[black pepper, salt, parmigiano reggiano chees...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9942,42525,"[cheddar cheese, cayenne, paprika, plum tomato...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
X_test = test_set.drop(['id', 'ingredients'],axis='columns')
id_test = test_set['id']

In [74]:
predictions = model.predict(X_test)

In [79]:
id_test = pd.DataFrame(id_test)
predictions = pd.DataFrame(predictions)
pred_to_csv = id_test.join(predictions)

pred_to_csv.rename(columns={0:'cuisine'}).to_csv('kaggle.csv', index=False)