In [1]:
import random
import re
import unicodedata

import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [16]:
# Read the training and test recipes from JSON files in the working directory.
train, test = (pd.read_json(path) for path in ("train.json", "test.json"))

In [17]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [18]:
def clean_ingredients(recipe):
    """Normalise a recipe's ingredient text before tokenisation.

    The text is lower-cased, accented letters are folded to their unaccented
    base letter, a fixed set of English contractions is expanded, and every
    remaining character that is not an ASCII letter, digit or space is removed.

    Args:
        recipe: The ingredient text of one recipe as a single string.

    Returns:
        The cleaned, lower-case string.
    """
    recipe = recipe.lower()

    # Canonically decompose accented letters ("è" -> "e" + combining accent).
    # The ASCII-only filter at the end then drops just the accent mark instead
    # of the whole letter, so "crème" becomes "creme" rather than "crme".
    recipe = unicodedata.normalize("NFD", recipe)

    # Expand contractions. Order matters: "let's" must be handled before the
    # generic "'s" rule, and "can't" before the generic "n't" rule.
    recipe = re.sub("(let's)", "let us", recipe)
    recipe = re.sub("('m)", " am", recipe)
    recipe = re.sub("('re)", " are", recipe)
    recipe = re.sub("('ve)", " have", recipe)
    recipe = re.sub("('s)", " is", recipe)
    recipe = re.sub("('ll)", " will", recipe)
    recipe = re.sub("('d)", " would", recipe)
    recipe = re.sub("(can't)", "cannot", recipe)
    recipe = re.sub("(n't)", " not", recipe)

    # Remove non-alpha/numeric characters from ingredients. The text is
    # already lower-case, so no second .lower() is needed.
    return re.sub("[^0-9a-zA-Z ]+", "", recipe)

In [19]:
# Join each recipe's ingredient list into one sentence, clean that sentence,
# then split it into word tokens.
train["ingredients"] = (
    train["ingredients"]
    .apply(" ".join)
    .apply(clean_ingredients)
    .apply(word_tokenize)
)

In [20]:
# Lemmatize every token of every recipe and store the result back on the frame.
lemma = WordNetLemmatizer()
ingredients = [
    [lemma.lemmatize(token) for token in tokens]
    for tokens in train["ingredients"]
]
train["ingredients"] = ingredients

In [23]:
# Flatten the token lists of all recipes into one long list of words.
bag_of_words = [word for tokens in train["ingredients"] for word in tokens]

In [24]:
# Sanity check: total number of tokens, then the first 20 of them.
print(len(bag_of_words), bag_of_words[:20], sep="\n")

807485
['romaine', 'lettuce', 'black', 'olive', 'grape', 'tomato', 'garlic', 'pepper', 'purple', 'onion', 'seasoning', 'garbanzo', 'bean', 'feta', 'cheese', 'crumbles', 'plain', 'flour', 'ground', 'pepper']


In [27]:
# Count how often each token occurs. Note that this rebinds bag_of_words from
# the flat token list to an nltk.FreqDist of those tokens.
bag_of_words = nltk.FreqDist(bag_of_words)

In [28]:
# Show the 15 most frequent tokens together with their counts.
print(bag_of_words.most_common(15))

[('pepper', 27187), ('salt', 24426), ('oil', 23323), ('onion', 19242), ('garlic', 18941), ('ground', 18256), ('fresh', 17853), ('sauce', 13188), ('sugar', 12502), ('olive', 12281), ('cheese', 11776), ('chicken', 11536), ('tomato', 11205), ('black', 10557), ('water', 9790)]


In [29]:
# Number of most-frequent words kept as classifier features.
N_WORD_FEATURES = 5000

# most_common(n) returns the n highest-count (word, count) pairs in descending
# order, so there is no need to sort the whole vocabulary and slice it.
word_features = [word for word, _count in bag_of_words.most_common(N_WORD_FEATURES)]

In [30]:
def find_features(recipe, vocabulary=None):
    """Build the boolean bag-of-words feature dict for one recipe.

    Args:
        recipe: Iterable of tokens belonging to a single recipe.
        vocabulary: Iterable of words to test for. When omitted, the
            module-level ``word_features`` list is used, which is the
            original behaviour.

    Returns:
        A dict with one key per vocabulary word, in vocabulary order, whose
        value is True when that word occurs in ``recipe`` and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features

    # Set membership keeps each lookup O(1) regardless of recipe length.
    present = set(recipe)
    return {word: word in present for word in vocabulary}

In [31]:
# Pair each recipe's feature dict with its cuisine label. zip walks the two
# columns positionally, so this does not rely on the frame having a default
# 0..n-1 integer index the way label lookups such as train.ingredients[i] do.
feature_set = [
    (find_features(tokens), cuisine)
    for tokens, cuisine in zip(train["ingredients"], train["cuisine"])
]

In [32]:
# Ordered (unshuffled) split: first 80% for training, remaining 20% for testing.
split_at = int(len(feature_set) * 0.8)
training_set, testing_set = feature_set[:split_at], feature_set[split_at:]

In [33]:
# Fit NLTK's Naive Bayes classifier on the (feature dict, cuisine) pairs in training_set.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [34]:
# Score the classifier on the held-out examples and report it as a percentage.
accuracy = nltk.classify.accuracy(classifier, testing_set)
print("Classifier accuracy percent:", accuracy * 100)

Classifier accuracy percent: 71.95474544311755


In [35]:
# Print the 15 features the trained classifier reports as most informative.
classifier.show_most_informative_features(15)

Most Informative Features
                  masala = True           indian : italia =   1321.4 : 1.0
                   garam = True           indian : italia =   1203.9 : 1.0
                   mirin = True           japane : mexica =   1000.9 : 1.0
                    sake = True           japane : italia =    873.9 : 1.0
                    thai = True             thai : italia =    707.3 : 1.0
                    miso = True           japane : italia =    640.4 : 1.0
             beansprouts = True           vietna : italia =    613.0 : 1.0
                turmeric = True           indian : italia =    604.5 : 1.0
              lemongrass = True             thai : italia =    602.3 : 1.0
                  peanut = True             thai : italia =    589.0 : 1.0
                  sesame = True           korean : cajun_ =    575.9 : 1.0
               andouille = True           cajun_ : mexica =    541.5 : 1.0
                    feta = True            greek : chines =    531.7 : 1.0