In [1]:
import pandas as pd
import numpy as np
import random
import re

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [3]:
test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [4]:
def clean_ingredients(recipe):
    # Make list of ingredients into 1 sentence
    recipe = ' '.join(recipe)
    
    recipe = recipe.lower()
    '''
    Clean contractions
    '''
    recipe = re.sub("(let's)", "let us", recipe)
    recipe = re.sub("('m)", " am", recipe)
    recipe = re.sub("('re)", " are", recipe)
    recipe = re.sub("('ve)", " have", recipe)
    recipe = re.sub("('s)", " is", recipe)
    recipe = re.sub("('ll)", " will", recipe)
    recipe = re.sub("('d)", " would", recipe)
    recipe = re.sub("(can't)", "cannot", recipe)
    recipe = re.sub("(n't)", " not", recipe)
    
    # Remove non-alpha/numeric characters from ingredients
    recipe = re.sub("[^0-9a-zA-Z ]+", "", recipe)
    recipe = recipe.lower()
    
#     lemma = WordNetLemmatizer()
#     recipe = lemma.lemmatize(recipe)
    
    return recipe

In [5]:
lemma = WordNetLemmatizer()

def stem_tokens(tokens):
    stemmed = []
    for item in tokens:
        stemmed.append(lemma.lemmatize(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens)
    return stems

In [6]:
train_features = train.ingredients
train_labels = train.cuisine

In [7]:
le = LabelEncoder()
train_labels = le.fit_transform(train_labels)

In [8]:
train_labels[0:10]

array([ 6, 16,  4,  7,  7, 10, 17,  9, 13,  9], dtype=int64)

In [9]:
train_features = train_features.apply(clean_ingredients)
test_features = test.ingredients.apply(clean_ingredients)

In [10]:
count_vectorize = CountVectorizer(tokenizer = tokenize, stop_words = 'english')
count_vec = count_vectorize.fit_transform(train_features)
#test_count_vec = count_vectorize.transform(test_features)

In [11]:
count_vec = count_vec.toarray()
#test_count_vec = test_count_vec.toarray()

In [12]:
vectorize = TfidfVectorizer(tokenizer = tokenize , stop_words = 'english')
vec = vectorize.fit_transform(train_features)
test_vec = vectorize.transform(test_features)

In [13]:
vec.shape

(39774, 2816)

In [14]:
vec = vec.toarray()
test_vec = test_vec.toarray()

In [15]:
vec[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [16]:
tfidfx_train, tfidfy_train, tfidfx_test, tfidfy_test = train_test_split(vec, train_labels, test_size = 0.25, random_state = 42)
cvx_train, cvy_train, cvx_test, cvy_test = train_test_split(count_vec, train_labels, test_size = 0.25, random_state = 42)

In [17]:
#Run logistic regression for count vectorized model
cvclf = LogisticRegression(C = 10, random_state = 123)
cvclf.fit(cvx_train,cvx_test)
cvlrpredict = cvclf.predict(cvy_train)

In [18]:
print("Accuracy:", accuracy_score(cvlrpredict, cvy_test))
print("Precision:", precision_score(cvy_test,cvlrpredict, average = "macro"))
print("Recall:", recall_score(cvy_test, cvlrpredict, average = "macro"))

Accuracy: 0.77554304103
Precision: 0.717415954801
Recall: 0.676315569732


In [20]:
lr = LogisticRegression(random_state = 123)
#skb = SelectKBest(f_classif)
pipeline = Pipeline(steps=[#("SKB",skb),
                           ("LR", lr)])
params = {#'SKB__k':range(2,2817,500),
          'LR__C': [5],
          'LR__tol': [1e-3]
         }
split = StratifiedShuffleSplit(test_size=0.1, random_state=42)
gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')

gs.fit(vec,train_labels)
lrclf=gs.best_estimator_
print(gs.best_params_)
print(gs.best_score_)
print(lrclf)

{'LR__C': 5, 'LR__tol': 0.001}
0.793966817496
Pipeline(memory=None,
     steps=[('LR', LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False))])


In [21]:
test_predict = lrclf.predict(test_vec)
test_predict = le.inverse_transform(test_predict)

In [23]:
example_food = vectorize.transform(['garlic onion chicken wings soy sauce vinegar salt pepper bay leaf cooking oil'])
test_predict = lrclf.predict(example_food)
le.inverse_transform(test_predict)

array(['filipino'], dtype=object)

In [26]:
example_food = vectorize.transform(['blueberry kiwi banana pineapple mango strawberry granola honey banana acai vanilla almond milk'])
test_predict = lrclf.predict(example_food)
le.inverse_transform(test_predict)

array(['brazilian'], dtype=object)

In [151]:
test['cuisine'] = test_predict

In [152]:
test.to_csv("submission.csv", columns = ['id', 'cuisine'], index = False)

In [26]:
tfidfx_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])