In [57]:
import pandas as pd
import numpy as np
import random
import re

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [109]:
# Load the training and test sets from JSON files in the working directory.
train = pd.read_json('train.json')
test = pd.read_json('test.json')
# NOTE(review): `submission` is not referenced again in the cells visible
# here; the final CSV is written from `test` instead - confirm it is needed.
submission = pd.read_csv('sample_submission.csv')

In [117]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [76]:
def clean_ingredients(recipe):
    """Flatten one recipe's ingredients into a single normalised string.

    Steps, in order:

    1. Join the ingredient names with single spaces.
    2. Decompose accented characters (Unicode NFD) and lower-case the text.
    3. Expand English contractions (``'s`` -> `` is``, ``n't`` -> `` not`` ...).
    4. Delete every character that is not ``0-9``, ``a-z``/``A-Z`` or a space.
       Characters are removed, not replaced, so ``"all-purpose"`` becomes
       ``"allpurpose"``.

    Because of step 2, an accented letter keeps its base letter when the
    combining mark is stripped in step 4 (``"crème"`` -> ``"creme"``) instead
    of being dropped entirely (``"crme"``). Pure-ASCII input is unaffected.

    Parameters
    ----------
    recipe : iterable of str
        Ingredient names belonging to one recipe.

    Returns
    -------
    str
        Lower-case text containing only ASCII letters, digits and spaces.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import unicodedata

    # Make list of ingredients into 1 sentence.
    text = ' '.join(recipe)

    # NFD splits e.g. "é" into "e" + a combining accent; the final filter
    # then removes only the accent. Lower-casing afterwards guarantees the
    # result contains no upper-case ASCII, so no second lower() is needed.
    text = unicodedata.normalize('NFD', text).lower()

    # Clean contractions. Order matters: "let's" must be handled before the
    # generic "'s" rule, and "can't" before the generic "n't" rule.
    contractions = (
        ("(let's)", "let us"),
        ("('m)", " am"),
        ("('re)", " are"),
        ("('ve)", " have"),
        ("('s)", " is"),
        ("('ll)", " will"),
        ("('d)", " would"),
        ("(can't)", "cannot"),
        ("(n't)", " not"),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Remove non-alpha/numeric characters from ingredients.
    return re.sub("[^0-9a-zA-Z ]+", "", text)

In [77]:
# Module-level lemmatizer instance; stem_tokens() calls its lemmatize() method.
lemma = WordNetLemmatizer()

def stem_tokens(tokens):
    """Return a new list holding ``lemma.lemmatize(token)`` for each token.

    Despite the name, the tokens are lemmatized (through the module-level
    ``lemma`` object), not stemmed. Input order is preserved.
    """
    return [lemma.lemmatize(token) for token in tokens]

def tokenize(text):
    """Split ``text`` with ``nltk.word_tokenize`` and run the tokens through ``stem_tokens``.

    Passed to ``TfidfVectorizer`` as its ``tokenizer`` callable.
    """
    return stem_tokens(nltk.word_tokenize(text))

In [78]:
# Model input is the `ingredients` column; the target is the `cuisine` column.
train_features = train.ingredients
train_labels = train.cuisine

In [79]:
# Encode the cuisine labels as integer class ids.
# The encoder is fitted on `train.cuisine` directly rather than on
# `train_labels`, because this cell rebinds `train_labels` to the encoded
# integers: re-running it would otherwise fit the encoder on integers and
# `le.inverse_transform` would stop returning the original cuisine values.
le = LabelEncoder()
train_labels = le.fit_transform(train.cuisine)

In [80]:
# Peek at the first ten encoded labels.
train_labels[0:10]

array([ 6, 16,  4,  7,  7, 10, 17,  9, 13,  9], dtype=int64)

In [81]:
# Collapse each recipe's ingredient list into one cleaned string.
# Both lines read from the untouched source frames so the cell is idempotent:
# re-applying clean_ingredients to an already-cleaned string would make its
# ' '.join() insert a space between every character.
train_features = train.ingredients.apply(clean_ingredients)
test_features = test.ingredients.apply(clean_ingredients)

In [83]:
# TF-IDF features over the cleaned ingredient strings, tokenized by the
# custom `tokenize` function and filtered with the built-in 'english'
# stop-word list.
vectorize = TfidfVectorizer(tokenizer = tokenize , stop_words = 'english')
# Fit on the training text only, then transform it.
vec = vectorize.fit_transform(train_features)
# Transform the test text with the already-fitted vectorizer (no refit).
test_vec = vectorize.transform(test_features)

In [84]:
# Replace both feature matrices with the result of their .toarray() call.
# NOTE(review): presumably done because GaussianNB is fitted on these features
# later; the dense copies may be large - confirm memory use is acceptable.
# NOTE(review): `vec` / `test_vec` are rebound here, so re-running this cell
# on its own would call .toarray() on the output of the previous run.
vec = vec.toarray()
test_vec = test_vec.toarray()

In [85]:
x_train, y_train, x_test, y_test = train_test_split(vec, train_labels, test_size = 0.25, random_state = 42)

In [86]:
clf = GaussianNB()
clf.fit(x_train, x_test)
predict = clf.predict(y_train)
accuracy_score(predict, y_test)

0.25110619469026546

In [112]:
lrclf = LogisticRegression(C = 10, random_state = 123)
lrclf.fit(x_train,x_test)
lrpredict = lrclf.predict(y_train)
accuracy_score(lrpredict, y_test)

0.78660498793242151

In [113]:
# Predict class ids for the test features with the logistic-regression model
# and map them back through the label encoder in one step.
test_predict = le.inverse_transform(lrclf.predict(test_vec))

In [123]:
# Store the decoded predictions as a new `cuisine` column (mutates `test`).
test['cuisine'] = test_predict

In [125]:
# Write only the `id` and `cuisine` columns, without the index, to submission.csv.
test.to_csv("submission.csv", columns = ['id', 'cuisine'], index = False)