# What's Cooking?

This playground competition asks you to predict the category of a dish's cuisine given a list of its ingredients. 

Training is done with JSON data about the recipe id, the type of cuisine, and the list of ingredients of each recipe (of variable length).

For testing, he format of a recipe is the same as `train.json`, only the cuisine type is removed, as it is the target variable you are going to predict.

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem.porter import PorterStemmer
from matplotlib import pyplot as plt
%matplotlib inline

In [52]:
train = pd.read_json('train.json', orient='records')
test = pd.read_json('test.json', orient='records')

train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [53]:

train['ingredients'] = [", ".join(ingredient) for ingredient in train.ingredients]
test['ingredients'] = [", ".join(ingredient) for ingredient in test.ingredients]

from sklearn.preprocessing import LabelEncoder 
# encode target labels with values 0 to n_classes-1
le = LabelEncoder()
y = le.fit_transform(train.cuisine)  #encoding the target values

#train['ingredients'] = [', '.join(z).strip() for z in train['ingredients']]
train.ingredients[:10]

0    romaine lettuce, black olives, grape tomatoes,...
1    plain flour, ground pepper, salt, tomatoes, gr...
2    eggs, pepper, salt, mayonaise, cooking oil, gr...
3                    water, vegetable oil, wheat, salt
4    black pepper, shallots, cornflour, cayenne pep...
5    plain flour, sugar, butter, eggs, fresh ginger...
6    olive oil, salt, medium shrimp, pepper, garlic...
7    sugar, pistachio nuts, white almond bark, flou...
8    olive oil, purple onion, fresh pineapple, pork...
9    chopped tomatoes, fresh basil, garlic, extra-v...
Name: ingredients, dtype: object

In [54]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = [PorterStemmer().stem(word) for word in tokens]
    return stems

tfidf = TfidfVectorizer(tokenizer=tokenize)
X = tfidf.fit_transform(train.ingredients)  #vectorizing the train values


In [55]:
# Make an sklearn pipeline
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(), LinearSVC(C=0.5))
model.fit(train.ingredients, y)
#model.score(X, y)

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('linearsvc',
                 LinearSVC(C=0.5, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,

In [56]:
#Let's try reducing the dimensionality
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=300)
X_decom = svd.fit_transform(X)
X.shape, X_decom.shape

((39774, 2777), (39774, 300))

In [57]:
np.cumsum(svd.explained_variance_ratio_)[-1]

0.7348599056247502

In [58]:
from sklearn.model_selection import cross_val_score
model = LinearSVC(C = 0.5)
cross_val_score(model, X_decom, y)  #returns scores of model for each run of CV



array([0.75352484, 0.75073535, 0.75105644])

In [59]:
#SVM
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy score

vectorizer_train = TfidfVectorizer(stop_words='english')
vectorizer_train.fit(train['ingredients'])
tfidf_train = vectorizer_train.transform(train['ingredients'])

#vectorizer_test = TfidfVectorizer(stop_words='english')
tfidf_test = vectorizer_train.transform(test['ingredients'])

X = tfidf_train
targets = train['cuisine']

svm = LinearSVC(C=0.80, penalty='l2')
svm.fit(X, targets)

predictions = svm.predict(tfidf_test)



In [60]:
# Creating submission file for Kaggle
test['cuisine'] = predictions
test = test.sort_values(['id'], ascending=True)
test.reset_index(drop=True, inplace=True)
test[['id', 'cuisine']].to_csv('submission.csv')

In [63]:
submitdf = pd.read_csv('submission.csv', index_col=False)

In [72]:
#submitdf.drop(['x'], inplace=True, axis=1)
submitdf.to_csv('submission.csv', index=False)

Barebones model gives 78% accuracy on test set yet.

In [75]:
# Better model
from sklearn.metrics import accuracy_score

svm = LinearSVC(penalty='l2', multi_class='ovr', C=0.5)
svm.fit(X, targets)

predictions_train = svm.predict(tfidf_train)
predictions_test = svm.predict(tfidf_test)
accuracy_score(predictions_train, targets)  #train accuracy

0.8449741036858249

In [83]:

test['cuisine'] = predictions_test
test = test.sort_values(['id'], ascending=True)
test.reset_index(drop=True, inplace=True)
test[['id', 'cuisine']].to_csv('submission_better.csv')

test.head()


Unnamed: 0,id,ingredients,cuisine
0,5,"mushrooms, chopped onion, tomato sauce, cheese...",british
1,7,"minced garlic, brown rice, sour cream, chicken...",southern_us
2,11,"lime juice, sesame oil, garlic cloves, fish sa...",italian
3,12,"sugar, vanilla extract, corn starch, coffee gr...",cajun_creole
4,13,"frozen pie crust, bourbon whiskey, powdered su...",italian


In [87]:
submitdf = pd.read_csv('submission_better.csv', index_col=0)
submitdf.head()

Unnamed: 0,id,cuisine
0,5,british
1,7,southern_us
2,11,italian
3,12,cajun_creole
4,13,italian


In [88]:
submitdf.to_csv('submission_better.csv', index=False)