# What's Cooking?

This playground competition asks you to predict the category of a dish's cuisine given a list of its ingredients. 

Training is done with JSON data about the recipe id, the type of cuisine, and the list of ingredients of each recipe (of variable length).

For testing, he format of a recipe is the same as `train.json`, only the cuisine type is removed, as it is the target variable you are going to predict.

In [16]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem.porter import PorterStemmer
from matplotlib import pyplot as plt
%matplotlib inline

In [35]:
train = pd.read_json('train.json')
test = pd.read_json('test.json')


In [36]:

train['ingredients'] = [", ".join(ingredient) for ingredient in train.ingredients]
test['ingredients'] = [", ".join(ingredient) for ingredient in test.ingredients]

from sklearn.preprocessing import LabelEncoder 
# encode target labels with values 0 to n_classes-1
le = LabelEncoder()
y = le.fit_transform(train.cuisine)  #encoding the target values

#train['ingredients'] = [', '.join(z).strip() for z in train['ingredients']]
train.ingredients[:10]

0    romaine lettuce, black olives, grape tomatoes,...
1    plain flour, ground pepper, salt, tomatoes, gr...
2    eggs, pepper, salt, mayonaise, cooking oil, gr...
3                    water, vegetable oil, wheat, salt
4    black pepper, shallots, cornflour, cayenne pep...
5    plain flour, sugar, butter, eggs, fresh ginger...
6    olive oil, salt, medium shrimp, pepper, garlic...
7    sugar, pistachio nuts, white almond bark, flou...
8    olive oil, purple onion, fresh pineapple, pork...
9    chopped tomatoes, fresh basil, garlic, extra-v...
Name: ingredients, dtype: object

In [48]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = [PorterStemmer().stem(word) for word in tokens]
    return stems

tfidf = TfidfVectorizer(tokenizer=tokenize)
X = tfidf.fit_transform(train.ingredients)  #vectorizing the train values


<39774x2777 sparse matrix of type '<class 'numpy.float64'>'
	with 785041 stored elements in Compressed Sparse Row format>

In [47]:
# Make an sklearn pipeline
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(), LinearSVC(C=0.5))
model.fit(train.ingredients, y)
model.score(X, y)

AttributeError: lower not found

In [42]:
#Let's try reducing the dimensionality
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=300)
X_decom = svd.fit_transform(X)
X.shape, X_decom.shape

((39774, 2777), (39774, 300))

In [44]:
np.cumsum(svd.explained_variance_ratio_)[-1]

0.7350108804033604

In [46]:
from sklearn.model_selection import cross_val_score
model = LinearSVC(C = 0.5)
cross_val_score(model, X_decom, y)  #returns scores of model for each run of CV



array([0.75661615, 0.75171582, 0.75218835])