In [1]:
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lekhasmacbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lekhasmacbook/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_json("train.json")
testset = pd.read_json("test.json")

In [3]:
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
testset.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [5]:
df.isnull().sum()

cuisine        0
id             0
ingredients    0
dtype: int64

In [6]:
testset.isnull().sum()

id             0
ingredients    0
dtype: int64

## Check different types of cuisines

In [7]:
df.cuisine.unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

## Text Data processing

### Convert the ingredients to string.

In [8]:
df.ingredients = df.ingredients.astype('str')
testset.ingredients = testset.ingredients.astype('str')

In [9]:
df.ingredients[0]

"['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']"

In [10]:
testset.ingredients[0]

"['baking powder', 'eggs', 'all-purpose flour', 'raisins', 'milk', 'white sugar']"

### Lets remove those unnecessary symbols, which might be problem when tokenizing and lemmatizing

In [11]:
df.ingredients = df.ingredients.str.replace("["," ")
df.ingredients = df.ingredients.str.replace("]"," ")
df.ingredients = df.ingredients.str.replace("'"," ")
df.ingredients = df.ingredients.str.replace(","," ")

In [12]:
testset.ingredients = testset.ingredients.str.replace("["," ")
testset.ingredients = testset.ingredients.str.replace("]"," ")
testset.ingredients = testset.ingredients.str.replace("'"," ")
testset.ingredients = testset.ingredients.str.replace(","," ")

In [13]:
df.ingredients[0]

'  romaine lettuce    black olives    grape tomatoes    garlic    pepper    purple onion    seasoning    garbanzo beans    feta cheese crumbles  '

In [14]:
testset.ingredients[0]

'  baking powder    eggs    all-purpose flour    raisins    milk    white sugar  '

### Convert everything to lower ( I think they are already in lower case, but to be on safe side).

In [15]:
df.ingredients = df.ingredients.str.lower()
testset.ingredients = testset.ingredients.str.lower()

Lets TOKENIZE the data now. (the processing of splitting into individual words)

In [16]:
word_tokenize('I am the best')

['I', 'am', 'the', 'best']

In [17]:
df.ingredients = df.ingredients.apply(lambda x: word_tokenize(x))
testset.ingredients = testset.ingredients.apply(lambda x: word_tokenize(x))

In [18]:
df.ingredients[0:5]

0    [romaine, lettuce, black, olives, grape, tomat...
1    [plain, flour, ground, pepper, salt, tomatoes,...
2    [eggs, pepper, salt, mayonaise, cooking, oil, ...
3                 [water, vegetable, oil, wheat, salt]
4    [black, pepper, shallots, cornflour, cayenne, ...
Name: ingredients, dtype: object

In [19]:
testset.ingredients[0:5]

0    [baking, powder, eggs, all-purpose, flour, rai...
1    [sugar, egg, yolks, corn, starch, cream, of, t...
2    [sausage, links, fennel, bulb, fronds, olive, ...
3    [meat, cuts, file, powder, smoked, sausage, ok...
4    [ground, black, pepper, salt, sausage, casings...
Name: ingredients, dtype: object

Lets LEMMATIZE the data now (Since i believe that dataset might have different representation of same words, like the olives and olive, tomatoes and tomato, which represent the same word)

In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
def lemmat(wor):
    l = []
    for i in wor:
        l.append(lemmatizer.lemmatize(i))
    return l

In [22]:
df.ingredients = df.ingredients.apply(lemmat)
testset.ingredients = testset.ingredients.apply(lemmat)

In [23]:
df.ingredients[0]

['romaine',
 'lettuce',
 'black',
 'olive',
 'grape',
 'tomato',
 'garlic',
 'pepper',
 'purple',
 'onion',
 'seasoning',
 'garbanzo',
 'bean',
 'feta',
 'cheese',
 'crumbles']

In [24]:
testset.ingredients[0]

['baking',
 'powder',
 'egg',
 'all-purpose',
 'flour',
 'raisin',
 'milk',
 'white',
 'sugar']

Observe that olives converted to olive, tomatoes to tomato etc, many words are now in their root form.

In [25]:
type(df.ingredients[0])

list

Lemmatization converted it back to list, so change to str again and remove the unncessary words.

In [26]:
df.ingredients = df.ingredients.astype('str')
df.ingredients = df.ingredients.str.replace("["," ")
df.ingredients = df.ingredients.str.replace("]"," ")
df.ingredients = df.ingredients.str.replace("'"," ")
df.ingredients = df.ingredients.str.replace(","," ")

In [27]:
testset.ingredients = testset.ingredients.astype('str')
testset.ingredients = testset.ingredients.str.replace("["," ")
testset.ingredients = testset.ingredients.str.replace("]"," ")
testset.ingredients = testset.ingredients.str.replace("'"," ")
testset.ingredients = testset.ingredients.str.replace(","," ")

In [28]:
type(df.ingredients[0])

str

In [29]:
df.ingredients[0]

'  romaine    lettuce    black    olive    grape    tomato    garlic    pepper    purple    onion    seasoning    garbanzo    bean    feta    cheese    crumbles  '

Now our data looks good for vectorization.

In [30]:
#vect = HashingVectorizer ()
vect = TfidfVectorizer()

In [31]:
features = vect.fit_transform(df.ingredients)

In [32]:
features

<39774x2788 sparse matrix of type '<class 'numpy.float64'>'
	with 756011 stored elements in Compressed Sparse Row format>

In [33]:
type(features)

scipy.sparse.csr.csr_matrix

So, **now our features has 2826 features, which are created by the process of vectorization.**

Lets visualize some random features.

In [34]:
testfeatures = vect.transform(testset.ingredients)

In [35]:
testfeatures

<9944x2788 sparse matrix of type '<class 'numpy.float64'>'
	with 189396 stored elements in Compressed Sparse Row format>

Lets create our labels now, which is obviously cuisine column. Lets labelencode it so that they convert to numerical lables, which usually might give better prediction results. Not a necessary step tho

In [36]:
encoder = LabelEncoder()
labels = encoder.fit_transform(df.cuisine)

Lets split the dataset into training and testing parts

In [37]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

Check the shapes, to make sure.

In [38]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(31819, 2788) (7955, 2788) (31819,) (7955,)


## Data Modeling

In [39]:
logreg = LogisticRegression(C=10,solver='lbfgs', multi_class='multinomial',max_iter=400)
#C : float, default: 1.0; Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.


logreg.fit(X_train,y_train)



LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=400, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [40]:
print("Logistic Regression accuracy",logreg.score(X_test, y_test))

Logistic Regression accuracy 0.7912005028284098


In [41]:
logreg.predict(X_test)

array([ 3,  5, 12, ...,  9, 12,  9])

In [42]:
from sklearn import linear_model
sgd = linear_model.SGDClassifier()
sgd.fit(X_train, y_train)




SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). 

In [43]:
print("SGD classifier accuracy",sgd.score(X_test, y_test))

SGD classifier accuracy 0.7836580766813325


In [44]:
from sklearn.svm import LinearSVC
linearsvm = LinearSVC(C=1.0,random_state=0,multi_class='crammer_singer',dual = False, max_iter = 1500)
linearsvm.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='crammer_singer', penalty='l2', random_state=0,
     tol=0.0001, verbose=0)

In [45]:
print("Linear SVM accuracy", linearsvm.score(X_test, y_test))

Linear SVM accuracy 0.7972344437460717


Now, lets try our luck with neural networks.

## NEURAL NETWORK'S

I have tried both Keras and tensorflow (Of course the backend is same), but Keras code looks simpler and clear.

For Neural Networks we need to have the dense array's as inputs and preferably one hot encoding for lables. So, lets create lables.

In [46]:
labelsNN = df.cuisine

Convert it to one hot formatting, there are many ways to do, i prefer to do this way.

In [47]:
labelsNN = pd.get_dummies(labelsNN)

Convert it to arrays, you can do by values method or np.array() both are same

In [48]:
labelsNN = labelsNN.values

Here's how the one hot encoding looks like.

In [49]:
labelsNN[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=uint8)

Our labels are ready, now we need the features, **we have already created the features above but it was sparse matrix, which neural network doesnt like, so convert to dense arrays.**

In [50]:
from scipy.sparse import csr_matrix
sparse_dataset = csr_matrix(features)
featuresNN = sparse_dataset.todense()


Here's how the features look like.

In [51]:
featuresNN[0]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

Split the dataset.

In [52]:
X_trainNN, X_testNN, y_trainNN, y_testNN = train_test_split(featuresNN, labelsNN, test_size=0.2)
print(X_trainNN.shape, X_testNN.shape, y_trainNN.shape, y_testNN.shape)

(31819, 2788) (7955, 2788) (31819, 20) (7955, 20)


In [53]:
numfeat = X_trainNN.shape[1]
print(numfeat)
#The shape attribute for numpy arrays returns the dimensions of the array. 
#If Y has n rows and m columns, then Y.shape is (n,m). So Y.shape[1] is m.

2788


## KERAS

In [54]:
import keras
from keras.layers import *

Using TensorFlow backend.


A sequential NN with 300,500 and 400 nodes in first,second and third layers resp.

The loss is categorical cross entropy and the optimizer is adam with default learning rate. We can tweak a lot of parameters like the no of nodes, epochs, batchsize etc to improve accuracy.

In [55]:
model = keras.models.Sequential()
model.add(Dense(300,input_dim = numfeat,activation = 'relu'))
model.add(Dense(500,activation = 'relu'))
model.add(Dense(400,activation = 'relu'))
model.add(Dense(20,activation='softmax'))
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['categorical_accuracy'])
model.fit(X_trainNN,y_trainNN,epochs=50,shuffle=True, verbose =2,batch_size=500)

Epoch 1/50
 - 6s - loss: 1.8799 - categorical_accuracy: 0.4583
Epoch 2/50
 - 5s - loss: 0.9029 - categorical_accuracy: 0.7236
Epoch 3/50
 - 5s - loss: 0.6730 - categorical_accuracy: 0.7972
Epoch 4/50
 - 5s - loss: 0.5575 - categorical_accuracy: 0.8343
Epoch 5/50
 - 5s - loss: 0.4757 - categorical_accuracy: 0.8562
Epoch 6/50
 - 5s - loss: 0.4141 - categorical_accuracy: 0.8738
Epoch 7/50
 - 5s - loss: 0.3534 - categorical_accuracy: 0.8942
Epoch 8/50
 - 5s - loss: 0.2935 - categorical_accuracy: 0.9115
Epoch 9/50
 - 5s - loss: 0.2380 - categorical_accuracy: 0.9301
Epoch 10/50
 - 5s - loss: 0.1875 - categorical_accuracy: 0.9475
Epoch 11/50
 - 5s - loss: 0.1410 - categorical_accuracy: 0.9615
Epoch 12/50
 - 5s - loss: 0.1034 - categorical_accuracy: 0.9732
Epoch 13/50
 - 5s - loss: 0.0759 - categorical_accuracy: 0.9818
Epoch 14/50
 - 5s - loss: 0.0563 - categorical_accuracy: 0.9873
Epoch 15/50
 - 5s - loss: 0.0405 - categorical_accuracy: 0.9919
Epoch 16/50
 - 5s - loss: 0.0303 - categorical_ac

<keras.callbacks.History at 0x1a257d33c8>

The model needs to know what input shape it should expect. For this reason, the first layer in a Sequential model (and only the first, because following layers can do automatic shape inference) needs to receive information about its input shape. There are several possible ways to do this:

Pass an input_shape argument to the first layer. This is a shape tuple (a tuple of integers or None entries, where None indicates that any positive integer may be expected). In input_shape, the batch dimension is not included.

Some 2D layers, such as Dense, support the specification of their input shape via the argument input_dim, and some 3D temporal layers support the arguments input_dim and input_length.

If you ever need to specify a fixed batch size for your inputs (this is useful for stateful recurrent networks), you can pass a batch_size argument to a layer. If you pass both batch_size=32 and input_shape=(6, 8) to a layer, it will then expect every batch of inputs to have the batch shape (32, 6, 8).

In [56]:
print("Accuracy with KERAS" ,model.evaluate(X_testNN,y_testNN)[1])

Accuracy with KERAS 0.7683218101597972


I have trained with KERAS on my pc for few times and achieved max accuracy of 0.81.

Now, we have achieved almost similar accuracies in all the above models, I dont prefer NN's on this data as it is computationally very expensive.

## PREDICTION


I prefer just using the logisticRegression or linearsvm for predictions, but linearSVC also has almost same results. I'm not predict using Keras or Tensorflow, since it needs an extra two steps to convert the labels, which I dont want to waste my time on.

In [57]:
linearsvmfinal = LinearSVC(C=1.0,random_state=0,multi_class='crammer_singer',dual = False, max_iter = 1500)
linearsvmfinal.fit(features,labels)



LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='crammer_singer', penalty='l2', random_state=0,
     tol=0.0001, verbose=0)

In [58]:
import lightgbm as lgb


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [59]:
gbm = lgb.LGBMClassifier(objective="mutliclass",n_estimators=10000,num_leaves=512)
gbm.fit(X_train,y_train,verbose = 300)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=10000, n_jobs=-1, num_leaves=512,
        objective='mutliclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [60]:
pred = gbm.predict(testfeatures)

In [61]:
pred = linearsvmfinal.predict(testfeatures)

In [62]:
predconv = encoder.inverse_transform(pred)

In [63]:
sub = pd.DataFrame({'id':testset.id,'cuisine':predconv})

In [64]:
output = sub[['id','cuisine']]

In [65]:
output.to_csv("outputfile.csv",index = False)