In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

In [42]:
training = pd.read_json('./train.json')
test = pd.read_json('./test.json')

In [43]:
X_train = training['ingredients']
y_train = training['cuisine']
X_test = test['ingredients']
for x in X_train:
    if x == "00":
        print('X, weird value')
print('X_train[0]: ', X_train[0])
print('Y_train[0]: ', y_train[0])
print('X_test[0]: ', X_test[0])

X_train[0]:  ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']
Y_train[0]:  greek
X_test[0]:  ['baking powder', 'eggs', 'all-purpose flour', 'raisins', 'milk', 'white sugar']


In [44]:
ingredients_corpus = [','.join(x) for x in X_train]
x_test_ingredients_corpus = [','.join(x) for x in X_test]
print('Ingredients Corpus: ', ingredients_corpus[0])
type(ingredients_corpus)

vectorizer = TfidfVectorizer(stop_words='english', analyzer='word', ngram_range = ( 1 , 1 ))
X_train = vectorizer.fit_transform(ingredients_corpus)
X_test = vectorizer.transform(x_test_ingredients_corpus)
# print(vectorizer.get_feature_names())
print(X_train.shape)

Ingredients Corpus:  romaine lettuce,black olives,grape tomatoes,garlic,pepper,purple onion,seasoning,garbanzo beans,feta cheese crumbles
(39774, 2970)


In [46]:
clf = LogisticRegression()
lr = clf.fit(X_train, y_train)
predictions = lr.predict(X_test)
print('Predictions: ', predictions)
test['cuisine'] = predictions
test.head(20)

Predictions:  ['british' 'southern_us' 'italian' ... 'italian' 'southern_us' 'mexican']


Unnamed: 0,id,ingredients,cuisine
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",british
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",southern_us
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",italian
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",cajun_creole
4,35687,"[ground black pepper, salt, sausage casings, l...",italian
5,38527,"[baking powder, all-purpose flour, peach slice...",southern_us
6,19666,"[grape juice, orange, white zinfandel]",italian
7,41217,"[ground ginger, white pepper, green onions, or...",chinese
8,28753,"[diced onions, taco seasoning mix, all-purpose...",mexican
9,22659,"[eggs, cherries, dates, dark muscovado sugar, ...",british


In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(training['ingredients'], training['cuisine'], test_size=0.2, random_state=5622)
ingredients_corpus = [','.join(x) for x in X_train]
x_test_ingredients_corpus = [','.join(x) for x in X_test]

vectorizer = TfidfVectorizer(stop_words='english', analyzer='word', ngram_range = ( 1 , 1 ))
X_train = vectorizer.fit_transform(ingredients_corpus)
X_test = vectorizer.transform(x_test_ingredients_corpus)


clf = MultinomialNB().fit(X_train, y_train)
clf1 = LogisticRegression().fit(X_train, y_train)
clf2 = LinearSVC().fit(X_train, y_train)
predict = clf.predict(X_test)
predict1 = clf1.predict(X_test)
predict2 = clf2.predict(X_test)
print('Naive Bayes: ', accuracy_score(y_test,predict))
print('Logistic Regression: ', accuracy_score(y_test, predict1))
print('SVM: ', accuracy_score(y_test, predict2))

Naive Bayes:  0.6711502199874293
Logistic Regression:  0.774732872407291
SVM:  0.7905719673161533


In [53]:
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):
    
    def __init__(self, estimator=LinearSVC()):
        self.estimator = estimator
        
    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
    
    def accuracy(self, y_test, predicted):
        return accuracy_score(y_test, predicted)
        

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('clf', LinearSVC())
])

parameters = [
#     {
#         'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
# #         'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
# #         'clf__estimator__max_iter': [50, 80],
#         'clf__estimator__tol': [1e-4],
#         'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
#     },
#     {
#         'clf__estimator': [MultinomialNB()],
#         'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
#     },
      {
#          'clf__estimator': [LinearSVC()],
        'clf__dual': [True, False],
        'clf__tol': [1e-4],
        'clf__C': [0.001, 0.01, 0.1, 1, 10],
        'clf__multi_class': ['ovr', 'crammer_singer']

    }
]

gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
gscv.fit(X_train, y_train)
predictions = gscv.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, predictions))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.5min finished


Accuracy Score:  0.7905719673161533
