# Kaggle Cooking Challenge


This is the code for [kaggle cooking challenge](https://). 

Necesary script for load data from drive

In [2]:
# load dataset from google drive
from google.colab import drive
drive.mount('/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /drive


In [3]:
# check access to dataset
!ls /drive/My\ Drive/cooking/

svc_output.csv	test2.json  train2.json


# Import packages

In [0]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Load data

In [5]:
data_dir = '/drive/My Drive/cooking/'
train = pd.read_json(data_dir + 'train2.json') 
test = pd.read_json(data_dir + 'test2.json')

print(train.shape)
print(test.shape)

print(50 * '-')

print(list(train))
print(list(test))

(39774, 3)
(9944, 2)
--------------------------------------------------
['cuisine', 'id', 'ingredients']
['id', 'ingredients']


# Preprocess data

In [0]:
# remove outlier
train['num_ingredients'] = train['ingredients'].apply(len)
train = train[train['num_ingredients'] > 1]

In [7]:
# remove number
# remove word with len smaller than 2
# remove hyphen
# apply lemmatization
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
def preprocess(ingredients):
    ingredients_text = ' '.join(ingredients)
    ingredients_text = ingredients_text.lower()
    ingredients_text = ingredients_text.replace('-', ' ')
    words = []
    for word in ingredients_text.split():
        if re.findall('[0-9]', word): continue
        if len(word) <= 2: continue
        if '’' in word: continue
        word = lemmatizer.lemmatize(word)
        if len(word) > 0: words.append(word)
    return ' '.join(words)

for ingredient, expected in [
    ('Eggs', 'egg'),
    ('all-purpose flour', 'all purpose flour'),
    ('purée', 'purée'),
    ('1% low-fat milk', 'low fat milk'),
    ('half & half', 'half half'),
    ('safetida (powder)', 'safetida (powder)')
]:
    actual = preprocess([ingredient])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
train['x'] = train['ingredients'].apply(preprocess)
test['x'] = test['ingredients'].apply(preprocess)

In [9]:
print(list(train))
print(list(test))

['cuisine', 'id', 'ingredients', 'num_ingredients', 'x']
['id', 'ingredients', 'x']


# Create Features

In [0]:
def generate_text(data):
    text_data = [" ".join(doc) for doc in data.ingredients]
    return text_data

In [0]:
def get_tfidf_vectorize(train, test):
    print('start vectorized...')
    train_df, test_df = train, test
    vect = TfidfVectorizer()

    train_features = vect.fit_transform(generate_text(train_df))
    test_features = vect.transform(generate_text(test_df))

    train_label = [doc for doc in train_df.cuisine]
    print('finish vectorized...')

    return train_features, train_label, test_features

In [12]:
train_feature, train_label, test_feature = get_tfidf_vectorize(train, test)
print(train_feature.shape)
print('number of labels:', len(train_label))

start vectorized...
finish vectorized...
(39768, 2797)
number of labels: 39768


# Helper functions

In [0]:
def save_submission(test, file_name, y_pred):
    # Submission
    print("Generate Submission File for ", file_name)
    test_id = [doc for doc in test.id]
    sub = pd.DataFrame({'id': test_id, 'cuisine': y_pred}, columns=['id', 'cuisine'])
    sub.to_csv(data_dir + file_name + '_output.csv', index=False)

# Hyper-tunning

In [0]:
lb = LabelEncoder()
train_label = lb.fit_transform(train_label)

In [0]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.2, random_state=7)

tuned_parameters = {'n_estimators':[200,500,750,1000],'max_depth':[20,30,40],'max_features':['sqrt'],'warm_start':[True,False]}

scores = ['precision']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=3, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

# Ensemble models

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.4, random_state=7)

clf0 = OneVsRestClassifier(SVC(C=50, gamma=1.4, coef0=1))
clf1 = LinearSVC(loss='squared_hinge', C=0.02, max_iter=1000)
clf2 = MultinomialNB(alpha=1)
clf3 = SGDClassifier(loss='log')
clf4 = RandomForestClassifier(max_features='sqrt') # ~80 #{'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 200}
clf5 = GradientBoostingClassifier()

eclf3 = VotingClassifier(estimators=[('sgd', clf4)], voting='soft', weights=[1], flatten_transform=True)
eclf3.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))



#     print('Random forest')
#     model3 = RandomForestClassifier()
#     model3.fit(train_features, train_label)
#     pred3 = model3.predict(test_features)
#     pred3 = lb.inverse_transform(pred3)
#     DataHelper.save_submission('feature_random_forest', pred3)

#     print('GradientBoostingClassifier')
#     model4 = GradientBoostingClassifier()
#     model4.fit(train_features, train_label)
#     pred4 = model4.predict(test_features)
#     pred4 = lb.inverse_transform(pred4)
#     DataHelper.save_submission('feature_gbc', pred4)

             precision    recall  f1-score   support

          0       0.78      0.36      0.49       188
          1       0.77      0.15      0.25       340
          2       0.79      0.68      0.73       612
          3       0.71      0.89      0.79      1020
          4       0.82      0.40      0.54       283
          5       0.59      0.57      0.58      1058
          6       0.82      0.59      0.69       460
          7       0.84      0.92      0.88      1228
          8       0.71      0.28      0.40       271
          9       0.74      0.92      0.82      3229
         10       0.95      0.60      0.74       207
         11       0.91      0.62      0.74       556
         12       0.81      0.66      0.73       303
         13       0.87      0.93      0.90      2582
         14       0.88      0.73      0.80       300
         15       0.87      0.24      0.38       192
         16       0.64      0.82      0.72      1720
         17       0.80      0.32      0.46   

In [0]:
print("SVM 1vsRest")
svc = OneVsRestClassifier(SVC(C=50,
                                gamma=1.4,
                                coef0=1))

svc.fit(train_feature, train_label)
print('fit model is finished')
svc_prediction = svc.predict(test_feature)
svc_prediction = lb.inverse_transform(svc_prediction)
save_submission(test, 'svc', svc_prediction)

In [0]:
print('start logistic regression')
    logreg = LogisticRegression(C=10, solver='lbfgs', multi_class='multinomial', max_iter=1000, tol=1e-3)
    logreg.fit(train_feature, train_label)
    log_prediction = logreg.predict(test_feature)
    log_prediction = lb.inverse_transform(log_prediction)
    DataHelper.save_submission('logregression', log_prediction)

    print('start SGD')
    sgd = linear_model.SGDClassifier(random_state=0, max_iter=1000, tol=1e-3)
    sgd.fit(train_feature, train_label)
    sgd_prediction = sgd.predict(test_feature)
    sgd_prediction = lb.inverse_transform(sgd_prediction)
    DataHelper.save_submission('sgd', sgd_prediction)

    print('start Naive bayes')
    naive = MultinomialNB()
    naive.fit(train_feature, train_label)
    naive_prediction = naive.predict(test_feature)
    naive_prediction = lb.inverse_transform(naive_prediction)
    DataHelper.save_submission('naive_bayes', naive_prediction)

    print("SVM 1vsRest")
    model = OneVsRestClassifier(SVC(C=100,
                                    gamma=1,
                                    coef0=1,
                                    decision_function_shape=None))

    model.fit(train_feature, train_label)
    svc_prediction = model.predict(test_feature)
    svc_prediction = lb.inverse_transform(svc_prediction)
    DataHelper.save_submission('svc', svc_prediction)

    print('XGBoost')
    xgboost = xgb.XGBClassifier(max_depth=6, n_estimators=1000, learning_rate=0.1
                                , min_child_weight=5,
                                gamma=1,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                nthread=4,
                                scale_pos_weight=1,
                                )
    xgboost.fit(train_feature, train_label)
    xgb_prediction = xgboost.predict(test_feature)
    xgb_prediction = lb.inverse_transform(xgb_prediction)
    DataHelper.save_submission('xgb', xgb_prediction)