In [3]:
import pandas as pd
import numpy as np 
import seaborn as sns
import csv
import matplotlib.pyplot as plt 
import collections
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

### Metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

### Gridsearch
from pprint import pprint
from time import time
import logging
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianawad/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrianawad/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
#Loading Data
train = pd.read_csv("train_data_w_ingredients.csv", sep=';', error_bad_lines=False, index_col=0)
test = pd.read_csv("test_data_w_ingredients.csv", sep=';', error_bad_lines=False, index_col=0)

In [7]:
train.drop(["index"], axis=1, inplace=True)
train.drop(["name"], axis=1, inplace=True)
test.drop(["index"], axis=1, inplace=True)
test.drop(["name"], axis=1, inplace=True)

KeyError: "['index'] not found in axis"

In [8]:
train.sample(15)

Unnamed: 0,ingredients,cuisine
225,"['sauce', '2 cups crushed tomatoes (fire-roast...",Italian
377,"['1 pound sea scallops (fresh)', '1 teaspoon s...",Iberic
370,"['3 plum tomatoes (ripe)', '7 ounces savoy cab...",Iberic
127,['4 slices white sandwich bread (torn into qua...,french
34,"['2 quarts chicken broth (organic low-sodium, ...",Vietnamese
265,"['1 teaspoon soy sauce', '1 tablespoon dark so...",Thai
94,"['1 tablespoon olive oil', '4 lamb shanks', 's...",Greek
48,"['1 spring roll wrapper (pack, I only used hal...",Vietnamese
219,"['1 loaf Italian bread (or ciabatta bread, 16 ...",Italian
147,"['2 tablespoons olive oil', '1 kilogram beef (...",french


In [19]:
european = ["Italian","Iberic","french","Greek"]

def continental (row):
    if row['cuisine'] in european:
        return 'european'
    return 'asian'

train["continental"]=train.apply(lambda row: continental(row), axis=1)
train.sample(15)

Unnamed: 0,ingredients,cuisine,continental
230,"['3 pounds boneless beef bottom round roast', ...",Italian,european
51,"['4 garlic cloves', '5 tablespoons fresh dill ...",Greek,european
395,"['1 pound potato (/ 3 medium potatoes, washed ...",Iberic,european
74,"['2 tablespoons olive oil (plus more)', '1 lar...",Greek,european
235,"['1 pound spaghetti', '4 large eggs (as fresh ...",Italian,european
366,"['1 loaf bread (about 16 slices)', 'milk', 'br...",Iberic,european
302,"['6 chicken thighs (free range)', '2 tablespoo...",Chinese,asian
383,"['1 cup long grain rice', '2 cups water', '1/4...",Iberic,european
134,"['4 ounces lean bacon (chunk)', '2 tablespoons...",french,european
286,"['1/2 tablespoon oil (or water)', '4 garlic cl...",Thai,asian


In [25]:
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text=str(text) #stringify
    text = text.lower() #kleinschreibung
    text = re.sub("\(.*?\)","()",text) #Alternativen entfernen - also alles zwischen ( )
    text=text.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    lemma_words=[lemmatizer.lemmatize(w) for w in filtered_words] 
    """lemma_words = [w for w in lemma_words if not w in measures]
    lemma_words = [w for w in lemma_words if not w in data_leaks]
    lemma_words = [w for w in lemma_words if not w in common_remove]
    lemma_words = [w for w in lemma_words if not w in useless_singles]"""
    
    return " ".join(lemma_words)


# Apply to the DF series
train['cleanText']=train['ingredients'].map(lambda s:preprocess(s)) 
train.drop(["ingredients"], axis=1, inplace=True)
test['cleanText']=test['ingredients'].map(lambda s:preprocess(s)) 
test.drop(["ingredients"], axis=1, inplace=True)
train.head(3)

Unnamed: 0,cuisine,cleanText
1,Vietnamese,cup beef broth cup water yellow onion clove ga...
2,Vietnamese,pound ground chicken tablespoon fish sauce oni...
3,Vietnamese,pound pork blade steak tablespoon light brown ...


In [7]:
X_train = train['cleanText']
y_train = train['cuisine']

X_test = test['cleanText']
y_test = test['cuisine']

In [8]:
# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

data = train


# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80),
    
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.cleanText, data.cuisine)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (10, 50, 80),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 716 tasks      | elapsed:   24.9s


KeyboardInterrupt: ignored

In [11]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_train)

print('accuracy %s' % accuracy_score(y_pred, y_train))
print(classification_report(y_train, y_pred))

accuracy 0.8775
              precision    recall  f1-score   support

     Chinese       0.84      0.98      0.91        60
       Greek       0.71      0.98      0.82        50
      Iberic       1.00      0.47      0.64        40
     Italian       0.98      0.96      0.97        50
        Thai       0.87      0.90      0.88        50
  Vietnamese       0.98      0.80      0.88        50
      french       0.87      0.90      0.88        50
      korean       0.96      0.92      0.94        50

    accuracy                           0.88       400
   macro avg       0.90      0.86      0.87       400
weighted avg       0.90      0.88      0.87       400



In [12]:
knn = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', KNeighborsClassifier()),
              ])
knn.fit(X_train, y_train)

y_pred = knn.predict(X_train)

print('accuracy %s' % accuracy_score(y_pred, y_train))
print(classification_report(y_train, y_pred))

accuracy 0.835
              precision    recall  f1-score   support

     Chinese       0.80      0.92      0.85        60
       Greek       0.75      0.96      0.84        50
      Iberic       0.83      0.60      0.70        40
     Italian       0.87      0.82      0.85        50
        Thai       0.78      0.94      0.85        50
  Vietnamese       1.00      0.68      0.81        50
      french       0.90      0.90      0.90        50
      korean       0.85      0.80      0.82        50

    accuracy                           0.83       400
   macro avg       0.85      0.83      0.83       400
weighted avg       0.85      0.83      0.83       400



In [15]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net",
         "AdaBoost", "multinomial Naive Bayes","SGD"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    MultinomialNB(),
    SGDClassifier()]

In [18]:
# iterate over classifiers
for name, clf in zip(names, classifiers):

    pipeline = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', clf),
              ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    print('**************')
    print(name)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    #print(classification_report(y_test, y_pred))

**************
Nearest Neighbors
accuracy 0.775
**************
Linear SVM
accuracy 0.125
**************
RBF SVM
accuracy 0.9
**************
Decision Tree
accuracy 0.55
**************
Random Forest
accuracy 0.275
**************
Neural Net
accuracy 0.8
**************
AdaBoost
accuracy 0.35
**************
multinomial Naive Bayes
accuracy 0.625
**************
SGD
accuracy 0.825


In [19]:
rbf_svm = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', SVC(gamma=2, C=1)),
              ])
rbf_svm.fit(X_train, y_train)

y_pred = rbf_svm.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.9
              precision    recall  f1-score   support

     Chinese       1.00      1.00      1.00         5
       Greek       1.00      1.00      1.00         5
      Iberic       1.00      1.00      1.00         5
     Italian       0.80      0.80      0.80         5
        Thai       1.00      1.00      1.00         5
  Vietnamese       0.83      1.00      0.91         5
      french       0.67      0.80      0.73         5
      korean       1.00      0.60      0.75         5

    accuracy                           0.90        40
   macro avg       0.91      0.90      0.90        40
weighted avg       0.91      0.90      0.90        40



In [None]:
# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

data = train


# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# max combinations: 3x4x2x2x2x4x2x2= 1536
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': (1, 10, 100, 1000),
    'clf__gamma': (0.001, 0.0001,1, 2),
    'clf__kernel': ('linear', 'rbf'), 
}

"""if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.cleanText, data.cuisine)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))"""

In [None]:
# Based on http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

data = train


# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# max combinations: 3x4x2x2x2x4x2x2= 1536
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__hidden_layer_sizes': [(10,30,10),(20,)],
    'clf__activation': ['tanh', 'relu'],
    'clf__solver': ['sgd', 'adam'],
    'clf__alpha': [0.0001, 0.05],
    'clf__learning_rate': ['constant','adaptive'], 
}

"""if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=2)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.cleanText, data.cuisine)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))"""