In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jason/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import json
import re
import unidecode
import numpy as np
import pandas as pd
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [3]:
train = pd.read_json('/home/jason/tm_week5/trainfile2.json')
test = pd.read_json('testfile2a.json')
train

Unnamed: 0,id,cuisine,ingredients
0,786437,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,524295,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
2,524306,indian,"[water, vegetable oil, wheat, salt]"
3,524307,indian,"[black pepper, shallots, cornflour, cayenne pe..."
4,524308,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge..."
...,...,...,...
31813,381385,irish,"[light brown sugar, granulated sugar, butter, ..."
31814,119241,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
31815,643533,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
31816,119245,chinese,"[boneless chicken skinless thigh, minced garli..."


In [4]:
train['num_ingredients'] = train['ingredients'].apply(len)
train = train[train['num_ingredients'] > 1]
train

Unnamed: 0,id,cuisine,ingredients,num_ingredients
0,786437,greek,"[romaine lettuce, black olives, grape tomatoes...",9
1,524295,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
2,524306,indian,"[water, vegetable oil, wheat, salt]",4
3,524307,indian,"[black pepper, shallots, cornflour, cayenne pe...",20
4,524308,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",12
...,...,...,...,...
31813,381385,irish,"[light brown sugar, granulated sugar, butter, ...",12
31814,119241,italian,"[KRAFT Zesty Italian Dressing, purple onion, b...",7
31815,643533,irish,"[eggs, citrus fruit, raisins, sourdough starte...",12
31816,119245,chinese,"[boneless chicken skinless thigh, minced garli...",21


In [5]:
lemmatizer = WordNetLemmatizer()
def preprocess(ingredients):
    ingredients_text = ' '.join(ingredients)
    ingredients_text = ingredients_text.lower()
    ingredients_text = ingredients_text.replace('-', ' ')
    words = []
    for word in ingredients_text.split():
        if re.findall('[0-9]', word): continue
        if len(word) <= 2: continue
        if '’' in word: continue
        word = lemmatizer.lemmatize(word)
        if len(word) > 0: words.append(word)
    return ' '.join(words)

for ingredient, expected in [
    ('Eggs', 'egg'),
    ('all-purpose flour', 'all purpose flour'),
    ('purée', 'purée'),
    ('1% low-fat milk', 'low fat milk'),
    ('half & half', 'half half'),
    ('safetida (powder)', 'safetida (powder)')
]:
    actual = preprocess([ingredient])
    assert actual == expected, f'"{expected}" is excpected but got "{actual}"'


In [6]:
train['x'] = train['ingredients'].progress_apply(preprocess)
test['x'] = test['ingredients'].progress_apply(preprocess)
train.head()

100%|██████████| 31803/31803 [00:07<00:00, 4338.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 7955/7955 [00:02<00:00, 3565.57it/s]


Unnamed: 0,id,cuisine,ingredients,num_ingredients,x
0,786437,greek,"[romaine lettuce, black olives, grape tomatoes...",9,romaine lettuce black olive grape tomato garli...
1,524295,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,egg pepper salt mayonaise cooking oil green ch...
2,524306,indian,"[water, vegetable oil, wheat, salt]",4,water vegetable oil wheat salt
3,524307,indian,"[black pepper, shallots, cornflour, cayenne pe...",20,black pepper shallot cornflour cayenne pepper ...
4,524308,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",12,plain flour sugar butter egg fresh ginger root...


In [7]:
vectorizer = make_pipeline(
    TfidfVectorizer(sublinear_tf=True),
    FunctionTransformer(lambda x: x.astype('float32'), validate=False)
)

x_train = vectorizer.fit_transform(train['x'].values)
x_train.sort_indices()
x_test = vectorizer.transform(test['x'].values)

In [8]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['cuisine'].values)
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

{'brazilian': 0,
 'british': 1,
 'cajun_creole': 2,
 'chinese': 3,
 'filipino': 4,
 'french': 5,
 'greek': 6,
 'indian': 7,
 'irish': 8,
 'italian': 9,
 'jamaican': 10,
 'japanese': 11,
 'korean': 12,
 'mexican': 13,
 'moroccan': 14,
 'russian': 15,
 'southern_us': 16,
 'spanish': 17,
 'thai': 18,
 'vietnamese': 19}

In [9]:
estimator = SVC(
    C=80,
    kernel='rbf',
    gamma=1.7,
    coef0=1,
    cache_size=500,
)
classifier = OneVsRestClassifier(estimator, n_jobs=-1)

In [10]:
%%time
scores = cross_validate(classifier, x_train, y_train, cv=3, n_jobs=-1)
scores['test_score'].mean()

CPU times: user 165 ms, sys: 137 ms, total: 302 ms
Wall time: 12min 48s


0.8071253281121206

In [11]:
%%time
classifier.fit(x_train, y_train)



CPU times: user 232 ms, sys: 203 ms, total: 435 ms
Wall time: 13min 16s


OneVsRestClassifier(estimator=SVC(C=80, cache_size=500, class_weight=None,
                                  coef0=1, decision_function_shape='ovr',
                                  degree=3, gamma=1.7, kernel='rbf',
                                  max_iter=-1, probability=False,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=False),
                    n_jobs=-1)

In [12]:
y_pred = label_encoder.inverse_transform(classifier.predict(x_train))
y_true = label_encoder.inverse_transform(y_train)

print(f'accuracy score on train data: {accuracy_score(y_true, y_pred)}')

def report2dict(cr):
    rows = []
    for row in cr.split("\n"):
        parsed_row = [x for x in row.split("  ") if len(x) > 0]
        if len(parsed_row) > 0: rows.append(parsed_row)
    measures = rows[0]
    classes = defaultdict(dict)
    for row in rows[1:]:
        class_label = row[0]
        for j, m in enumerate(measures):
            classes[class_label][m.strip()] = float(row[j + 1].strip())
    return classes
report = classification_report(y_true, y_pred)
pd.DataFrame(report2dict(report)).T

accuracy score on train data: 0.9996855642549445


IndexError: list index out of range

In [13]:
y_pred = label_encoder.inverse_transform(classifier.predict(x_test))
test['cuisine'] = y_pred
test[['id', 'cuisine']].to_csv('submission.csv', index=False)
test[['id', 'cuisine']].head()

Unnamed: 0,id,cuisine
0,996716,indian
1,829945,mexican
2,829949,cajun_creole
3,829953,indian
4,829954,mexican
