# Load datasets

In [12]:
import json

def read_dataset(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed
    as soon as parsing finishes (or fails), instead of being left for
    the garbage collector. It is decoded as UTF-8 rather than with the
    platform's default encoding, so non-ASCII text loads the same way
    on every OS.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as dataset_file:
        return json.load(dataset_file)

# Parse both JSON files from the local ./data directory.
train = read_dataset(path='./data/train.json')
test = read_dataset(path='./data/test.json')

# TF-IDF pipeline

In [63]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

class DocumentsExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that turns records into text documents.

    Each input record is indexed with the key ``'ingredients'`` and the
    resulting sequence of strings is joined with single spaces.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list with one space-joined ingredient string per record in ``X``."""
        documents = []
        for record in X:
            documents.append(" ".join(record['ingredients']))
        return documents
    
# Two-step transformer: records -> joined ingredient strings -> TF-IDF matrix.
tfidf_pip = Pipeline(steps=[
    ('doc_extractor', DocumentsExtractor()),
    ('tfidf_vectorizer', TfidfVectorizer()),
])

# Labels are read from each record's 'cuisine' key; features come from
# fitting the pipeline on the same records.
y_train = list(record['cuisine'] for record in train)
X_train = tfidf_pip.fit_transform(train)

# Quick scoring

In [None]:
from sklearn.model_selection import cross_val_score

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, scored with
# cross_val_score's default CV splitter and default scorer.
lr_clf = LogisticRegression()

cross_val_score(estimator=lr_clf, X=X_train, y=y_train)

array([ 0.77267587,  0.77011841,  0.7693933 ])

## GaussianNB

In [33]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings. The feature matrix is
# converted with .toarray() before being handed to the estimator.
gnb_clf = GaussianNB()

cross_val_score(estimator=gnb_clf, X=X_train.toarray(), y=y_train)

array([ 0.26690794,  0.26849687,  0.25701781])

## MultinomialNB

In [38]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, scored on the TF-IDF
# features as-is (no .toarray() conversion here).
mnb_clf = MultinomialNB()

cross_val_score(estimator=mnb_clf, X=X_train, y=y_train)

array([ 0.66402775,  0.65721397,  0.66110776])

In [66]:
# numpy is needed for np.linspace below; it was not imported anywhere
# earlier in the notebook, so a fresh kernel would fail with NameError.
import numpy as np
from sklearn.model_selection import GridSearchCV

# Full pipeline (raw records -> TF-IDF -> MultinomialNB) so that the
# vectorizer settings can be searched together with the classifier's.
mnb_pipeline = Pipeline([
    ('tfidf_pip', tfidf_pip),
    ('mnb', MultinomialNB())
])

# Keys follow the "<step>__<param>" convention; the vectorizer sits
# inside the nested 'tfidf_pip' pipeline, hence the double prefix.
# Grid size: 6 * 2 * 4 * 2 * 2 = 192 parameter combinations.
grid_params = {
    'mnb__alpha': np.linspace(0.5, 1.5, 6),
    'mnb__fit_prior': [True, False],
    'tfidf_pip__tfidf_vectorizer__max_df': np.linspace(0.2, 1, 4),
    'tfidf_pip__tfidf_vectorizer__binary': [True, False],
    'tfidf_pip__tfidf_vectorizer__norm': ['l1', 'l2'],
}

# The search is fitted on the raw records (not X_train) because the
# pipeline performs the feature extraction itself.
clf = GridSearchCV(mnb_pipeline, grid_params, verbose=3, n_jobs=-1)
clf.fit(train, y_train)

print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

KeyboardInterrupt: 

## Support Vector Classifier

In [22]:
from sklearn.svm import SVC

# Support vector classifier with every hyper-parameter left at its
# default, scored the same way as the other baselines.
svc_clf = SVC()

cross_val_score(estimator=svc_clf, X=X_train, y=y_train)

array([ 0.19701425,  0.19707369,  0.19710232])

In [27]:
from sklearn.multiclass import OneVsRestClassifier

# RBF-kernel SVC with hand-picked hyper-parameters. Most arguments are
# written out explicitly, including several that restate a default.
classifier = SVC(
    C=100,  # penalty parameter, set larger than the default
    kernel='rbf',  # kernel type, rbf working fine here
    degree=3,  # default value; only used by the 'poly' kernel, ignored by 'rbf'
    gamma=1,  # kernel coefficient, not tuned yet
    coef0=1,  # only significant for 'poly'/'sigmoid' kernels, ignored by 'rbf'
    shrinking=True,  # using shrinking heuristics
    tol=0.001,  # stopping criterion tolerance
    probability=False,  # no need to enable probability estimates
    cache_size=200,  # 200 MB cache size
    class_weight=None,  # all classes are treated equally
    verbose=False,  # do not print solver logs
    max_iter=-1,  # no limit, let it run
    # Documented values are 'ovo' and 'ovr' only; None is not one of them.
    # The multiclass strategy is supplied by the OneVsRestClassifier
    # wrapper below, which fits this estimator on binary problems.
    decision_function_shape='ovr',
    random_state=None
)
model = OneVsRestClassifier(classifier)

cross_val_score(model, X_train, y_train)

array([ 0.81150569,  0.80797949,  0.81165107])