In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import json

In [3]:
# Read the training and test recipe collections from their JSON files.
with open('../train.json', 'r') as train_file:
    train = json.load(train_file)

with open('../test.json', 'r') as test_file:
    test = json.load(test_file)

In [4]:
# Flatten the ingredient lists of all training recipes into a single list
# (duplicates kept: one entry per ingredient mention).
# A plain nested comprehension replaces the append-inside-a-comprehension
# form, which built and discarded a list of None for every recipe.
vec = [ingredient for recipe in train for ingredient in recipe['ingredients']]

In [5]:
# (number of distinct ingredient strings, total ingredient mentions) in the training recipes.
len(np.unique(vec)), len(vec)

(6714, 428275)

In [6]:
# Flatten the ingredient lists of all test recipes into a single list
# (duplicates kept: one entry per ingredient mention).
# A plain nested comprehension replaces the append-inside-a-comprehension
# form, which built and discarded a list of None for every recipe.
tvec = [ingredient for recipe in test for ingredient in recipe['ingredients']]

In [7]:
# (number of distinct ingredient strings, total ingredient mentions) in the test recipes.
len(np.unique(tvec)), len(tvec)

(4484, 107395)

In [8]:
# Feature vocabulary: the sorted distinct ingredient strings seen in the
# training recipes. It is built from `vec` only, so ingredients that occur
# solely in the test recipes have no column.
cats = np.unique(np.array(vec))

# Size of the combined train + test vocabulary, for comparison with len(cats).
# Kept as the last expression so the notebook actually displays it; as the
# first statement of the cell its value was computed and silently dropped.
len(np.unique(np.concatenate((np.array(vec), np.array(tvec)))))

In [9]:
# Number of records loaded from ../train.json.
len(train)

39774

In [15]:
# Peek at the 'cuisine' field (the prediction target) of the first training record.
train[0]['cuisine']

u'greek'

In [23]:
from scipy.sparse import csr_matrix

In [26]:
# One-hot encode the training recipes: X[r, c] == 1 when recipe r lists
# ingredient cats[c], else 0 (dtype int8, shape (len(train), len(cats))).
#
# The matrix is assembled directly in sparse form from (row, column)
# coordinates. The previous version first materialised a dense row of
# len(cats) entries per recipe and scanned the recipe's ingredient list once
# per vocabulary entry; it also relied on map() returning a list.
cat_index = {name: col for col, name in enumerate(cats)}

train_rows, train_cols = [], []
for row, recipe in enumerate(train):
    # set(): csr_matrix sums duplicate coordinates, so a repeated ingredient
    # must contribute a single entry to keep the cell value at 1.
    for ingredient in set(recipe['ingredients']):
        col = cat_index.get(ingredient)
        if col is not None:  # ingredients outside the vocabulary are ignored
            train_rows.append(row)
            train_cols.append(col)

X = csr_matrix(
    (np.ones(len(train_rows), dtype='int8'), (train_rows, train_cols)),
    shape=(len(train), len(cats))
)

In [27]:
# Sanity check: (rows, columns) of the training feature matrix.
X.shape

(39774, 6714)

In [28]:
# Cuisine label of every training recipe, in the same order as `train`.
# A list comprehension (rather than map + lambda) always yields a concrete
# list, which is what np.unique needs regardless of Python version.
target = [recipe['cuisine'] for recipe in train]

In [31]:
# Encode the string labels as integers:
#   b - sorted array of the distinct cuisine names
#   y - for each entry of `target`, the index of its cuisine in `b`
#       (so b[y] reproduces `target`; b[...] is used later to decode predictions).
b, y = np.unique(target, return_inverse=True)

In [34]:
# One-hot encode the test recipes against the training vocabulary `cats`:
# test_sp[r, c] == 1 when test recipe r lists ingredient cats[c], else 0
# (dtype int8, shape (len(test), len(cats))). Ingredients that are not in
# `cats` have no column and are dropped.
#
# The matrix is assembled directly in sparse form from (row, column)
# coordinates. The previous version first materialised a dense row of
# len(cats) entries per recipe and scanned the recipe's ingredient list once
# per vocabulary entry; it also relied on map() returning a list.
cat_index = {name: col for col, name in enumerate(cats)}

test_rows, test_cols = [], []
for row, recipe in enumerate(test):
    # set(): csr_matrix sums duplicate coordinates, so a repeated ingredient
    # must contribute a single entry to keep the cell value at 1.
    for ingredient in set(recipe['ingredients']):
        col = cat_index.get(ingredient)
        if col is not None:  # unseen ingredient: no feature column
            test_rows.append(row)
            test_cols.append(col)

test_sp = csr_matrix(
    (np.ones(len(test_rows), dtype='int8'), (test_rows, test_cols)),
    shape=(len(test), len(cats))
)

In [36]:
from sklearn.cross_validation import train_test_split

In [37]:
# Hold out 30% of the labelled recipes for validation.
# random_state is fixed (0, the seed used by the models in this notebook)
# so that re-running the cell reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

In [38]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [51]:
# cross_val_score is imported here, at its first use: the notebook's only
# other import of it sits in a later cell, so running the cells top to bottom
# on a fresh kernel would otherwise fail with a NameError at this point.
from sklearn.cross_validation import cross_val_score

# Cross-validated accuracy of one-vs-rest logistic regression on the full
# training matrix (one score per fold).
cross_val_score(LogisticRegression(multi_class='ovr', random_state=0), X, y, scoring='accuracy')

array([ 0.774787  ,  0.77366317,  0.77882584])

In [50]:
# Cross-validated accuracy of logistic regression with multi_class='multinomial'
# (lbfgs solver) on the full training matrix, for comparison with the 'ovr' variant.
# NOTE(review): cross_val_score must already be imported when this cell runs.
cross_val_score(LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=0), X, y, scoring = 'accuracy')

array([ 0.77184649,  0.77019383,  0.77618473])

In [46]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score

In [59]:
# Hyper-parameter grid for the logistic-regression search: both penalty
# types crossed with a range of inverse regularisation strengths C.
# random_state and multi_class are single-valued, i.e. held fixed.
parameters = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0, 1000.0],
    random_state=[0],
    multi_class=['ovr']
)

In [62]:
# Exhaustive search over `parameters`, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 lets the search run its fits in parallel.
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

In [63]:
# Run the grid search on the full training matrix and encoded labels.
gs.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'multi_class': ['ovr'], 'C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0, 1000.0], 'random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [64]:
# Best cross-validated accuracy found by the grid search.
gs.best_score_

0.78113843214160006

In [65]:
# Estimator carrying the best parameter combination found by the grid search.
gs.best_estimator_

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [66]:
# Final model: the configuration reported by the grid search, written out in
# full so this cell does not depend on the search having been run.
best_settings = dict(
    C=2.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=100,
    multi_class='ovr',
    n_jobs=1,
    penalty='l2',
    random_state=0,
    solver='liblinear',
    tol=0.0001,
    verbose=0,
    warm_start=False
)
clf = LogisticRegression(**best_settings)

In [67]:
# Train the final model on all labelled recipes, then predict the encoded
# cuisine index for every test recipe (fit() returns the fitted estimator).
preds = clf.fit(X, y).predict(test_sp)

In [77]:
# Preview: stack the test recipe ids on top of the decoded cuisine names
# (b[preds] maps predicted indices back to label strings).
# The ids are collected with a list comprehension so np.vstack always
# receives a concrete sequence rather than a map object.
np.vstack(([recipe['id'] for recipe in test], b[preds]))

array([[u'18009', u'28583'],
       [u'41580', u'29752'],
       [u'35687', u'38527'],
       ..., 
       [u'indian', u'french'],
       [u'southern_us', u'italian'],
       [u'southern_us', u'mexican']], 
      dtype='<U21')

In [78]:
# Empty submission table with the two output columns, filled in below.
submission_columns = ['id', 'cuisine']
res = pd.DataFrame(columns=submission_columns)

In [80]:
# Fill the 'id' column with the test recipe ids, in test-set order.
# Bracket assignment always targets a column (attribute assignment only does
# so when the column already exists), and the list comprehension hands
# pandas a concrete list rather than a map object.
res['id'] = [recipe['id'] for recipe in test]

In [81]:
# Fill the 'cuisine' column with the predicted labels, decoding each predicted
# index back to its cuisine name through `b`.
# Bracket assignment always targets a column; attribute assignment only does
# so when the column already exists.
res['cuisine'] = b[preds]

In [84]:
# Write the submission table to ./first.csv, omitting the DataFrame index.
res.to_csv('./first.csv', index=False)