<h3>Objective</h3>
I tried the stratified k fold cross validation last time and used the majority vote of the k model predictions to determine the final predictions and that was a slight improvement over earlier.

Next, I want to do a quick grid search to find better parameters for plan old vanilla Support Vector Machine. This darling algorithm of so many people can't be that bad at predicting.

In my next attempt, I will do a data deep dive. Idea would be to analyze where predictions are going wrong and also to think about what might make better features.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display

In [2]:
Training_data = pd.read_json("../input/whatscooking/train.json")
Training_data['str_ingredients'] = Training_data.ingredients.apply(lambda x: ','.join(y for y in x))
#describe_dataset(Training_data, "Training");
Testing_data = pd.read_json("../input/whatscooking/test.json")
Testing_data['str_ingredients'] = Testing_data.ingredients.apply(lambda x: ','.join(y for y in x))

In [3]:
import itertools
grouped_ingredients = itertools.groupby(sorted(list(itertools.chain.from_iterable(Training_data.ingredients))))
ingredient_counts = {key:len(list(group)) for key, group in grouped_ingredients}
global_pantry = pd.DataFrame.from_dict(ingredient_counts, orient='index')
global_pantry.columns = ['num_recipes']

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, vocabulary=global_pantry.index.values)
X_Train = vectorizer.fit_transform(Training_data.str_ingredients)
y_Train = Training_data.cuisine

In [5]:
import scipy.stats
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV
model = SVC()
hyperparams = {'C': scipy.stats.expon(scale=10), 'kernel': ['rbf', 'linear'], 'gamma': scipy.stats.expon(scale=0.1)}
search = RandomizedSearchCV(model, param_distributions=hyperparams, n_iter=10).fit(X_Train, y_Train)

NameError: name 'itemgetter' is not defined

In [7]:
from operator import itemgetter
top_score = sorted(search.grid_scores_, key=itemgetter(1), reverse=True)[0]
params, perform = top_score.parameters, top_score.mean_validation_score
model = SVC(C=params['C'], gamma=params['gamma'], kernel=params['kernel'])

In [8]:
params, perform

({'C': 19.379428286962913, 'gamma': 0.29126260414074895, 'kernel': 'rbf'},
 0.75848544275154628)

In [11]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from time import time

Y_Train = Training_data.cuisine
stratifiedKFolds = StratifiedKFold(Y_Train, n_folds=10)

name = "SVC_rbf"
classifier_generator = lambda : SVC(C = 19.38, gamma = 0.29, kernel='rbf')

classifiers_used = []
scores = []
train_times = []
test_times = []
X_Tests = []

for train, test in stratifiedKFolds:
    
    train = Training_data.iloc[train]
    test = Training_data.iloc[test]
    
    vectorizer = TfidfVectorizer(sublinear_tf=True, vocabulary=global_pantry.index.values)
    X_train = vectorizer.fit_transform(train.str_ingredients)
    y_train = train.cuisine
    X_test = vectorizer.transform(test.str_ingredients)
    y_test = test.cuisine
    X_Tests.append(vectorizer.transform(Testing_data.str_ingredients))

    classifier = classifier_generator()
    
    t0 = time()
    classifier.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    predictions = classifier.predict(X_test)
    test_time = time() - t0

    score = metrics.accuracy_score(y_test, predictions)

    classifiers_used.append(classifier)
    scores.append(score)
    train_times.append(train_time)
    test_times.append(test_time)

mean_score = np.mean(scores)
mean_train_time = np.mean(train_times)
mean_test_time = np.mean(test_time)

print "Number of classifiers: {}, Accuracy: {}, Train time: {}, Test time: {}".format(
    len(classifiers_used), mean_score, mean_train_time, mean_test_time)

Number of classifiers: 10, Accuracy: 0.767814018758, Train time: 116.82714932, Test time: 15.6199150085


In [15]:
for i, classifier in enumerate(classifiers_used):
    print i, classifier

0 SVC(C=19.38, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.29, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
1 SVC(C=19.38, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.29, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
2 SVC(C=19.38, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.29, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
3 SVC(C=19.38, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.29, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
4 SVC(C=19.38, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.29, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
5 SVC(C=19.38, cache_size=200,

In [16]:
Predicted_data = Testing_data.copy()
for i, classifier in enumerate(classifiers_used): 
    Predicted_data['fold{}_pred'.format(i)] = classifier.predict(X_Tests[i])
display(Predicted_data.head())

majority_vote = Predicted_data[['fold0_pred', 
                'fold1_pred', 
                'fold2_pred', 
                'fold3_pred', 
                'fold4_pred', 
                'fold5_pred', 
                'fold6_pred', 
                'fold7_pred', 
                'fold8_pred', 
                'fold9_pred']].mode(axis=1, numeric_only=False)

column_names = ['cuisine']
for i in range(2, majority_vote.shape[1]+1):
    column_names.append('cuisine_{}'.format(i))
majority_vote.columns = column_names
display(majority_vote.describe())
print majority_vote.shape
display(majority_vote.head())

#display(majority_vote[majority_vote.cuisine_2.notnull()].head())
#display(majority_vote[majority_vote.cuisine.notnull()].head())

merged_data = pd.concat([Predicted_data, majority_vote], axis=1)
merged_data[['id', 'cuisine']].head()

Unnamed: 0,id,ingredients,str_ingredients,fold0_pred,fold1_pred,fold2_pred,fold3_pred,fold4_pred,fold5_pred,fold6_pred,fold7_pred,fold8_pred,fold9_pred
0,18009,"[baking powder, eggs, all-purpose flour, raisi...","baking powder,eggs,all-purpose flour,raisins,m...",irish,irish,irish,russian,russian,irish,irish,irish,irish,irish
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...","sugar,egg yolks,corn starch,cream of tartar,ba...",southern_us,southern_us,southern_us,southern_us,southern_us,southern_us,southern_us,southern_us,southern_us,southern_us
2,41580,"[sausage links, fennel bulb, fronds, olive oil...","sausage links,fennel bulb,fronds,olive oil,cub...",italian,italian,italian,italian,italian,italian,italian,italian,italian,italian
3,29752,"[meat cuts, file powder, smoked sausage, okra,...","meat cuts,file powder,smoked sausage,okra,shri...",cajun_creole,cajun_creole,cajun_creole,cajun_creole,cajun_creole,cajun_creole,cajun_creole,cajun_creole,cajun_creole,cajun_creole
4,35687,"[ground black pepper, salt, sausage casings, l...","ground black pepper,salt,sausage casings,leeks...",italian,italian,italian,italian,italian,italian,italian,italian,italian,italian


Unnamed: 0,cuisine,cuisine_2,cuisine_3
count,9944,119,1
unique,20,17,1
top,italian,italian,vietnamese
freq,1988,30,1


(9944, 3)


Unnamed: 0,cuisine,cuisine_2,cuisine_3
0,irish,,
1,southern_us,,
2,italian,,
3,cajun_creole,,
4,italian,,


Unnamed: 0,id,cuisine
0,18009,irish
1,28583,southern_us
2,41580,italian
3,29752,cajun_creole
4,35687,italian


In [17]:
submission = merged_data[['id', 'cuisine']]
import time
submission.to_csv('../output/whatscooking/whatscooking-{}.csv'.format(time.strftime("%Y%m%d--%H%M%S")), index=False)

<h3>And this, the seventh submission, is actually the best submission so far, which is heartening.</h3>