# imports

In [50]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

In [51]:
list_min=25
list_max=200

# load the data

In [52]:
train = pd.read_json('train.json')

In [53]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


# add extra columns

In [54]:
def add_columns(dataframe, min_count, max_count):
    full_list = pd.read_pickle('ingredient_dataframe.pkl')
    cut = full_list[full_list[0] > min_count]
    cut_list = cut[cut[0] <= max_count]['ingredient']
    for i in cut_list:
        column_name = 'has_'+i.replace(' ','_')
        dataframe[column_name] = dataframe['ingredients'].apply(lambda x: 1 if i in x else 0)

    dataframe.drop(['ingredients','id'], axis=1, inplace=True)
    return dataframe

In [55]:
clean_train = add_columns(train.copy(),list_min, list_max)

In [56]:
clean_train.head()

Unnamed: 0,cuisine,has_egg_roll_wrappers,has_figs,has_whole_wheat_pastry_flour,has_galangal,has_short-grain_rice,has_ground_cayenne_pepper,has_ancho_chile_pepper,has_yellow_mustard_seeds,has_yellow_split_peas,...,has_sherry_wine_vinegar,has_peeled_tomatoes,has_preserved_lemon,has_red_miso,has_pork_ribs,has_boneless_chicken,has_salad_dressing,has_whole_milk_ricotta_cheese,has_cake,has_steamed_white_rice
0,greek,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,southern_us,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,filipino,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# fit model

In [57]:
clean_train_y = clean_train[['cuisine']]
clean_train.drop(['cuisine'], axis=1, inplace=True)

In [58]:
df_train_index, df_test_index = train_test_split(clean_train.index, test_size=0.35)
df_test_x = clean_train.loc[df_test_index]
df_train_x = clean_train.loc[df_train_index]
df_test_y = clean_train_y.loc[df_test_index]
df_train_y = clean_train_y.loc[df_train_index]

In [59]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# estimator = RandomForestClassifier()
# param_grid = dict(n_estimators = [10,25,50,100], class_weight = ['subsample','auto'], 
#                   criterion=['gini','entropy'],max_depth = [None], min_samples_split = [2,5,10,20])

estimator = ExtraTreesClassifier()
param_grid = dict(n_estimators = [10,25,50,100], class_weight = ['subsample','auto'], 
                  criterion=['gini','entropy'],max_depth = [None], min_samples_split = [2,5,10,25])


grid_search = GridSearchCV(estimator, param_grid = param_grid, cv = 2, n_jobs=4, verbose=1)

In [60]:
grid_search.fit(df_train_x,df_train_y['cuisine'])

Fitting 2 folds for each of 64 candidates, totalling 128 fits


[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    8.9s
[Parallel(n_jobs=4)]: Done  50 jobs       | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done 128 out of 128 | elapsed: 27.8min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=4,
       param_grid={'min_samples_split': [2, 5, 10, 25], 'n_estimators': [10, 25, 50, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [None], 'class_weight': ['subsample', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=1)

In [64]:
best_estimator = grid_search.best_estimator_
best_estimator.fit(df_train_x,df_train_y['cuisine'])

ExtraTreesClassifier(bootstrap=False, class_weight='subsample',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=25,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [65]:
train_score = best_estimator.score(df_train_x,df_train_y['cuisine'])
test_score = best_estimator.score(df_test_x,df_test_y['cuisine'])
print "train: ",train_score
print " test: ",test_score

train:  0.61764592117
 test:  0.421952445945


# import test, score, output

In [66]:
test = pd.read_json('test.json')
test.set_index(test['id'],inplace=True)
clean_test = add_columns(test,list_min, list_max)
clean_test.head()

Unnamed: 0_level_0,has_egg_roll_wrappers,has_figs,has_whole_wheat_pastry_flour,has_galangal,has_short-grain_rice,has_ground_cayenne_pepper,has_ancho_chile_pepper,has_yellow_mustard_seeds,has_yellow_split_peas,has_yellow_squash,...,has_sherry_wine_vinegar,has_peeled_tomatoes,has_preserved_lemon,has_red_miso,has_pork_ribs,has_boneless_chicken,has_salad_dressing,has_whole_milk_ricotta_cheese,has_cake,has_steamed_white_rice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29752,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35687,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
output = best_estimator.predict(clean_test)
output

array([u'filipino', u'southern_us', u'cajun_creole', ..., u'italian',
       u'southern_us', u'mexican'], dtype=object)

In [68]:
cuisines = []
for x in output:
    cuisines.append(x)
cuisine_predictions = clean_test.copy()

In [69]:
cuisine_pick = pd.Series(data=cuisines, index=cuisine_predictions.index)
cuisine_predictions['cuisine'] = pd.Series(data=cuisines, index=cuisine_predictions.index)
cuisine_predictions['id'] = cuisine_predictions.index

In [70]:
cuisine_predictions

Unnamed: 0_level_0,has_egg_roll_wrappers,has_figs,has_whole_wheat_pastry_flour,has_galangal,has_short-grain_rice,has_ground_cayenne_pepper,has_ancho_chile_pepper,has_yellow_mustard_seeds,has_yellow_split_peas,has_yellow_squash,...,has_preserved_lemon,has_red_miso,has_pork_ribs,has_boneless_chicken,has_salad_dressing,has_whole_milk_ricotta_cheese,has_cake,has_steamed_white_rice,cuisine,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,filipino,18009
28583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,southern_us,28583
41580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,cajun_creole,41580
29752,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,cajun_creole,29752
35687,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,spanish,35687
38527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,filipino,38527
19666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,filipino,19666
41217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,spanish,41217
28753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,mexican,28753
22659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,moroccan,22659


In [71]:
output_df = cuisine_predictions[['id','cuisine']]

In [72]:
output_df.to_csv('random_forest3.csv', index=False)