In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Training_data = pd.read_json("../input/whatscooking/train.json")
Training_data['str_ingredients'] = Training_data.ingredients.apply(lambda x: ','.join(y for y in x))
Training_data.describe()

Unnamed: 0,id
count,39774.0
mean,24849.536959
std,14360.035505
min,0.0
25%,12398.25
50%,24887.0
75%,37328.5
max,49717.0


In [2]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(Training_data, test_size = 0.2)
train.describe()

Unnamed: 0,id
count,31819.0
mean,24892.546466
std,14342.813669
min,1.0
25%,12504.5
50%,24910.0
75%,37343.5
max,49716.0


In [3]:
test.describe()

Unnamed: 0,id
count,7955.0
mean,24677.504337
std,14428.337867
min,0.0
25%,11936.5
50%,24803.0
75%,37259.0
max,49717.0


In [4]:
import itertools
global_pantry = pd.DataFrame.from_dict({key:len(list(group)) for key, group in itertools.groupby(sorted(list(itertools.chain.from_iterable(Training_data.ingredients))))}, orient='index')
global_pantry.columns = ['num_recipes']
print global_pantry.count()
global_pantry.head()

num_recipes    6714
dtype: int64


Unnamed: 0,num_recipes
low-sodium fat-free chicken broth,22
sweetened coconut,3
baking chocolate,3
egg roll wrappers,57
bottled low sodium salsa,1


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(sublinear_tf=True, vocabulary=global_pantry.index.values)
vect

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=array([u'low-sodium fat-free chicken broth', u'sweetened coconut', ...,
       u'lower sodium beef broth', u'hot water'], dtype=object))

In [6]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(vect.fit_transform(train.str_ingredients), train.cuisine)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
predictions = classifier.predict(vect.transform(test.str_ingredients))

In [8]:
prediction_comparison = predictions==test.cuisine
total = prediction_comparison.count()
true = prediction_comparison.sum()
accuracy = (100.0 * true)/total
"Total: {}, Correct: {}, Accuracy: {}".format(total, true, accuracy)

'Total: 7955, Correct: 5759, Accuracy: 72.3947203017'

In [9]:
Testing_data = pd.read_json("../input/whatscooking/test.json")
Predictions = classifier.predict(vect.transform(Testing_data.ingredients.apply(lambda x: ','.join(y for y in x))))
Predicted_data = Testing_data.copy()
Predicted_data['cuisine'] = Predictions
submission = Predicted_data[['id', 'cuisine']]
submission

Unnamed: 0,id,cuisine
0,18009,southern_us
1,28583,southern_us
2,41580,italian
3,29752,cajun_creole
4,35687,italian
5,38527,southern_us
6,19666,italian
7,41217,chinese
8,28753,mexican
9,22659,french


In [10]:
import time
submission.to_csv('../output/whatscooking/whatscooking-{}.csv'.format(time.strftime("%Y%m%d--%H%M%S")), index=False)