In [6]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Read the recipe records from disk.
with open('train.json') as train_file:
    data = json.load(train_file)

# One comma-joined string of ingredients per record, paired with its cuisine label.
X = [', '.join(recipe['ingredients']) for recipe in data]
y = [recipe['cuisine'] for recipe in data]

# Hold out a dev split; random_state is fixed so the split is repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=2)

In [13]:
# Show how many examples landed in the training split, then in the dev split.
for split in (X_train, X_dev):
    print(len(split))

29830
9944


In [24]:
# Simplest version: CountVectorizer with its default settings.
# Author's note: f1_score is .775
cv = CountVectorizer()
# The vocabulary is learned from the training text only; the dev text is then
# mapped onto that same vocabulary.
tf_X_train, tf_X_dev = cv.fit_transform(X_train), cv.transform(X_dev)

In [25]:
# Fit logistic regression on the training counts, then label the dev split.
lr = LogisticRegression().fit(tf_X_train, y_train)
predictions = lr.predict(tf_X_dev)

In [26]:
# Per-cuisine precision/recall/f1 table, followed by the weighted-average f1.
report = classification_report(y_dev, predictions)
weighted_f1 = metrics.f1_score(y_dev, predictions, average='weighted')
print(report)
print(weighted_f1)

              precision    recall  f1-score   support

   brazilian       0.76      0.55      0.63       121
     british       0.59      0.33      0.43       206
cajun_creole       0.78      0.70      0.74       376
     chinese       0.79      0.85      0.82       670
    filipino       0.71      0.54      0.61       190
      french       0.59      0.64      0.61       636
       greek       0.76      0.70      0.73       258
      indian       0.85      0.89      0.87       758
       irish       0.67      0.47      0.55       175
     italian       0.80      0.90      0.85      1963
    jamaican       0.81      0.70      0.75       123
    japanese       0.82      0.69      0.75       342
      korean       0.84      0.74      0.79       221
     mexican       0.91      0.92      0.91      1668
    moroccan       0.81      0.78      0.80       215
     russian       0.66      0.40      0.50       133
 southern_us       0.69      0.77      0.73      1056
     spanish       0.63    

In [9]:
# Features are single words plus pairs of adjacent words (ngram_range=(1, 2)).
# Note: limiting the vocabulary with min_df=10 made it worse.
# Slight improvement - f1_score is .780
cv_bigrams = CountVectorizer(ngram_range=(1, 2))
tf_X_train_bigrams, tf_X_dev_bigrams = (
    cv_bigrams.fit_transform(X_train),
    cv_bigrams.transform(X_dev),
)

In [13]:
# Train a separate logistic regression on the unigram+bigram counts.
lr_bigrams = LogisticRegression()
lr_bigrams.fit(tf_X_train_bigrams, y_train)
# Predict with the model that was just fit on these features. The earlier `lr`
# was trained on the unigram vocabulary, so feeding it the bigram matrix either
# fails on a fresh kernel or silently scores the wrong model.
predictions = lr_bigrams.predict(tf_X_dev_bigrams)

In [14]:
# Per-cuisine precision/recall/f1 table, followed by the weighted-average f1.
report = classification_report(y_dev, predictions)
weighted_f1 = metrics.f1_score(y_dev, predictions, average='weighted')
print(report)
print(weighted_f1)

              precision    recall  f1-score   support

   brazilian       0.77      0.55      0.64       121
     british       0.53      0.34      0.41       206
cajun_creole       0.80      0.72      0.76       376
     chinese       0.82      0.85      0.83       670
    filipino       0.69      0.59      0.63       190
      french       0.58      0.64      0.61       636
       greek       0.76      0.67      0.72       258
      indian       0.86      0.89      0.87       758
       irish       0.68      0.46      0.55       175
     italian       0.80      0.90      0.85      1963
    jamaican       0.84      0.67      0.74       123
    japanese       0.82      0.71      0.76       342
      korean       0.86      0.75      0.80       221
     mexican       0.91      0.92      0.92      1668
    moroccan       0.84      0.77      0.80       215
     russian       0.75      0.41      0.53       133
 southern_us       0.70      0.78      0.74      1056
     spanish       0.65    

In [55]:
# Print the five heaviest weights - positive or negative - for each cuisine.

largestWeightedWords = []
largestWeightedIndeces = []

# Newer scikit-learn releases dropped get_feature_names(); use
# get_feature_names_out() when the vectorizer offers it, else fall back.
if hasattr(cv_bigrams, 'get_feature_names_out'):
    cv_bigram_featurenames = cv_bigrams.get_feature_names_out()
else:
    cv_bigram_featurenames = cv_bigrams.get_feature_names()

# Take the cuisine labels from the fitted model instead of a hand-typed list:
# the rows of coef_ follow classes_, so the names can never drift out of step
# with the weights they label.
cats = lr_bigrams.classes_.tolist()

for cat, cat_weights in enumerate(lr_bigrams.coef_):
    print(cats[cat])
    # argsort is ascending, so the last five positions are the largest magnitudes.
    weightIndeces = np.argsort(np.abs(cat_weights))[-5:]
    for index in weightIndeces:
        weight = cat_weights[index]
        print(cv_bigram_featurenames[index] + " " + str(weight))
    print('\n')

brazilian
manioc 1.81684210974
manioc flour 1.81684210974
black beans 1.8963307854
tapioca flour 1.94343426701
cachaca 4.42654489565


british
stilton cheese 1.58576911257
jam 1.60918448539
mincemeat 1.70117801631
marmite 2.14242286153
stilton 3.28159490246


cajun_creole
oil powdered 1.39534041956
powder dried 1.50187637936
cajun seasoning 1.67942575972
creole 2.32921833546
cajun 2.3873676481


chinese
szechwan peppercorns 1.58492318786
kimchi -1.62362458022
mandarin 1.80026540523
sake -1.89346466114
mirin -2.62748695135


filipino
tilapia 1.38693410006
mirin -1.42974680294
basil -1.49834913221
calamansi 1.94418668915
lumpia 1.97017120327


french
snails 1.60195331392
grits -1.75811624417
duck 1.76781095857
pasta -1.77045743012
cognac 1.99113545665


greek
phyllo 2.25904621531
tahini 2.27620076599
feta cheese 2.56303692104
feta 2.63273494929
greek 2.91312357351


indian
curds 1.95553708379
tandoori 2.20226763204
masala 2.23408928644
curry 2.41600137731
yoghurt 2.5569324714


irish
bri

In [90]:
# Confusion matrix for the dev split as a labelled DataFrame
# (rows: true cuisine, columns: predicted cuisine).
# Passing labels=cats pins the matrix's row/column order to the exact list used
# for the DataFrame axes, and keeps its shape fixed even if a cuisine happens to
# be missing from both y_dev and predictions.
cm = confusion_matrix(y_dev, predictions, labels=cats)
cmdf = pd.DataFrame(cm, index=cats, columns=cats)
print(cmdf)


              brazilian  british  cajun_creole  chinese  filipino  french  \
brazilian            66        0             3        0         2       3   
british               1       70             2        0         3      40   
cajun_creole          1        2           271        2         0      11   
chinese               0        1             4      569         9       2   
filipino              3        2             1       17       112       4   
french                0        6             5        1         5     408   
greek                 0        0             0        1         0       6   
indian                1        2             0        3         1       6   
irish                 0       16             0        1         0      17   
italian               0        7             4        3         3      78   
jamaican              2        4             1        2         2       0   
japanese              2        2             0       24         2      10   

(20L, 20L)

In [83]:
# Find the biggest mistakes: for every dev example, how many times more probable
# the model's top cuisine is than the true cuisine (1.0 when the top pick is right).
# The log-probabilities must come from lr_bigrams, the model that was fit on the
# bigram features; the unigram `lr` was trained on a different vocabulary.
pred_probs = lr_bigrams.predict_log_proba(tf_X_dev_bigrams)

# Columns of pred_probs follow lr_bigrams.classes_, so resolve each true label
# to its column once through a dict rather than a list scan per row.
class_column = {label: col for col, label in enumerate(lr_bigrams.classes_.tolist())}
true_columns = [class_column[label] for label in y_dev]

# Subtract in log space before exponentiating: exp(max) / exp(correct) divides
# by zero once the correct-class probability underflows.
top_log_prob = pred_probs.max(axis=1)
true_log_prob = pred_probs[np.arange(pred_probs.shape[0]), true_columns]
rs = list(np.exp(top_log_prob - true_log_prob))
len(rs)

9944

In [84]:
# Walk the 20 largest ratios in rs; argsort is ascending, so the worst prints last.
biggestMistakeIndeces = np.argsort(rs)[-20:]
for idx in biggestMistakeIndeces:
    log_probs = pred_probs[idx]
    # First column holding the maximum log-probability is the predicted cuisine.
    predicted_cat = cats[np.argmax(log_probs)]
    print(idx)
    print('Real category: ' + y_dev[idx])
    print('Probabilities:', np.exp(log_probs))
    print('Predicted category: ' + predicted_cat)
    print(rs[idx])
    print('\n\n' + X_dev[idx] + '\n\n')

2167
Real category: russian
('Probabilities:', array([  2.66931432e-05,   7.22968802e-01,   6.48334926e-05,
         8.45222784e-04,   3.74739540e-07,   2.78857247e-02,
         7.74505515e-06,   1.13152864e-01,   2.36254112e-03,
         4.69334972e-02,   4.00059541e-03,   7.02108197e-04,
         8.29881893e-05,   1.48489817e-03,   8.39546786e-05,
         3.41201498e-04,   7.90020480e-02,   8.38741082e-06,
         2.45087796e-05,   2.10115252e-05]))
Predicted category: british
2118.89105649


warm water, light molasses, large eggs, whipping cream, ground cinnamon, baking soda, semisweet chocolate, light corn syrup, all-purpose flour, ground ginger, golden brown sugar, unsalted butter, baking powder, salt, ground cloves, crystallized ginger, peeled fresh ginger, vanilla extract


6900
Real category: vietnamese
('Probabilities:', array([  2.85525030e-05,   4.89874605e-06,   2.92045079e-05,
         9.83721429e-01,   6.29499266e-04,   7.77219600e-05,
         1.18148458e-05,   5.76288