In [78]:
import pandas as pd
import json
import numpy as np
import re
from nltk import ngrams

from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC

%matplotlib inline

import matplotlib.pyplot as plt
plt.style.use("ggplot")


cats = ['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino', 'french', 'greek', 'indian', 'irish', 'italian', 'jamaican','japanese', 'korean', 'mexican', 'moroccan', 'russian', 'southern_us', 'spanish', 'thai', 'vietnamese']

with open('train.json') as data_file:    
    data = json.load(data_file)

X = []
y =[]
for item in data:
    X.append(', '.join(item['ingredients']))
    y.append(item['cuisine'])    

X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=2)

              brazilian  british  cajun_creole  chinese  filipino  french  \
brazilian            46        1            11        0         1       4   
british               0       72             4        1         2      33   
cajun_creole          0        7           284        1         0       6   
chinese               0        3             6      583         8       1   
filipino              4        4             4       42        73       3   
french                1       29            15        1         1     347   
greek                 0        5             6        0         0       8   
indian                2       13             2        3         1       4   
irish                 1       17             5        0         3      21   
italian               2       31            29        2         0     145   
jamaican              4        3             4        1         2       1   
japanese              1        1             0       48         1      12   
korean                1        0             0       54         0       1   
mexican               8        9            15        3         6       7   
moroccan              0        4             3        0         0       2   
russian               0       12             6        0         3      23   
southern_us           3       18           108        9         8      32   
spanish               0        6            13        1         0      28   
thai                  4        0             1       30         2       0   
vietnamese            3        1             0       18         2       2   

              greek  indian  irish  italian  jamaican  japanese  korean  \
brazilian         0       7      0        5         1         0       0   
british           1       6      9        7         1         1       1   
cajun_creole      0       4      1       16         1         1       1   
chinese           0       4      0        1         2        11      12   
filipino          0       3      0        5         0         1       4   
french           19       2      8      102         1         1       0   
greek           165       5      0       38         0         0       0   
indian           13     653      1        5         1         2       0   
irish             1       1     62        9         0         0       0   
italian          40       1      4     1559         0         0       0   
jamaican          0       7      0        3        74         0       1   
japanese          1      29      1        2         0       208      10   
korean            0       0      0        1         0         6     145   
mexican           4      16      2       20         1         0       2   
moroccan          9      14      1        1         0         1       0   
russian           2       2      3       10         0         0       1   
southern_us       5      11      5       48         7         3       1   
spanish           6       0      0       35         1         0       0   
thai              1      11      0        1         0         3       2   
vietnamese        0       0      0        3         0         3       2   

              mexican  moroccan  russian  southern_us  spanish  thai  \
brazilian           6         1        1           19       10     8   
british             1         0        2           63        2     0   
cajun_creole        8         1        1           40        4     0   
chinese             2         1        2            6        0    15   
filipino            4         0        1           28        2    11   
french              7         3        2           76       20     0   
greek               1        11        3            9        7     0   
indian              5        19        1           15        0    17   
irish               1         2        0           48        3     1   
italian            16         9        2           84       34     4   
jamaican            2         4        0           15        0     2   
japanese            2         0        1           14        0    10   
korean              2         0        0            8        0     2   
mexican          1464         3        1           72       22    10   
moroccan            1       167        0            7        3     1   
russian            10         2       39           18        2     0   
southern_us        40         2        6          728       12     6   
spanish            12         3        0           25      115     2   
thai                2         1        0            2        0   311   
vietnamese          1         1        0            1        0    57   

              vietnamese  
brazilian              0  
british                0  
cajun_creole           0  
chinese               13  
filipino               1  
french                 1  
greek                  0  
indian                 1  
irish                  0  
italian                1  
jamaican               0  
japanese               1  
korean                 1  
mexican                3  
moroccan               1  
russian                0  
southern_us            4  
spanish                0  
thai                  20  
vietnamese           101  


In [2]:
X_train[0:5]

[u'peanuts, salt, water',
 u'large eggs, all-purpose flour, baking soda, buttermilk, yellow corn meal, dried sage, double-acting baking powder, unsalted butter, salt',
 u'pepper, jalapeno chilies, salt, oil, masa harina, lime juice, chili powder, cayenne pepper, onions, water, Mexican oregano, salsa, ground beef, fresh cilantro, garlic, peanut oil, cumin',
 u'peeled fresh ginger, noodles, hoisin sauce, salt, water, green onions, five-spice powder, pork tenderloin, peanut oil',
 u'short-grain rice, hot water, soy sauce, scallions, ponzu, cabbage, fresh ginger, shrimp']

The count vectorizor is splitting on every space, not just on the ", " that we'd prefer.

In [3]:
def remove_spaces(recipe_list):
    return ", ".join(map(lambda s: s.replace(" ", "_"), recipe_list.split(", ")))

X_train_nospc = list(map(remove_spaces, X_train))
X_dev_nospc = list(map(remove_spaces, X_dev))
X_train_nospc[5000:5005]

[u'light_brown_sugar, lemongrass, jalapeno_chilies, cilantro_leaves, vegan_mayonnaise, soy_sauce, mo_hanh, cilantro, ground_white_pepper, liquid_aminos, pickled_carrots, vegetable_oil, ground_coriander, baguette, extra_firm_tofu, garlic, cucumber',
 u'ground_cinnamon, soy_sauce, raisins, brown_sugar, water, salt, chestnuts, pinenuts, jujube, sweet_rice, sesame_oil, walnuts',
 u'dry_white_wine, dry_bread_crumbs, olive_oil, garlic, fresh_parsley, lemon, fillets, ground_black_pepper, salt',
 u'filet_mignon, large_eggs, puff_pastry_sheets, veal_demi-glace, shallots, minced_garlic, mushrooms, Madeira, unsalted_butter, gorgonzola',
 u'scallions, light_coconut_milk, salt, water, basmati_rice']

In [4]:
cv_nospc = CountVectorizer()
tf_X_train_nospc= cv_nospc.fit_transform(X_train_nospc)
tf_X_dev_nospc= cv_nospc.transform(X_dev_nospc)

In [5]:
#[print(feat) for feat in cv_nospc.get_feature_names() if "_" in feat]

#fails = [feat for feat in cv_nospc.get_feature_names() if feat[0] == "_"]
#print fails


In [6]:
mnb = MultinomialNB()
mnb.fit(tf_X_train_nospc,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
metrics.f1_score(y_dev, mnb.predict(tf_X_dev_nospc),average='weighted')

0.71755656988693173

Well, taking out the spaces was no good.

## Adding number of ingredients as a feature

In [8]:
def custom_preprocessor(s):
    """
    Preprocess step for working adjusting the ingredients list in comma
    seperated format
    """

    # Add number of ingredients as feature
    s = s.lower()
    num_ingredients = len(s.split(", "))
    s += ", " + str(num_ingredients)
    return s


In [9]:
cv_count = CountVectorizer(preprocessor = custom_preprocessor)
mnb_with_count = MultinomialNB()

tf_X_train_count = cv_count.fit_transform(X_train)
tf_X_dev_count = cv_count.transform(X_dev)
mnb_with_count.fit(tf_X_train_count, y_train) 
preds_with_count = mnb_with_count.predict(tf_X_dev_count)
metrics.f1_score(y_dev, preds_with_count, average='weighted')

0.72357435544647797

In [10]:
lr = LogisticRegression()
lr.fit(tf_X_train_count, y_train)
lr_preds_with_count = lr.predict(tf_X_dev_count)
metrics.f1_score(y_dev, lr_preds_with_count, average='weighted')

0.77598079133209752

Margininal improvement if any.
Including the remove spaces function in the prepropocessor only hurts the accuracy by about 1-2% for both mnb and lr models.

In [79]:
from itertools import combinations

def custom_tokenizer_2(string):
    result = []
    
    #overall note: the point of sorting the ingredients before adding
    #them to the list is to prevent duplicates that are just flipped
    #like "unsalted butter" and "butter unsalted"
    
    #create an empty list where we're going to put the ngrams
    #where n = 1 so we can later create combinations of those
    single_grams = []
    
    
    for ingredient in string.split(', '):
        for n in range(1,len(ingredient.split())+1):
            grams = ngrams(ingredient.split(' '), n)
            for gram in grams:
                #if the length of the ngram we're looking at is 1,
                #add it to our single grams list.
                if n == 1:
                    single_grams.append(gram[0])
                result.append(" ".join(sorted(list(gram))))
    
    #finally add every combination of the n = 1 ngrams
    #so from ['unsalted butter', 'baking powder']
    #we should be adding: 'butter unsalted', 'baking powder',
    #'baking butter', 'baking unsalted', 'butter powder', 'powder unsalted'
    for combo in combinations(single_grams, 2):
        result.append(' '.join(sorted(list(combo))))
    
    #return the unique elements of this list
    #since there will be plenty of duplicates
    return list(set(result))

In [12]:
def custom_preprocessor(ingredients):
    result = []
    for ingredient in ingredients.split(', '):
        temp = ingredient.lower()
        
        temp = re.sub(r'\d+|&', '', temp)
        temp = re.sub(r' +', ' ', temp)
        temp = ' '.join(word for word in temp.split() if len(word)>2)
        
        result.append("".join(temp))
    
    return ", ".join(result)

def custom_tokenizer(string):
    return string.split(', ') + re.split(', | ',string)

In [13]:

model = LogisticRegression()
vectorizer = CountVectorizer(preprocessor = custom_preprocessor,
                             tokenizer = custom_tokenizer,
                             ngram_range = (0,2))
                             


pipe = Pipeline([("vectorize", vectorizer), ("model", model)])
pipe.fit(X_train, y_train)
preds = pipe.predict(X_dev)

print(metrics.f1_score(y_dev, preds, average='weighted'))
print(pipe.score(X_dev, y_dev))

0.786202735318


0.782105262301


In [14]:
set(y_train)

{u'brazilian',
 u'british',
 u'cajun_creole',
 u'chinese',
 u'filipino',
 u'french',
 u'greek',
 u'indian',
 u'irish',
 u'italian',
 u'jamaican',
 u'japanese',
 u'korean',
 u'mexican',
 u'moroccan',
 u'russian',
 u'southern_us',
 u'spanish',
 u'thai',
 u'vietnamese'}

## Our biggest issues are southern_us and italian foods.

Not with eachother, but Creole and southern are very simmilar and iatlian is also similar to a lot of stuff.

# British-Irish Ensemble Model

In [15]:
BI_indexes = [True if y == u"british" or y == u"irish" else False for y in y_train]
BI_indexes = pd.Series(BI_indexes)

BI_y = pd.Series(y_train)[BI_indexes]
BI_x = pd.Series(X_train)[BI_indexes]

X_train_BI, X_dev_BI, y_train_BI, y_dev_BI = train_test_split(BI_x, BI_y)
print len(X_train_BI)

817


In [18]:
class DenseTransformer(TransformerMixin):
    """
    The dense transformer is required when the model being fit requires a
    dense representation matrix, not a sparse one.
    """

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [81]:
model = GradientBoostingClassifier()

vectorizer = CountVectorizer(preprocessor = custom_preprocessor,
                             tokenizer = custom_tokenizer_2)

pipe_BI = Pipeline([("vectorize", vectorizer),
                  ("to_dense", DenseTransformer()),
                  ("model", model)])

pipe_BI.fit(X_train_BI, y_train_BI)
preds_BI = pipe_BI.predict(X_dev_BI)

print(metrics.f1_score(y_dev_BI, preds_BI, average='weighted'))
print(pipe_BI.score(X_dev_BI, y_dev_BI))
print(classification_report(y_dev_BI, preds_BI))


0.783882783883
             precision    recall  f1-score   support

    british       0.78      0.89      0.83       163
      irish       0.79      0.63      0.70       110

avg / total       0.79      0.78      0.78       273



0.778388145194


In [87]:
model = LogisticRegression(penalty = "l2", C = )

vectorizer = CountVectorizer(preprocessor = custom_preprocessor,
                             tokenizer = custom_tokenizer_2)

pipe_BI = Pipeline([("vectorize", vectorizer),
                  #("to_dense", DenseTransformer()),
                  ("model", model)])

pipe_BI.fit(X_train_BI, y_train_BI)
preds_BI = pipe_BI.predict(X_dev_BI)

print(metrics.f1_score(y_dev_BI, preds_BI, average='weighted'))
print(pipe_BI.score(X_dev_BI, y_dev_BI))
print(classification_report(y_dev_BI, preds_BI))

0.798534798535
             precision    recall  f1-score   support

    british       0.82      0.85      0.83       163
      irish       0.76      0.73      0.74       110

avg / total       0.80      0.80      0.80       273



0.797713820408


# Now we have to train a model to predict the british-irish category.

In [72]:
def make_BI_labels(label):
    if label == u"british" or label == u"irish":
        return u"BI"
    else:
        return label

y_train_non_BI = pd.Series(y_train).map(make_BI_labels)
y_dev_non_BI = pd.Series(y_dev).map(make_BI_labels)
print len(y_train_non_BI)
print len(y_train)



29830
29830


In [86]:
# Fit a model to classifier everything with joined British and Irish
model = LogisticRegression(penalty = "l2")

vectorizer = CountVectorizer(preprocessor = custom_preprocessor,
                             tokenizer = custom_tokenizer_2)

pipe_non_BI = Pipeline([("vectorize", vectorizer),
                  #("to_dense", DenseTransformer()),
                  ("model", model)])

pipe_non_BI.fit(X_train, y_train_non_BI)
preds_non_BI = pipe_non_BI.predict(X_dev)

print(metrics.f1_score(y_dev_non_BI, preds_non_BI, average='weighted'))
print(pipe_non_BI.score(X_dev, y_dev_non_BI))


0.807320997586


0.804034698328


In [88]:
# On the British-Irish predicted recipies, predict with the BI model.
BI_prediction_indexes = [True if y == u"BI" else False for y in preds_non_BI]
fill_in_BI = pipe_BI.predict(pd.Series(X_dev)[BI_prediction_indexes])

# Reconnect the predictions for the final model.
preds_refilled = pd.Series(preds_non_BI)
preds_refilled[BI_prediction_indexes] = fill_in_BI

print(metrics.f1_score(y_dev, preds_refilled, average='weighted'))
print(classification_report(y_dev, preds_refilled))

0.800096784998
              precision    recall  f1-score   support

   brazilian       0.83      0.61      0.70       121
     british       0.58      0.54      0.56       206
cajun_creole       0.79      0.74      0.76       376
     chinese       0.85      0.85      0.85       670
    filipino       0.73      0.65      0.69       190
      french       0.64      0.66      0.65       636
       greek       0.79      0.72      0.75       258
      indian       0.87      0.91      0.89       758
       irish       0.59      0.57      0.58       175
     italian       0.81      0.90      0.85      1963
    jamaican       0.88      0.68      0.77       123
    japanese       0.82      0.74      0.78       342
      korean       0.86      0.77      0.82       221
     mexican       0.91      0.93      0.92      1668
    moroccan       0.84      0.78      0.81       215
     russian       0.75      0.44      0.55       133
 southern_us       0.74      0.77      0.76      1056
     spanish

In [None]:
cats_BI = ['BI', 'brazilian', 'cajun_creole', 'chinese', 'filipino', 'french', 'greek', 'indian', 'italian', 'jamaican','japanese', 'korean', 'mexican', 'moroccan', 'russian', 'southern_us', 'spanish', 'thai', 'vietnamese']

cm = confusion_matrix(y_dev_non_BI, preds_non_BI)
cmdf = pd.DataFrame(cm, index = cats_BI, columns = cats_BI)
#print(cmdf)


# Contiential food Source

Here I attempt to improve the model by adding which continent the 

In [24]:
continents ={'brazilian':"America"
             , 'british': "Europe"
             , 'cajun_creole': "America"
             , 'chinese': "Asia"
             , 'filipino': "Asia"
             , 'french': "Europe"
             , 'greek': "Europe"
             , 'indian': "Asia"
             ,'irish': "Europe"
             , 'italian': "Europe"
             , 'jamaican': "America"
             ,'japanese': "Asia"
             , 'korean': "Asia"
             , 'mexican': "America"
             , 'moroccan': "Africa"
             , 'russian': "Europe"
             , 'southern_us':"America"
             , 'spanish': "Europe"
             , 'thai': "Asia"
             ,'vietnamese': "Asia"}

In [25]:
continents["chinese"]

'Asia'

In [30]:
def find_canned_goods(recipe):
    """This function searches for anything that's canned in a recipie"""
    if "can" in recipe.lower():
        return True
    else:
        return False

In [31]:
df_X_train = pd.DataFrame({"Recipe": X_train, "Label": y_train})
df_X_train["Continent"] = [continents[label] for label in df_X_train["Label"]]
df_X_train["Length"] = df_X_train["Recipe"].apply(lambda r: len(r.split(", ")))
df_X_train["Canned"] = df_X_train["Recipe"].apply(find_canned_goods)
df_X_train.head()

         Label                                             Recipe Continent  \
0  southern_us                               peanuts, salt, water   America   
1  southern_us  large eggs, all-purpose flour, baking soda, bu...   America   
2      mexican  pepper, jalapeno chilies, salt, oil, masa hari...   America   
3      chinese  peeled fresh ginger, noodles, hoisin sauce, sa...      Asia   
4      italian  short-grain rice, hot water, soy sauce, scalli...    Europe   

   Length  Canned  
0       3   False  
1       9   False  
2      17    True  
3       9   False  
4       8   False  

In [27]:
BI_train = df_X_train.loc[BI_indexes]
BI_train.head()

       Label                                             Recipe Continent  \
47     irish  ground black pepper, gran marnier, strawberrie...    Europe   
62     irish  ground black pepper, garlic, dried thyme, butt...    Europe   
72     irish  lemon curd, whipping cream, mint sprigs, orang...    Europe   
81     irish  baking soda, salt, russet potatoes, buttermilk...    Europe   
104  british  pudding, fresh rosemary, cracked black pepper,...    Europe   

     Length  
47        6  
62       11  
72        8  
81        6  
104       6  

In [40]:
df_X_train[["Label", "Canned", "Length"]].groupby("Label").agg(["mean", "std"])

                Canned               Length          
                  mean       std       mean       std
Label                                                
brazilian     0.060694  0.239113   9.352601  5.045010
british       0.056856  0.231761   9.720736  4.118912
cajun_creole  0.090598  0.287160  12.582906  4.648506
chinese       0.076385  0.265680  11.992511  4.097274
filipino      0.053097  0.224426  10.111504  3.969738
french        0.046766  0.211190   9.835821  4.163521
greek         0.029444  0.169139  10.170120  3.725204
indian        0.065479  0.247424  12.685078  4.969620
irish         0.034553  0.182830   9.286585  3.751603
italian       0.050043  0.218051   9.898723  3.803273
jamaican      0.133995  0.341070  12.325062  4.949098
japanese      0.055504  0.229068   9.693802  4.247837
korean        0.062397  0.242075  11.321839  3.829190
mexican       0.146541  0.353685  10.843606  4.643087
moroccan      0.034653  0.183052  12.831683  4.819182
russian       0.058989  0.23

All of the categories are within 1 standard deviation of eachother from all the other categories.  Unlikely to be any predictive power in these.