# What's Cooking  in Python
https://www.kaggle.com/manuelatadvice/whats-cooking/noname/code

### Links: [local](http://localhost:8888/notebooks/classes/12-text-mining/whats-cooking-python.ipynb) [github](https://github.com/AnalyticsDojo/materials/blob/master/analyticsdojo/classes/12-text-mining/whats-cooking-python.ipynb) [slides](http://nbviewer.jupyter.org/format/slides/github/AnalyticsDojo/materials/blob/master/analyticsdojo/classes/12-text-mining/whats-cooking-python.ipynb#/)

In [91]:

!pip install nltk



In [92]:
!pip install sklearn



In [93]:
#This imports a bunch of packages.  
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from collections import Counter
import json
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import grid_search



In [94]:

#If you import the codes locally, this seems to cause some issues.  
import json
from urllib.request import urlopen

urltrain= 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtrain.json'
urltest = 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtest.json'


train = pd.read_json(urlopen(urltrain))
test = pd.read_json(urlopen(urltest))

In [95]:
#First we want to see the most popular cuisine for the naive model. 
train.groupby('cuisine').size()

cuisine
brazilian        467
british          804
cajun_creole    1546
chinese         2673
filipino         755
french          2646
greek           1175
indian          3003
irish            667
italian         7838
jamaican         526
japanese        1423
korean           830
mexican         6438
moroccan         821
russian          489
southern_us     4320
spanish          989
thai            1539
vietnamese       825
dtype: int64

In [96]:
#Here we write the most popular selection.  This is the baseline by which we will judge other models. 
test['cuisine']='italian'

In [97]:
#THis is a much more simple version that selects out the columns ID and cuisinte
submission=test[['id' ,  'cuisine' ]]
#This is a more complex method I showed that gives same.
#submission=pd.DataFrame(test.ix[:,['id' ,  'cuisine' ]])

In [98]:
#This outputs the file.
submission.to_csv("1_cookingSubmission.csv",index=False)


In [99]:
#So it seems there is some data we need to use the NLTK leemmatizer.  
stemmer = WordNetLemmatizer()
nltk.download('wordnet')

 

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [100]:
#We see this in a Python Solution. 
train['ingredients_clean_string1'] = [','.join(z).strip() for z in train['ingredients']] 

#We also know that we can do something similar though a Lambda function. 
strip = lambda x: ' , '.join(x).strip() 
#Finally, we call the function for name
train['ingredients_clean_string2'] = train['ingredients'].map(strip)

#Now that we used the lambda function, we can reuse this for the test dataset. 
test['ingredients_clean_string1'] = test['ingredients'].map(strip)
 


In [101]:
#We see this in one of the solutions.  We can reconstruct it in a way that makes it abit easier to follow, but I found when doing that it took forever.  

#To interpret this, read from right to left. 
train['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in train['ingredients']]       
test['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in test['ingredients']]       




In [102]:
train['ingredients_string1']

0        romaine lettuce black olives grape tomatoes ga...
1        plain flour ground pepper salt tomato ground b...
2        egg pepper salt mayonaise cooking oil green ch...
3                           water vegetable oil wheat salt
4        black pepper shallot cornflour cayenne pepper ...
5        plain flour sugar butter egg fresh ginger root...
6        olive oil salt medium shrimp pepper garlic cho...
7        sugar pistachio nuts white almond bark flour v...
8        olive oil purple onion fresh pineapple pork po...
9        chopped tomatoes fresh basil garlic extra virg...
10       pimento sweet pepper dried oregano olive oil g...
11       low sodium soy sauce fresh ginger dry mustard ...
12       Italian parsley leaves walnut hot red pepper f...
13       ground cinnamon fresh cilantro chili powder gr...
14       fresh parmesan cheese butter all purpose flour...
15       tumeric vegetable stock tomato garam masala na...
16       greek yogurt lemon curd confectioners sugar ra.

In [103]:
ingredients = train['ingredients'].apply(lambda x:','.join(x))
ingredients

0        romaine lettuce,black olives,grape tomatoes,ga...
1        plain flour,ground pepper,salt,tomatoes,ground...
2        eggs,pepper,salt,mayonaise,cooking oil,green c...
3                           water,vegetable oil,wheat,salt
4        black pepper,shallots,cornflour,cayenne pepper...
5        plain flour,sugar,butter,eggs,fresh ginger roo...
6        olive oil,salt,medium shrimp,pepper,garlic,cho...
7        sugar,pistachio nuts,white almond bark,flour,v...
8        olive oil,purple onion,fresh pineapple,pork,po...
9        chopped tomatoes,fresh basil,garlic,extra-virg...
10       pimentos,sweet pepper,dried oregano,olive oil,...
11       low sodium soy sauce,fresh ginger,dry mustard,...
12       Italian parsley leaves,walnuts,hot red pepper ...
13       ground cinnamon,fresh cilantro,chili powder,gr...
14       fresh parmesan cheese,butter,all-purpose flour...
15       tumeric,vegetable stock,tomatoes,garam masala,...
16       greek yogurt,lemon curd,confectioners sugar,ra.

In [104]:
#Now we will create a corpus.
corpustr = train['ingredients_string1']
corpusts = test['ingredients_string1']


In [105]:
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#You could develop an understanding based on each.  
vectorizertr = TfidfVectorizer(stop_words='english',
                             ngram_range = ( 1 , 1 ),analyzer="word", 
                             max_df = .57 , binary=False , token_pattern=r'\w+' , sublinear_tf=False)
vectorizerts = TfidfVectorizer(stop_words='english')

In [106]:
#Note that this doesn't work with the #todense option.  
tfidftr=vectorizertr.fit_transform(corpustr)
predictors_tr = tfidftr

In [107]:
#Note that this doesn't work with the #todense option.  This creates a matrix of predictors from the corpus. 
tfidfts=vectorizertr.transform(corpusts)
predictors_ts= tfidfts

In [108]:
#This is target variable.  
targets_tr = train['cuisine']


In [109]:
#Logistic Regression. 
parameters = {'C':[1, 10]}
#clf = LinearSVC()
clf = LogisticRegression()



In [110]:
#This uses that associated paramters to search a grid space. 
classifier = grid_search.GridSearchCV(clf, parameters)
classifier=classifier.fit(predictors_tr,targets_tr)



In [111]:
#This predicts the outcome for the test set. 
predictions=classifier.predict(predictors_ts)

In [112]:
#This adds it to the resulting dataframe. 
test['cuisine'] = predictions

In [113]:
#This creates the submision dataframe
submission2=test[['id' ,  'cuisine' ]]

In [114]:
#This outputs the file.
submission2.to_csv("../../data/2_logisticSubmission.csv",index=False)

In [115]:
from sklearn.ensemble import RandomForestClassifier 



In [116]:
# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 10)



In [117]:
# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(predictors_tr,targets_tr)



In [118]:
# Take the same decision trees and run it on the test data
predictions = forest.predict(predictors_ts)

In [119]:
#This adds it to the resulting dataframe. 
test['cuisine'] = predictions

In [120]:
#This creates the submision dataframe
submission2=test[['id' ,  'cuisine' ]]

In [121]:
ingredients = train['ingredients'].apply(lambda x:','.join(x))
ingredients


0        romaine lettuce,black olives,grape tomatoes,ga...
1        plain flour,ground pepper,salt,tomatoes,ground...
2        eggs,pepper,salt,mayonaise,cooking oil,green c...
3                           water,vegetable oil,wheat,salt
4        black pepper,shallots,cornflour,cayenne pepper...
5        plain flour,sugar,butter,eggs,fresh ginger roo...
6        olive oil,salt,medium shrimp,pepper,garlic,cho...
7        sugar,pistachio nuts,white almond bark,flour,v...
8        olive oil,purple onion,fresh pineapple,pork,po...
9        chopped tomatoes,fresh basil,garlic,extra-virg...
10       pimentos,sweet pepper,dried oregano,olive oil,...
11       low sodium soy sauce,fresh ginger,dry mustard,...
12       Italian parsley leaves,walnuts,hot red pepper ...
13       ground cinnamon,fresh cilantro,chili powder,gr...
14       fresh parmesan cheese,butter,all-purpose flour...
15       tumeric,vegetable stock,tomatoes,garam masala,...
16       greek yogurt,lemon curd,confectioners sugar,ra.

In [122]:
#What we really need is a ingredient freqency, inverse cuisine frequency.  
#Let's first create a function to create a count
def tf(count, N):
        return count / float(N)

#Now let's create a function for IDF....
def idf(count, N ):

    # tf-idf calc involves multiplying against a tf value less than 0, so it's
    # necessary to return a value greater than 1 for consistent scoring. 
    # (Multiplying two values less than 1 returns a value less than each of 
    # them.)

    try:
        return 1.0 + log(float(N) / count)
    except ZeroDivisionError:
        return 1.0
    
    

In [123]:
from collections import Counter

#This creates a Corpus for each incredient
corpus=','.join(x.strip() for x in train['ingredients_clean_string1']).split(',')
total=Counter(corpus)
#This creates a corpus for each word
corpus2=','.join(x.strip() for x in train['ingredients_string1']).split(' ')
#This just does some extra cleaning....because of some issues with file. 
corpus3=','.join(x.strip() for x in corpus2).split(',')
total2=Counter(corpus3)


In [124]:
print(len(train))
countcuisines=Counter(train['cuisine'])
countcuisines

39774


Counter({'brazilian': 467,
         'british': 804,
         'cajun_creole': 1546,
         'chinese': 2673,
         'filipino': 755,
         'french': 2646,
         'greek': 1175,
         'indian': 3003,
         'irish': 667,
         'italian': 7838,
         'jamaican': 526,
         'japanese': 1423,
         'korean': 830,
         'mexican': 6438,
         'moroccan': 821,
         'russian': 489,
         'southern_us': 4320,
         'spanish': 989,
         'thai': 1539,
         'vietnamese': 825})

In [125]:
# Now we want to create a function that can take in a dictionary [a lookup that has terms and values] and 
# score different recipes by the lookup.

def score(dictionary, stringx, delimiter):
    sumt=sum((dictionary.get(x,0) for x in stringx.split(delimiter)))
    return  sumt/len(stringx.split(delimiter))

#Alt.   
#    sumt=0
#    for x in stringx.split(delimiter):
#        sumt+=dictionary[x]
    
testdic={'salt': .5, 'vanilla': .2, 'butter': .3,'sugar': .1}
teststring='salt,vanilla,sugar,onion'
teststring2='salt vanilla sugar butter onion'
print(score(testdic,teststring,','))
print(score(testdic,teststring2,' '))

#Here we are just manually calculating the 
print("Test1 Value: ", (.5+.2+.1)/4)
print("Test2 Value: ", (.5+.2+.1+.3)/5)


#Now we are using that separate function in another function.  
#title_fn = lambda x: 1 if has_title(x) else 0
#Finally, we call the function for name
#train['Title'] = train['Name'].map(title_fn)

0.19999999999999998
0.21999999999999997
Test1 Value:  0.19999999999999998
Test2 Value:  0.21999999999999997


In [126]:
from math import log
trainc = pd.DataFrame()
trainc2 = pd.DataFrame()
testc = pd.DataFrame()
testc2 = pd.DataFrame()

#This will loop through each unique cuisine
for cuisine in train['cuisine'].unique():
    #Number of rows in the cuisine
    cuisinerows=train[train['cuisine'] == cuisine]
    #Number of rows not in cuisine
    notcuisinerows=train[train['cuisine'] != cuisine]
    
    #This looks at specific ingredients in the cuisine corpus
    cuisinecorpus=','.join(x.strip() for x in cuisinerows['ingredients_clean_string1']).split(',')
    #This looks at specific ingredients not in cuisine
    notcuisinecorpus=','.join(x.strip() for x in notcuisinerows['ingredients_clean_string1']).split(',')
   
    #This treats all words individually in cuisine
    cuisinecorpus2=','.join(x.strip() for x in cuisinerows['ingredients_string1']).split(' ')
    notcuisinecorpus2=','.join(x.strip() for x in notcuisinerows['ingredients_string1']).split(' ')

    #this extra line is just some additional cuisine
    cuisinecorpus2=','.join(x.strip() for x in cuisinecorpus2).split(',')
    notcuisinecorpus2=','.join(x.strip() for x in cuisinecorpus2).split(',')
    
    #This creates the document term matrix for each.
    tfcuisine=Counter(cuisinecorpus)
    tfnotcuisine=Counter(notcuisinecorpus)
    
    tfcuisine2=Counter(cuisinecorpus2)
    tfnotcuisine2=Counter(notcuisinecorpus2)
     
    #This is creating a dict that indicates TFIDF for each ingredient
    cfincf={k: (tf(tfcuisine[k],len(cuisinecorpus))*idf(tfnotcuisine[k],len(notcuisinecorpus)))  for k in  tfcuisine.keys()}    
    
    #This is creating a dict that indicates TFIDF for each word
    cfincf2={k: (tf(tfcuisine2[k],len(cuisinecorpus2))*idf(tfnotcuisine2[k],len(notcuisinecorpus2)))  for k in  tfcuisine2.keys()}    
    
    #Now we will use our strings to score each outcome. 
    score_fn = lambda x: score(cfincf, x, ',') 
    score_fn2 = lambda x: score(cfincf2, x, ' ')
    
#Finally, we call the function for name
    trainc[cuisine] = train['ingredients_clean_string1'].map(score_fn)
    trainc2[cuisine] = train['ingredients_clean_string1'].map(score_fn2)
    testc[cuisine] = test['ingredients_clean_string1'].map(score_fn)
    testc2[cuisine] = test['ingredients_clean_string1'].map(score_fn2)
    
    

In [127]:
trainc 

Unnamed: 0,greek,southern_us,filipino,indian,jamaican,spanish,italian,mexican,chinese,british,thai,vietnamese,cajun_creole,brazilian,french,japanese,irish,korean,moroccan,russian
0,0.063402,0.014313,0.042993,0.018555,0.027734,0.021350,0.025247,0.028564,0.018063,0.010681,0.019183,0.021842,0.020400,0.019667,0.013159,0.010634,0.014112,0.026451,0.022275,0.011072
1,0.043487,0.058533,0.049211,0.041847,0.054144,0.043117,0.040338,0.036738,0.029228,0.062378,0.022971,0.025563,0.039871,0.047792,0.040042,0.033356,0.055036,0.028616,0.031038,0.058220
2,0.034594,0.054670,0.073175,0.037089,0.048051,0.030905,0.035124,0.033269,0.044232,0.051751,0.024760,0.028254,0.038588,0.037735,0.035919,0.045915,0.054855,0.047717,0.025595,0.052095
3,0.068897,0.093675,0.127790,0.102428,0.104767,0.071939,0.067849,0.072040,0.086195,0.082342,0.066098,0.073295,0.075210,0.083617,0.075126,0.091498,0.092381,0.078978,0.067758,0.107309
4,0.031048,0.040054,0.047440,0.064445,0.039515,0.031350,0.027690,0.036956,0.022852,0.042136,0.020566,0.023596,0.038502,0.036080,0.035890,0.029748,0.042737,0.023796,0.041862,0.042769
5,0.035193,0.083304,0.044253,0.035216,0.051046,0.033618,0.035672,0.025472,0.032754,0.091074,0.020535,0.028287,0.032436,0.042970,0.054128,0.037862,0.081313,0.030673,0.046190,0.074101
6,0.052469,0.034019,0.056384,0.037385,0.044352,0.055274,0.050894,0.048175,0.025828,0.029773,0.027076,0.030762,0.039665,0.050097,0.037727,0.021628,0.036278,0.032287,0.044278,0.034476
7,0.032569,0.051098,0.026654,0.012860,0.026457,0.034987,0.036313,0.018347,0.024357,0.054291,0.015490,0.022329,0.020148,0.033981,0.038185,0.028440,0.044628,0.025368,0.028201,0.052309
8,0.048198,0.030028,0.034463,0.030507,0.035929,0.042568,0.041386,0.066220,0.016749,0.027119,0.026904,0.028878,0.027301,0.044445,0.030485,0.015853,0.029126,0.017866,0.042531,0.028645
9,0.052611,0.018345,0.040204,0.026628,0.025688,0.059186,0.066899,0.030249,0.026617,0.014946,0.028922,0.031394,0.028958,0.027223,0.031454,0.018373,0.016014,0.035546,0.042751,0.012981


In [128]:
#This creates a prediction from the column that has the maximum value and outputs it to a file. 
test['cuisine']=testc.idxmax(axis=1)
submission=test[['id' ,  'cuisine' ]]
submission.to_csv("4_tfidf1.csv",index=False)
test['cuisine']=testc2.idxmax(axis=1)
submission=test[['id' ,  'cuisine' ]]
submission.to_csv("5_tfidf2.csv",index=False)


In [129]:
trainc['prediction']=trainc.idxmax(axis=1)
trainc2['prediction']=trainc2.idxmax(axis=1)
trainc['cuisine']=train['cuisine']
trainc2['cuisine']=train['cuisine']
trainc.to_csv("coded1.csv",index=False)
trainc2.to_csv("coded2.csv",index=False)

In [130]:
print(trainc[['prediction','cuisine']],trainc2[['prediction','cuisine']]]

SyntaxError: invalid syntax (<ipython-input-130-b5addb2a3cb8>, line 1)