In [15]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import scipy as sp
import string
import json
import re
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as  plt
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score
####

from sklearn.model_selection import train_test_split
from sklearn import linear_model
import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as linear_model
import sklearn.model_selection as model_selection

from sklearn import svm

In [2]:
# open training dataset and parse recipes

with open('train.json') as cooking_file:  
    
    data = json.load(cooking_file)
    

In [3]:
#removing punctuations and spaces before fixing the data and keep them all in a dict  (italian, mexican)

punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

def gather_recipes(recipes, cuisine_country):
    
    data_list = []
    
    for _object in recipes:
        
        if _object.get('cuisine') == cuisine_country:
            
            ingredients = _object.get('ingredients') 
            _id = _object.get('id')
            
            for i in ingredients:
                
                _dict = dict()
                ingr = i.replace(' ', '').lower()
                    
                for ch in ingr: 
                    
                    if ch in punctuations: 
                        
                        ingr = ingr.replace(ch, '')  
                
                _dict['ingredients'] = ingr                
                _dict['id'] = _id
                data_list.append(_dict)
                
    return data_list

clean_italian_cuisine_data = gather_recipes(data, 'italian')
clean_mexican_cuisine_data = gather_recipes(data, 'mexican')

In [4]:
#creating dfs  (italian, mexican)

italian_df = DataFrame(clean_italian_cuisine_data)
italian_ingredients = list(set(italian_df.ingredients))

mexican_df = DataFrame(clean_mexican_cuisine_data)
mexican_ingredients = list(set(mexican_df.ingredients))


In [5]:
#fixing data for CountVectorizer (italian_df)

it_unique_ids = []
it_ingredients = []
it_grouped = italian_df.groupby('id')

for ids, ing in it_grouped:
    
    it_unique_ids.append(ids)
    
    row = str(ing)
    row = row.replace('ingredients','')
    row = row.replace('id', '')
    row = row.replace('\n', '')
    row = re.sub('[0-9]+', '', row)
    row = row.split( )
    row = ' '.join(row)
    
    it_ingredients.append(row)
    

In [6]:
#fixing data for CountVectorizer (mexican_df)

mex_unique_ids = []
mex_ingredients = []
mex_grouped = mexican_df.groupby('id')

for ids, ing in mex_grouped:
    
    mex_unique_ids.append(ids)
    
    row = str(ing)
    row = row.replace('ingredients','')
    row = row.replace('id', '')
    row = row.replace('\n', '')
    row = re.sub('[0-9]+', '', row)
    row = row.split( )
    row = ' '.join(row)
    
    mex_ingredients.append(row)
    


In [7]:
#using CountVectorizer  (italian, mexican) and adding one column Cuisine

it_vectorizer = CountVectorizer(analyzer = 'word', binary = True)
it_array = it_vectorizer.fit_transform(it_ingredients)
it_array = it_array.toarray()

it_df = DataFrame(it_array, columns = it_vectorizer.get_feature_names(), index = it_unique_ids)
it_df['Cuisine'] = 0 #italian
it_df.to_csv('it_cleandata.csv', index_label = 'ID')
print(it_df)

mex_vectorizer = CountVectorizer(analyzer = 'word', binary = True)
mex_array = mex_vectorizer.fit_transform(mex_ingredients)
mex_array = mex_array.toarray()

mex_df = DataFrame(mex_array, columns = mex_vectorizer.get_feature_names(), index = mex_unique_ids)
mex_df['Cuisine'] = 1 #mexican
mex_df.to_csv('mex_cleandata.csv', index_label = 'ID')
print(mex_df)

       abbamele  accent  acinipepe  acornsquash  activedryyeast  adobosauce  \
4             0       0          0            0               0           0   
14            0       0          0            0               0           0   
20            0       0          0            0               0           0   
56            0       0          0            0               0           0   
62            0       0          0            0               0           0   
...         ...     ...        ...          ...             ...         ...   
49679         0       0          0            0               0           0   
49681         0       0          0            0               0           0   
49689         0       0          0            0               0           0   
49697         0       0          0            0               0           0   
49708         0       0          0            0               0           0   

       adoboseasoning  agavenectar  agedbalsamicvin

In [8]:
#putting mexican and italian in one DataFrame

df = pd.DataFrame(it_df)
df = df.append(mex_df, sort = False)
df.fillna(0, inplace = True)
df = df.astype('int64')
print(df['Cuisine'])

4        0
14       0
20       0
56       0
62       0
        ..
49678    1
49687    1
49707    1
49709    1
49717    1
Name: Cuisine, Length: 14276, dtype: int64


In [9]:
X = df.loc[:, df.columns != 'Cuisine'].values  #all columns with values except Cuisine column
print(X)
print(X.shape)

Y = df['Cuisine'] #just the ids in a list
#print(Y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(14276, 3976)


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

10707


In [11]:
# fit a model
lr_m = linear_model.LogisticRegression()
model = lr_m.fit(X_train, Y_train)


In [12]:
print("classifier score:",lr_m.score(X_test, Y_test))
Y_pred = lr_m.predict(X_test)
print("\naccuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("\nConfusion matrix:")
print(metrics.confusion_matrix(Y_test, Y_pred))
print("\nPrecision Score per class:")
print(metrics.precision_score(Y_test, Y_pred, average = None))
print("\nAverage Precision Score:")
print(metrics.precision_score(Y_test, Y_pred, average = 'weighted'))
print("\nRecall Score per class:")
print(metrics.recall_score(Y_test, Y_pred, average = None))
print("\nAverage Recall Score:")
print(metrics.recall_score(Y_test, Y_pred, average = 'weighted'))
print("\nF1-score Score per class:")
print(metrics.f1_score(Y_test, Y_pred, average = None))
print("\nAverage F1 Score:")
print(metrics.f1_score(Y_test, Y_pred, average = 'weighted'))

classifier score: 0.9686186606892687

accuracy: 0.9686186606892687

Confusion matrix:
[[1887   40]
 [  72 1570]]

Precision Score per class:
[0.96324655 0.97515528]

Average Precision Score:
0.9687254354733641

Recall Score per class:
[0.97924235 0.95615104]

Average Recall Score:
0.9686186606892687

F1-score Score per class:
[0.97117859 0.96555966]

Average F1 Score:
0.9685934707348253


In [13]:
scores = model_selection.cross_val_score(lr_m, X, Y, scoring = 'f1_weighted', cv = 5)
print('scores:', scores)
print('mean scores:', scores.mean())

scores: [0.96809087 0.97195581 0.96879183 0.96668815 0.96880957]
mean scores: 0.9688672451742961


In [14]:
#scores = model_selection.cross_val_score(#lr_clf,
                                          #svm_clf,
                                          #knn,
                                          #dtree,
                                          #X,
                                          #Y,
                                          #scoring='f1_weighted',
                                          #cv=5)
#print (scores)
#print (scores.mean())

In [None]:
#svm_clf = svm.LinearSVC()
#svm_clf = svm.SVC(kernel = 'poly')
svm_clf = svm.SVC()
svm_clf.fit(X_train,Y_train)
print("classifier score:",svm_clf.score(X_test, Y_test))
Y_pred = svm_clf.predict(X_test)
print("\naccuracy:",metrics.accuracy_score(y_test, Y_pred))
print("\nConfusion matrix:")
print(metrics.confusion_matrix(y_test, Y_pred))
print("\nPrecision Score per class:")
print(metrics.precision_score(Y_test, Y_pred, average = None))
print("\nAverage Precision Score:")
print(metrics.precision_score(Y_test, Y_pred, average = 'weighted'))
print("\nRecall Score per class:")
print(metrics.recall_score(Y_test, Y_pred, average= None))
print("\nAverage Recall Score:")
print(metrics.recall_score(Y_test, Y_pred, average = 'weighted'))
print("\nF1-score Score per class:")
print(metrics.f1_score(Y_test, Y_pred, average = None))
print("\nAverage F1 Score:")
print(metrics.f1_score(Y_test, Y_pred, average = 'weighted'))