In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import scipy as sp
import string
import json
import re
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as  plt
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix
####
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn import linear_model
import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as linear_model
import sklearn.model_selection as model_selection

from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [2]:
# open training dataset and parse recipes

with open('train.json') as cooking_file:  
    
    data = json.load(cooking_file)
    

In [3]:
#removing punctuations and spaces before fixing the data and keep them all in a dict  (italian, mexican)

punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

def gather_recipes(recipes, cuisine_country):
    
    data_list = []
    
    for _object in recipes:
        
        if _object.get('cuisine') == cuisine_country:
            
            ingredients = _object.get('ingredients') 
            _id = _object.get('id')
            
            for i in ingredients:
                
                _dict = dict()
                ingr = i.replace(' ', '').lower()
                    
                for ch in ingr: 
                    
                    if ch in punctuations: 
                        
                        ingr = ingr.replace(ch, '')  
                
                _dict['ingredients'] = ingr                
                _dict['id'] = _id
                data_list.append(_dict)
                
    return data_list

clean_italian_cuisine_data = gather_recipes(data, 'italian')
clean_mexican_cuisine_data = gather_recipes(data, 'mexican')

In [4]:
#creating dfs  (italian, mexican)

italian_df = DataFrame(clean_italian_cuisine_data)
italian_ingredients = list(set(italian_df.ingredients))

mexican_df = DataFrame(clean_mexican_cuisine_data)
mexican_ingredients = list(set(mexican_df.ingredients))


In [5]:
#fixing data for CountVectorizer (italian_df)

it_unique_ids = []
it_ingredients = []
it_grouped = italian_df.groupby('id')

for ids, ing in it_grouped:
    
    it_unique_ids.append(ids)
    
    row = str(ing)
    row = row.replace('ingredients','')
    row = row.replace('id', '')
    row = row.replace('\n', '')
    row = re.sub('[0-9]+', '', row)
    row = row.split( )
    row = ' '.join(row)
    
    it_ingredients.append(row)
    

In [6]:
#fixing data for CountVectorizer (mexican_df)

mex_unique_ids = []
mex_ingredients = []
mex_grouped = mexican_df.groupby('id')

for ids, ing in mex_grouped:
    
    mex_unique_ids.append(ids)
    
    row = str(ing)
    row = row.replace('ingredients','')
    row = row.replace('id', '')
    row = row.replace('\n', '')
    row = re.sub('[0-9]+', '', row)
    row = row.split( )
    row = ' '.join(row)
    
    mex_ingredients.append(row)

In [7]:
#using CountVectorizer  (italian, mexican) and adding one column Cuisine

it_vectorizer = CountVectorizer(analyzer = 'word', binary = True)
it_array = it_vectorizer.fit_transform(it_ingredients)
it_array = it_array.toarray()
it_df = DataFrame(it_array, columns = it_vectorizer.get_feature_names(), index = it_unique_ids)
it_df['Cuisine'] = 0 #italian
it_df.to_csv('it_cleandata.csv', index_label = 'ID')
#print(it_df)

mex_vectorizer = CountVectorizer(analyzer = 'word', binary = True)
mex_array = mex_vectorizer.fit_transform(mex_ingredients)
mex_array = mex_array.toarray()
mex_df = DataFrame(mex_array, columns = mex_vectorizer.get_feature_names(), index = mex_unique_ids)
mex_df['Cuisine'] = 1 #mexican
mex_df.to_csv('mex_cleandata.csv', index_label = 'ID')
#print(mex_df)

In [8]:
#putting mexican and italian in one DataFrame

df = pd.DataFrame(it_df)
df = df.append(mex_df, sort = False)
df.fillna(0, inplace = True)
df = df.astype('int64')

In [9]:
X = df.loc[:, df.columns != 'Cuisine'].values  #all columns with values except Cuisine column
print(X.shape)

Y = df['Cuisine'] #just the ids in a list

(14276, 3976)


In [10]:
# Multiple Logistic Regression 

lr_m = linear_model.LogisticRegression()

In [11]:
# Support Vector Machines

svm_m = svm.SVC()

In [12]:
#Decision Trees

dtree = tree.DecisionTreeClassifier()

In [13]:
#k-NN

knn = KNeighborsClassifier(n_neighbors = 3)

In [14]:
# Naive Bayes

gnb = BernoulliNB(binarize = None)

In [15]:
s = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
scores_ = model_selection.cross_validate(lr_m, X, Y, scoring = s, cv = 5)                                 
print('****Logistic Regression mean scores****')
print('scores:', scores_)
print('\nmean score test accuracy:', scores_['test_accuracy'].mean())
print('\nmean score test precision weighted:', scores_['test_precision_weighted'].mean())
print('\nmean score test recall weighted:', scores_['test_recall_weighted'].mean())
print('\nmean score test f1-measure weighted:', scores_['test_f1_weighted'].mean())

Y_pred = cross_val_predict(lr_m, X, Y, cv = 5)
conf_matrix = confusion_matrix(Y, Y_pred)
print('Confusion Matrix:', conf_matrix)



****Logistic Regression mean scores****
scores: {'fit_time': array([2.12911797, 2.07707977, 2.06235981, 2.13533521, 1.99023581]), 'score_time': array([0.11750913, 0.09385228, 0.1032691 , 0.088274  , 0.08785415]), 'test_accuracy': array([0.96813725, 0.97233894, 0.96778711, 0.96671338, 0.9688157 ]), 'test_precision_weighted': array([0.96837424, 0.97241883, 0.96788602, 0.96682014, 0.96883557]), 'test_recall_weighted': array([0.96813725, 0.97233894, 0.96778711, 0.96671338, 0.9688157 ]), 'test_f1_weighted': array([0.96809087, 0.97231503, 0.96775533, 0.96667911, 0.96879857])}

mean score test accuracy: 0.9687584774624636

mean score test precision weighted: 0.9688669615432207

mean score test recall weighted: 0.9687584774624636

mean score test f1-measure weighted: 0.968727783410316




Confusion Matrix: [[7679  159]
 [ 287 6151]]


In [None]:
scores_ = model_selection.cross_validate(svm_m, X, Y, scoring = s, cv = 5)                                 
print('****SVM mean scores****')
print('scores:', scores_)
print('\nmean score test accuracy:', scores_['test_accuracy'].mean())
print('\nmean score test precision weighted:', scores_['test_precision_weighted'].mean())
print('\nmean score test recall weighted:', scores_['test_recall_weighted'].mean())
print('\nmean score test f1-measure weighted:', scores_['test_f1_weighted'].mean())

Y_pred = cross_val_predict(svm_m, X, Y, cv = 5)
conf_matrix = confusion_matrix(Y, Y_pred)
print('Confusion Matrix:', conf_matrix)

In [None]:
scores_ = model_selection.cross_validate(dtree, X, Y, scoring = s, cv = 5)                                 
print('****d-tree mean scores****')
print('scores:', scores_)
print('\nmean score test accuracy:', scores_['test_accuracy'].mean())
print('\nmean score test precision weighted:', scores_['test_precision_weighted'].mean())
print('\nmean score test recall weighted:', scores_['test_recall_weighted'].mean())
print('\nmean score test f1-measure weighted:', scores_['test_f1_weighted'].mean())

Y_pred = cross_val_predict(dtree, X, Y, cv = 5)
conf_matrix = confusion_matrix(Y, Y_pred)
print('Confusion Matrix:', conf_matrix)

In [None]:
scores_ = model_selection.cross_validate(knn, X, Y, scoring = s, cv = 5)                                 
print('****k-NN mean scores****')
print('scores:', scores_)
print('\nmean score test accuracy:', scores_['test_accuracy'].mean())
print('\nmean score test precision weighted:', scores_['test_precision_weighted'].mean())
print('\nmean score test recall weighted:', scores_['test_recall_weighted'].mean())
print('\nmean score test f1-measure weighted:', scores_['test_f1_weighted'].mean())

Y_pred = cross_val_predict(knn, X, Y, cv = 5)
conf_matrix = confusion_matrix(Y, Y_pred)
print('Confusion Matrix:', conf_matrix)

In [None]:
scores_ = model_selection.cross_validate(gnb, X, Y, scoring = s, cv = 5)                                 
print('****Naive-Bayes mean scores****')
print('scores:', scores_)
print('\nmean score test accuracy:', scores_['test_accuracy'].mean())
print('\nmean score test precision weighted:', scores_['test_precision_weighted'].mean())
print('\nmean score test recall weighted:', scores_['test_recall_weighted'].mean())
print('\nmean score test f1-measure weighted:', scores_['test_f1_weighted'].mean())

Y_pred = cross_val_predict(gnb, X, Y, cv = 5)
conf_matrix = confusion_matrix(Y, Y_pred)
print('Confusion Matrix:', conf_matrix)