In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
np.random.seed(0)
import re

In [2]:
df= pd.read_csv('all_recipe_data.csv')
df.head()

Unnamed: 0,Recipe_Name,Recipe_Ingredients,Cuisine
0,Bunny chow,2 tbsp vegetable oil ½ tsp cumin seeds ½ tsp f...,african
1,Jollof rice with fried plantains,"1 tbsp olive or vegetable oil 2 large onions, ...",african
2,Jollof rice,400ml/14fl oz passata 3 tbsp tomato purée 2 fr...,african
3,Suya fillet burger with sweet potato cubes and...,4g smoked paprika 2g cayenne pepper 6g ginger...,african
4,Jollof rice with chicken,300g/10½oz basmati rice 1 tbsp vegetable oil 8...,african


In [3]:
df= df.drop_duplicates(subset='Recipe_Name')

In [4]:
df.describe()

Unnamed: 0,Recipe_Name,Recipe_Ingredients,Cuisine
count,475,475,475
unique,475,475,21
top,Saag gosht (lamb and spinach curry) with chapatis,"740g/1lb 10oz lamb fillet, trimmed and cubed 1...",mexican
freq,1,1,24


In [5]:
msk = np.random.rand(len(df)) < 0.8

In [6]:
train = df[msk]
len(train)
# train

382

In [7]:
test = df[~msk]
len(test)
# test

93

In [8]:
train.shape

(382, 3)

In [9]:
test.shape

(93, 3)

In [10]:
data_train= train['Recipe_Ingredients']
target_train= train['Cuisine']
data_test= test['Recipe_Ingredients']
target_test= test['Cuisine']
print(data_train.shape)
print(target_train.shape)
print(data_test.shape)
print(target_test.shape)

(382,)
(382,)
(93,)
(93,)


In [11]:
# data_train=re.sub("[^'?a-z\,?]", ' ', train['Recipe_Ingredients'][i])

In [12]:
nltk.download('stopwords')

stopwords_list = stopwords.words('english')+ list(string.punctuation)
stopwords_list+= ["''", '""', '...', '``']
stopwords_list+= ['tbsp', 'tsp', 'cup', 'cups']
# regex = r"[^'?a-z\,?]"
# stopwords_list

[nltk_data] Downloading package stopwords to /Users/Iffy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def process_article(article):
    tokens = nltk.word_tokenize(article)
    stopwords_removed = [token.lower() for token in tokens if token not in stopwords_list]
    return stopwords_removed 

In [14]:
# nltk.download('punkt')
processed_train =  list(map(process_article, data_train))
processed_test= list(map(process_article, data_test))

In [15]:
print('cleaned train: ', processed_train[0])
print('----'*28)
print('cleaned test: ', processed_test[0])

cleaned train:  ['2', 'vegetable', 'oil', '½', 'cumin', 'seeds', '½', 'fennel', 'seeds', '2.5cm/1in', 'piece', 'cinnamon', 'stick', '2', 'green', 'cardamom', 'pods', '1', 'star', 'anise', '1', 'bay', 'leaf', '1', 'onion', 'finely', 'chopped', '2', 'south', 'african', 'curry', 'powder', '2', 'tomatoes', 'chopped', '1kg/2lb', '2oz', 'boneless', 'leg', 'lamb', 'cut', '1.5cm/½in', 'dices', '1', 'finely', 'chopped', 'fresh', 'ginger', '1', 'finely', 'chopped', 'garlic', '10-12', 'curry', 'leaves', '2', 'large', 'potatoes', 'cut', 'cubes', 'size', 'meat', 'salt', '2', 'finely', 'chopped', 'coriander', 'leaves', '2', 'lime', 'juice', '2', 'loaves', 'crusty', 'white', 'bread', 'unsliced', 'cut', 'across', 'half', 'middle', 'crumbs', 'removed', 'coriander', 'cress', 'sprigs', 'garnish']
----------------------------------------------------------------------------------------------------------------
cleaned test:  ['butter', 'greasing', '400ml/14fl', 'oz', 'full-fat', 'milk', '50g/1¾oz', 'fresh',

In [16]:
pattern= r"[a-z]+"
p= re.compile(pattern)
ptrain= p.findall(str(processed_train))
len(ptrain)
# print(ptrain)

21056

In [17]:
ptest= p.findall(str(processed_test))
len(ptest)

5023

In [18]:
articles_concat_train = []
for pd in processed_train:
    articles_concat_train += pd
articles_concat_test = []
for pd in processed_test:
    articles_concat_test += pd

In [19]:
articles_freqdist_train = FreqDist(articles_concat_train)
# articles_freqdist_train.most_common(200)
articles_freqdist_test = FreqDist(articles_concat_test)
# articles_freqdist_test.most_common(200)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
vectorizer = TfidfVectorizer()

In [22]:
tf_idf_ptrain = vectorizer.fit_transform(ptrain)
tf_idf_ptest = vectorizer.transform(ptest)
print(tf_idf_ptrain.shape)
print(tf_idf_ptest.shape)

(21056, 1437)
(5023, 1437)


In [23]:
non_zero_cols = tf_idf_ptrain.nnz / float(tf_idf_ptrain.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_ptrain.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 0.9395896656534954
Percentage of columns containing 0: 0.9993461449786684


### Naive Bayes and Random Forest models

In [24]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

In [25]:
nb_classifier.fit(tf_idf_ptrain, ptrain)
nb_train_preds = nb_classifier.predict(tf_idf_ptrain)
nb_test_preds = nb_classifier.predict(tf_idf_ptest)

In [26]:
rf_classifier.fit(tf_idf_ptrain, ptrain)
rf_train_preds = rf_classifier.predict(tf_idf_ptrain)
rf_test_preds = rf_classifier.predict(tf_idf_ptest)

In [27]:
nb_train_score = accuracy_score(ptrain, nb_train_preds)
nb_test_score = accuracy_score(ptest, nb_test_preds)
rf_train_score = accuracy_score(ptrain, rf_train_preds)
rf_test_score = accuracy_score(ptest, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.6398 		 Testing Accuracy: 0.6472

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9953 		 Testing Accuracy: 0.9685


### KNN Models

In [28]:
from sklearn.neighbors import KNeighborsClassifier as KNC

In [29]:
knn_classifier= KNC()

In [32]:
knn_classifier.fit(tf_idf_ptrain, ptrain)
test_preds= knn_classifier.predict(tf_idf_ptest)

In [37]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
precision_score

In [76]:
def print_knn_metrics(labels, preds, avg):
    print("Precision Score: {}".format(precision_score(labels, preds, average=avg)))
    print("Recall Score: {}".format(recall_score(labels, preds, average=avg)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average=avg)))

In [77]:
print_knn_metrics(ptest, test_preds, 'micro')

Precision Score: 0.9384829782998209
Recall Score: 0.9384829782998209
Accuracy Score: 0.9384829782998209
F1 Score: 0.9384829782998209


### SVM Models

In [78]:
from sklearn import svm

In [84]:
svm_clf = svm.SVC(kernel='linear')

In [85]:
svm_clf.fit(tf_idf_ptrain, ptrain)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [87]:
svm_clf.coef_

<1044735x1437 sparse matrix of type '<class 'numpy.float64'>'
	with 2076465 stored elements in Compressed Sparse Row format>

In [89]:
svm_pred = svm_clf.predict(tf_idf_ptest)
def print_svm_metrics(labels, preds, avg):
    print("Precision Score: {}".format(precision_score(labels, preds, average=avg)))
    print("Recall Score: {}".format(recall_score(labels, preds, average=avg)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average=avg)))

print_svm_metrics(ptest, svm_pred, 'micro')

Precision Score: 0.9530161258212224
Recall Score: 0.9530161258212224
Accuracy Score: 0.9530161258212224
F1 Score: 0.9530161258212224
