### Importing Necessary Libraries

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
np.random.seed(0)
import re
import wordcloud


### Loading and Basic Cleaning of Dataframe

In [2]:
df= pd.read_csv('final_database.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,recipe_ingredients,recipe_name
0,0,african,2 tbsp vegetable oil ½ tsp cumin seeds ½ tsp f...,Bunny chow
1,1,african,"1 tbsp olive or vegetable oil 2 large onions, ...",Jollof rice with fried plantains
2,2,african,400ml/14fl oz passata 3 tbsp tomato purée 2 fr...,Jollof rice
3,3,african,4g smoked paprika 2g cayenne pepper 6g ginger...,Suya fillet burger with sweet potato cubes and...
4,4,african,300g/10½oz basmati rice 1 tbsp vegetable oil 8...,Jollof rice with chicken


In [3]:
df= df.drop_duplicates(subset='recipe_name')
df.describe()

Unnamed: 0.1,Unnamed: 0
count,1687.0
mean,980.561944
std,521.114071
min,0.0
25%,574.5
50%,998.0
75%,1426.5
max,1852.0


In [4]:
df = df.drop(df.columns[0], axis=1)
df.describe()

Unnamed: 0,cuisine,recipe_ingredients,recipe_name
count,1687,1687,1687
unique,14,1687,1687
top,greek,"1 1/2 lb fresh apricots, quartered lengthwise ...",Grilled Stuffed Mackerel
freq,166,1,1


In [5]:
msk = (np.random.rand(len(df)) < 0.75)

In [6]:
train = df[msk]
len(train)

1246

In [7]:
test = df[~msk]
len(test)

441

In [8]:
print('train shape: ', train.shape)
print('test shape: ',  test.shape)

train shape:  (1246, 3)
test shape:  (441, 3)


In [9]:
data_train= train['recipe_ingredients']
target_train= train['cuisine']
data_test= test['recipe_ingredients']
target_test= test['cuisine']
print(data_train.shape)
print(target_train.shape)
print(data_test.shape)
print(target_test.shape)

(1246,)
(1246,)
(441,)
(441,)


In [48]:
train

Unnamed: 0,cuisine,recipe_ingredients,recipe_name
0,african,2 tbsp vegetable oil ½ tsp cumin seeds ½ tsp f...,Bunny chow
1,african,"1 tbsp olive or vegetable oil 2 large onions, ...",Jollof rice with fried plantains
2,african,400ml/14fl oz passata 3 tbsp tomato purée 2 fr...,Jollof rice
3,african,4g smoked paprika 2g cayenne pepper 6g ginger...,Suya fillet burger with sweet potato cubes and...
4,african,300g/10½oz basmati rice 1 tbsp vegetable oil 8...,Jollof rice with chicken
5,african,2 tbsp ground ginger 1 tbsp ground cinnamon 1 ...,Red-red stew with spiced plantain
6,african,"50ml/2fl oz olive oil 1 onion, chopped 2 garli...",Bobotie
9,african,"2 tbsp groundnut oil 1 large chicken, jointed ...",Zanzibar chicken
11,african,"1 butternut, peeled and sliced 2 large sweet p...",Marinated vegetables
12,african,5 eggs 5 egg yolks 125g/4oz caster sugar 250g/...,Chocolate mealie meal pudding


### Word Cleaning and Vectorizing

In [19]:
pattern = r"[a-z]+"
data_train = nltk.regexp_tokenize(str(data_train), pattern)
data_test= nltk.regexp_tokenize(str(data_test), pattern)

In [46]:
data_test

['butter',
 'for',
 'greasing',
 'ml',
 'fl',
 'oz',
 'full',
 'fat',
 'mi',
 'bone',
 'in',
 'chicken',
 'thighs',
 'or',
 'drumsticks',
 'large',
 'tbsp',
 'olive',
 'oil',
 'small',
 'onion',
 'chopped',
 'carr',
 'kg',
 'lb',
 'oz',
 'goat',
 'meat',
 'cut',
 'into',
 'cm',
 'in',
 'piec',
 'shoulder',
 'of',
 'lamb',
 'approximately',
 'kg',
 'lb',
 'oz',
 'peppercorns',
 'cardamom',
 'pods',
 'cloves',
 'tbs',
 'g',
 'oz',
 'salt',
 'tsp',
 'coriander',
 'seeds',
 'tsp',
 'fenne',
 'g',
 'oz',
 'ground',
 'ginger',
 'g',
 'oz',
 'garlic',
 'granules',
 'g',
 'oz',
 'dried',
 'black',
 'eyed',
 'beans',
 'g',
 'lb',
 'sirl',
 'small',
 'onion',
 'chopped',
 'cloves',
 'garlic',
 'crushe',
 'g',
 'oz',
 'good',
 'quality',
 'black',
 'cherry',
 'jam',
 'ml',
 'g',
 'oz',
 'digestive',
 'biscuits',
 'g',
 'oz',
 'butter',
 'g',
 'g',
 'oz',
 'dark',
 'chocolate',
 'at',
 'least',
 'cocoa',
 'so',
 'g',
 'oz',
 'self',
 'raising',
 'flour',
 'plus',
 'extra',
 'for',
 'du',
 'g',
 'f

In [34]:
# adapt this to do lower() separately
data_train= [i.lower() for i in data_train]
data_test= [i.lower() for i in data_test]

In [35]:
nltk.download('stopwords')

stopwords_list = stopwords.words('english')
#stopwords_list+= ["''", '""', '...', '``']
stopwords_list+= ['tsp','tbsp',
             'tablespoon', 'teaspoon', 'tablespoons', 'teaspoons', 
             'large', 'cup', 'ounces', 'pound', 'oz', 'slice','sliced', 
             'cup', 'cups', 'ounce', 'ounces', 'chopped', 'finely', 'cut',
            'thinly']

[nltk_data] Downloading package stopwords to /Users/Iffy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# do stopwords step on its own
processed_train=[w for w in data_train if not w in stopwords_list]
processed_test= [w for w in data_test if not w in stopwords_list]

In [38]:
print('cleaned train: ', processed_train)
print('----'*28)
print('cleaned test: ', processed_test)

cleaned train:  ['vegetable', 'oil', 'cumin', 'seeds', 'f', 'olive', 'vegetable', 'oil', 'onions', 'ml', 'fl', 'passata', 'tomato', 'pur', 'e', 'fr', 'g', 'smoked', 'paprika', 'g', 'cayenne', 'pepper', 'g', 'ginger', 'g', 'basmati', 'rice', 'vegetable', 'oil', 'ground', 'ginger', 'ground', 'cinnamon', 'ml', 'fl', 'olive', 'oil', 'onion', 'garli', 'groundnut', 'oil', 'chicken', 'jointed', 'butternut', 'peeled', 'sweet', 'p', 'eggs', 'egg', 'yolks', 'g', 'caster', 'sugar', 'g', 'chicken', 'breasts', 'three', 'long', 'st', 'cloves', 'fennel', 'seeds', 'black', 'olive', 'oil', 'onions', 'g', 'g', 'millet', 'couscous', 'g', 'corn', 'cousc', 'ml', 'fl', 'full', 'fat', 'milk', 'free', 'rang', 'g', 'unsalted', 'butter', 'g', 'caster', 'b', 'g', 'unsalted', 'butter', 'g', 'golden', 'syrup', 'g', 'plain', 'flour', 'g', 'cornmeal', 'yeast', 'caster', 'sugar', 'ml', 'fl', 'ml', 'pint', 'sunflower', 'oil', 'g', 'soft', 'light', 'sweet', 'short', 'crust', 'pastry', 'case', 'packet', 'g', 'plain', 'f

In [39]:
ptrain= processed_train
len(ptrain)

308

In [40]:
# ptest= p.findall(str(processed_test))
ptest= processed_test
len(ptest)

324

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
vectorizer = TfidfVectorizer()

In [52]:
tf_idf_ptrain = vectorizer.fit_transform(ptrain)
tf_idf_ptest = vectorizer.transform(ptest)
print('vectorized trained processed data: ',tf_idf_ptrain.shape)
print('vectorized test processed data: ', tf_idf_ptest.shape)

vectorized trained processed data:  (308, 192)
vectorized test processed data:  (324, 192)


In [35]:
tf_idf_data_train= vectorizer.fit_transform(data_train)
tf_idf_data_test = vectorizer.transform(data_test)
print('vectorized trained data: ', tf_idf_data_train)
print('vectorizer test data: ', tf_idf_data_test)

vectorized trained data:    (0, 2877)	0.32715622836717667
  (0, 3050)	0.05034955389067481
  (0, 2003)	0.031451702542290905
  (0, 2982)	0.11442879678825343
  (0, 1011)	0.06751769769022144
  (0, 2525)	0.11201333878743083
  (0, 1201)	0.09856758613165911
  (0, 281)	0.19713517226331823
  (0, 89)	0.11041046315539162
  (0, 2163)	0.07305098534397828
  (0, 2000)	0.20419723281414084
  (0, 862)	0.0705497470656639
  (0, 2773)	0.06914748960540358
  (0, 1413)	0.05911578065781837
  (0, 754)	0.09155189721018657
  (0, 2225)	0.09658030010987209
  (0, 2759)	0.11445875615343808
  (0, 433)	0.11041046315539162
  (0, 534)	0.07891352833696692
  (0, 1678)	0.0778575699647505
  (0, 2014)	0.050836905128373085
  (0, 1221)	0.1865598907745209
  (0, 844)	0.173630890197736
  (0, 2698)	0.15503238686959475
  (0, 390)	0.14657814679450507
  :	:
  (1245, 1195)	0.1444300634774966
  (1245, 2880)	0.26598571086013223
  (1245, 1924)	0.21007830217244047
  (1245, 2979)	0.12138680163439758
  (1245, 3119)	0.10248642053286368
  (124

In [53]:
non_zero_cols = tf_idf_ptrain.nnz / float(tf_idf_ptrain.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_ptrain.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 0.8896103896103896
Percentage of columns containing 0: 0.9953666125541125


## Finding the Best Model

### Naive Bayes and Random Forest

In [54]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

In [55]:
nb_classifier.fit(tf_idf_ptrain, ptrain)
nb_train_preds = nb_classifier.predict(tf_idf_ptrain)
nb_test_preds = nb_classifier.predict(tf_idf_ptest)

In [56]:
rf_classifier.fit(tf_idf_ptrain, ptrain)
rf_train_preds = rf_classifier.predict(tf_idf_ptrain)
rf_test_preds = rf_classifier.predict(tf_idf_ptest)

In [57]:
nb_train_score = accuracy_score(ptrain, nb_train_preds)
nb_test_score = accuracy_score(ptest, nb_test_preds)
rf_train_score = accuracy_score(ptrain, rf_train_preds)
rf_test_score = accuracy_score(ptest, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.2078 		 Testing Accuracy: 0.1728

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9773 		 Testing Accuracy: 0.5154
