# Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd, re
from datetime import datetime
from tqdm import tqdm
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Import Data Files

In [2]:
ctags = pd.read_csv('Cuisine Corpus.csv')
print(len(ctags))
ctags.head()

12964


Unnamed: 0,cuisineName,tagName
0,American,pretzel
1,American,brisket
2,American,bagel
3,American,pretzels
4,American,american goulash


In [3]:
ctags = dict(ctags.groupby('cuisineName')['tagName'].apply(list))

In [4]:
df = pd.read_csv('Menu Items.csv')
print("Number of Rows: ", len(df))
df.head()

Number of Rows:  537824


Unnamed: 0,name,description
0,Mutton Shawarma In Samoli,
1,Veg Shawarma,
2,Chicken Shawarma With Salad In Rumali,
3,Chicken Shawarma Without Salad In Rumali,
4,Veg Clear Soup,


# Data Wrangling

In [5]:
stop_words_str = '- Hot , Hot , 4 Pcs , Chefs Special , spl. , spl , special , pack , 6 Pieces , 1 Piece , 2 Pcs , Gravy , wet , dry,  spicy, Dry,  2 in 1 , Small , Big , 250 GM , 1000ML , 750ml , 250ml  , 500ml , non-vegetarian, non-vegetarien, vegetarian, vegetariens, non veg, with Butter , Full , Half, jumbo, Platter , Combo, combo, comb, com, box , singles, single, half, double, doubles, dbl,  6", 7'', 7", 8", 9",10", 11", 12", 13" , 14", 15", 16", 17", 18", 19", 20", 21", 22", 23", 24", 25", 26", 27", 28", 29" , 30", boneless, bone-less , classic, jains, jain, -veg, 6pc, large, small, meal '
stop_words_lst = stop_words_str.split(',')
stop_words = [x.lower().strip() for x in stop_words_lst]

In [6]:
def useCorpus(name):
    
    # convert to lower case
    name = name.lower()
    
    orgname = name
    
    # remove special characters
    name = re.sub("[\(\[].*?[\)\]]", '', name).replace('-',' ').replace("'s", '').strip()
    
    # remove stop words
    namelist = name.split(' ')
    for word in namelist:
        if word in stop_words:
            name = name.replace(word, '').strip()

    # get first part of any combo item with '+' or 'with' in the name
    name = name.split('+', 1)[0]
    name = name.split('with', 1)[0]
    name = name.strip()
    
    # check for null values
    if len(name) == 0:
        return np.nan
    
    # check for special case
    cuisine = checkForSpecialCases(name)
    if len(cuisine) > 0:
        return cuisine
    
    # get cuisine tag by iterationg over the corpus and finding full name 
    for cuisine, tags in ctags.items():
        if (name in tags) or (orgname in tags):
            return cuisine
    
    # apply word_tokenize method from nltk to get a list of individual words in the name
    tokname = word_tokenize(name)
    
    # get cuisine tag by iterationg over the corpus and finding last word
    lastword = tokname[-1]
    for cuisine, tags in ctags.items():
        if lastword in tags :
            return cuisine
    
    return np.nan

In [7]:
def checkForSpecialCases(name):
    tag = ''
    if name.endswith('fried rice'):
        tag = 'Fast Food'
    elif name.endswith('ice cream'):
        tag = 'Ice Cream'
    elif name.endswith('cup cake'):
        tag = 'Bakery'
    elif name.endswith('paani poori') or name.endswith('pani poori'):
        tag = 'Fast Food'
    elif name.endswith('garlic bread'):
        tag = 'Italian'
    elif name == 'poori':
        tag = 'South Indian'
        
    return tag

### Get cuisine of each item using the scraped corpus saved in 'ctags' dictionary

In [8]:
df['cuisine'] = df['name'].apply(useCorpus)
df.head()

Unnamed: 0,name,description,cuisine
0,Mutton Shawarma In Samoli,,Fast Food
1,Veg Shawarma,,Fast Food
2,Chicken Shawarma With Salad In Rumali,,Fast Food
3,Chicken Shawarma Without Salad In Rumali,,Fast Food
4,Veg Clear Soup,,Chinese


### Check for category imbalance

In [9]:
df['cuisine'].value_counts()

Fast Food       158352
Beverages        88615
North Indian     67904
Chinese          56338
South Indian     42486
Biryani          30646
Italian          22912
Bakery           20791
Desserts         17900
Chicken          16862
Ice Cream         8267
Eggetarian        6751
Name: cuisine, dtype: int64

### The categories balance seems to be good enough to work with.

### We have our data ready for training a ML classifier model now!!

# Text Classifier Modelling

### Feature Engineering

In [10]:
def extractBaseName(name):
    # convert to lower case
    name = name.lower()
    
    # remove special characters
    name = name.replace('(',' ').replace(')',' ').replace('-', ' ').replace("'s", '').replace('  ', ' ').strip()
    
    return name.strip()

In [11]:
df['engineeredName'] = df['name'].apply(extractBaseName)

In [12]:
df.head()

Unnamed: 0,name,description,cuisine,engineeredName
0,Mutton Shawarma In Samoli,,Fast Food,mutton shawarma in samoli
1,Veg Shawarma,,Fast Food,veg shawarma
2,Chicken Shawarma With Salad In Rumali,,Fast Food,chicken shawarma with salad in rumali
3,Chicken Shawarma Without Salad In Rumali,,Fast Food,chicken shawarma without salad in rumali
4,Veg Clear Soup,,Chinese,veg clear soup


In [13]:
# x : engineered names array
x = df['engineeredName'].values

# y : respective cuisine array
y = df['cuisine'].values

In [14]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((430259,), (107565,), (430259,), (107565,))

In [15]:
# Multiple Models dataframe for performance analysis
models = {}

### Logistic Regression

In [16]:
print("Training Model LogisticRegression...")
start = datetime.now()

# Make pipeline
lr = Pipeline([('tfidf', TfidfVectorizer(min_df=10)), # tfidf vectorizer to convert text array into a sparse matrix
               ('clf', LogisticRegression(n_jobs=1, C=1e5)), # logistic regression classifier
                ])

# fit the model
lr.fit(X_train, y_train) 

model_train_time = datetime.now() - start
print('Model Training completed in {}'.format(model_train_time))

Training Model LogisticRegression...
Model Training completed in 0:00:44.790619


In [17]:
# make predictions on the test data , i.e., X_test
y_pred = lr.predict(X_test)

# Measure and print model accuracy
models['Logistic Regression']  = np.round(accuracy_score(y_pred, y_test), 3)
print('Model LogisticRegression Accuracy : {}'.format(models['Logistic Regression']))

Model LogisticRegression Accuracy : 0.975


### Multinomial Naive Bayes

In [18]:
print("Training Model NaiveBayes...")
start = datetime.now()

# Make pipeline
nb = Pipeline([('tfidf', TfidfVectorizer(min_df=10)),
               ('clf', MultinomialNB()),
                ])

# fit the model
nb.fit(X_train, y_train)

model_train_time = datetime.now() - start
print('Model Training completed in {}'.format(model_train_time))

Training Model NaiveBayes...
Model Training completed in 0:00:04.323907


In [19]:
# make predictions on the test data , i.e., X_test
y_pred = nb.predict(X_test)

# Measure and print model accuracy
models['Naive Bayes']  = np.round(accuracy_score(y_pred, y_test), 3)
print('Model NaiveBayes Accuracy : {}'.format(models['Naive Bayes']))

Model NaiveBayes Accuracy : 0.922


### Random Forest

In [20]:
print("Training Model RandomForest...")
start = datetime.now()

# Make pipeline
rf = Pipeline([('tfidf', TfidfVectorizer(min_df=10)),
               ('clf', RandomForestClassifier(n_estimators=400, max_depth=8)),
                ])

# fit the model
rf.fit(X_train, y_train)

model_train_time = datetime.now() - start
print('Model Training completed in {}'.format(model_train_time))

Training Model RandomForest...
Model Training completed in 0:01:07.712986


In [21]:
# make predictions on the test data , i.e., X_test
y_pred = rf.predict(X_test)

# Measure and print model accuracy
models['Random Forest']  = np.round(accuracy_score(y_pred, y_test), 3)
print('Model RandomForest Accuracy : {}'.format(models['Random Forest']))

Model RandomForest Accuracy : 0.443


### Support Vector Machine

In [22]:
print("Training Model SupportVectorMachine...")
start = datetime.now()

# Make pipeline
svm = Pipeline([('tfidf', TfidfVectorizer(min_df=10)),
               ('clf', SGDClassifier(random_state=42)),
                ])

# fit the model
svm.fit(X_train, y_train)

model_train_time = datetime.now() - start
print('Model Training completed in {}'.format(model_train_time))

Training Model SupportVectorMachine...
Model Training completed in 0:00:09.123460


In [23]:
# make predictions on the test data , i.e., X_test
y_pred = svm.predict(X_test)

# Measure and print model accuracy
models['Support Vector Machine']  = np.round(accuracy_score(y_pred, y_test), 3)
print('Model SupportVectorMachine Accuracy : {}'.format(models['Support Vector Machine']))

Model SupportVectorMachine Accuracy : 0.958


## Model Performance Comparision

In [24]:
pd.DataFrame(models, index=['Accuracy'])

Unnamed: 0,Logistic Regression,Naive Bayes,Random Forest,Support Vector Machine
Accuracy,0.975,0.922,0.443,0.958


#### From the above results, we can select the Logistic Regression model as the best classifier with an accuracy of 97.5%

In [25]:
#FINAL MODEL
text_classifier_model = lr

In [26]:
y_pred = text_classifier_model.predict(X_test)
print("Accuracy score: ", accuracy_score(y_pred, y_test))
print("Precision score: ", precision_score(y_pred, y_test, average='macro'))
print("Recall score: ", recall_score(y_pred, y_test, average='macro'))

Accuracy score:  0.9746107005066704
Precision score:  0.9624725459415823
Recall score:  0.9678355539425282


### Classification report of each category for the Logistic Regression Model

In [27]:
target_tags = df['cuisine'].unique().tolist()
class_report_df = pd.DataFrame(classification_report(y_test, lr.predict(X_test), target_names=target_tags, output_dict=True))
class_report_df

Unnamed: 0,Fast Food,Chinese,Chicken,North Indian,Italian,Desserts,Bakery,Beverages,Eggetarian,South Indian,Biryani,Ice Cream,accuracy,macro avg,weighted avg
precision,0.968652,0.993142,0.988642,0.903093,0.97275,0.968311,0.955882,0.974301,0.969584,0.984398,0.97418,0.961091,0.974611,0.967836,0.974479
recall,0.970759,0.993367,0.994776,0.851995,0.96146,0.94717,0.935252,0.981747,0.97564,0.990538,0.97375,0.973217,0.974611,0.962473,0.974611
f1-score,0.969704,0.993254,0.991699,0.8768,0.967072,0.957624,0.945455,0.97801,0.972603,0.987458,0.973965,0.967116,0.974611,0.965063,0.974504
support,4138.0,17639.0,6125.0,3358.0,11287.0,3710.0,1390.0,31666.0,1601.0,4650.0,13600.0,8401.0,0.974611,107565.0,107565.0


# Conclusion

### Making predictions for menu items using the Model

In [28]:
# input any item for demonstrating model output : eg: mango milkshake
item = ['mango milkshake']

# make model prediction on the input item
cuisine = svm.predict(np.array(item)).tolist()

# print results
print("Item: {}".format(item))
print("Cuisine: {}".format(cuisine))

Item: ['mango milkshake']
Cuisine: ['Beverages']


#### More complicated inputs

In [29]:
# input in list form
item = ['pizza dosa', 'egg fried rice', 'samosa chutney', 'chicken biryani with 2 pcs']

cuisine = svm.predict(np.array(item)).tolist()

print("Item: {}".format(item))
print("Cuisine: {}".format(cuisine))

Item: ['pizza dosa', 'egg fried rice', 'samosa chutney', 'chicken biryani with 2 pcs']
Cuisine: ['South Indian', 'Fast Food', 'Fast Food', 'Biryani']
