# Restaurant Classifier 
#### Lila Kosowsky and Sarah Moore

In [105]:
%matplotlib inline 

import random 
import tensorflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Embedding, LSTM
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.datasets import mnist
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')

Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x7ffea1aa48b0>
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/nltk/data.py", line 1160, in __del__
    if not self.closed:
  File "/Applications/anaconda3/lib/python3.9/site-packages/nltk/data.py", line 1180, in closed
    return self.stream.closed
AttributeError: 'SeekableUnicodeStreamReader' object has no attribute 'stream'
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilakosowsky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lilakosowsky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [106]:
##load pretrained word2vec model 
filename = '/Users/lilakosowsky/Desktop/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [107]:
##import data
data_file = '/Users/lilakosowsky/Desktop/Neural Nets/TA_restaurants_curated.csv'
data = pd.read_csv(data_file)

##isolate the data we will be using 
data = data[['Name', 'Cuisine Style']]

##drop rows with null values 
data = data.dropna()
data = data.reset_index(drop = True)

##drop rows with non-ascii characters 
for index, row in data.iterrows():
    if not all(ord(c) < 128 for c in str(row)):
        data.drop(index, inplace=True)
data = data.reset_index(drop = True)

In [200]:
##makes sure that only the selected cuisines are included
def filterCuisines(entry):
    new_cuisines = []
    tags = entry.split(", ")
    for tag in tags:
        if tag in selected_cuisines:
            new_cuisines.append(tag)
    if not new_cuisines:
        return None 
    return new_cuisines

def preprocess(entry):
    ##remove punctuation
    entry = re.sub(r'[^\w\s]', '', entry)
    
    ##convert to lowercase
    entry = entry.lower()
    
    ##remove stopwords 
    stopwords = nltk.corpus.stopwords.words('english')
    entry = ' '.join([word for word in entry.split() if word not in stopwords])
    return entry

def tokenize(entry):
    ##tokenize into individual words 
    words = nltk.word_tokenize(entry)
    return words 

##gets vector of each word from google word2vec model
def vectorize(entry):
    vector = []
    for word in entry:
        if word in model.key_to_index.keys():
            if model[word] is not None:
                vector.append(model[word])
    return vector

#takes average of all word vectors in an entry
def average(entry):
    if len(entry) != 0: 
        return sum(entry) / len(entry)

##returns only the first word of an entry
def getFirst(entry):
        return entry[0]


##top 21 cuisines (filtered out generic descriptors such as "vegan friendly")
selected_cuisines = ["'Mediterranean'", "'Italian'", "'Bar'", "'French'", "'Asian'", "'Pizza'", "'Spanish'", "'Pub'", "'Cafe'", "'Fast Food'", "'British'", "'Central European'", "'Chinese'", "'Sushi'", "'American'", "'Portugese'", "'Indian'", "'Middle Eastern'", "'Thai'"]

##filter out extra cuisines
data['filtered_cuisines'] = data['Cuisine Style'].apply(filterCuisines)
data = data.dropna()
data = data.reset_index(drop = True)

##isolate the first word for the restaurant names 
data['first_cuisine'] = data['filtered_cuisines'].apply(getFirst)

print(data['first_cuisine'].value_counts())

'Mediterranean'       4904
'Pizza'               3627
'Bar'                 2849
'British'             2550
'Asian'               2512
'Spanish'             1794
'Central European'    1471
'Cafe'                1467
'Sushi'               1309
'Fast Food'           1246
'Pub'                 1011
'Middle Eastern'       726
'Thai'                 717
'Italian'              633
'French'               563
'American'             550
'Indian'               150
'Chinese'              136
Name: first_cuisine, dtype: int64


In [143]:
##process data
data['cuisines_processed'] = data['first_cuisine'].apply(preprocess)
data['names_processed'] = data['Name'].apply(preprocess)

##tokenize data
data['cuisines_tokenized'] = data['cuisines_processed'].apply(tokenize)
data['names_tokenized'] = data['names_processed'].apply(tokenize)

##get vector values for data
data['cuisine_vector'] = data['cuisines_tokenized'].apply(vectorize)
data['name_vector'] = data['names_tokenized'].apply(vectorize)

##get average vector value for the cuisine types
data['average_cuisine'] = data['cuisine_vector'].apply(average)
data['average_name'] = data['name_vector'].apply(average)

##filter out and last null values
data = data.dropna()
data = data.reset_index(drop = True)

##split into X and y
X = data['average_name']
y = data['average_cuisine']

In [145]:
##split into test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 123)

##create model 
model2 = Sequential()
model2.add(Dense(1000, input_dim = 300, activation='ReLU'))
model2.add(Dropout(0.5))
model2.add(Dense(900, activation='ReLU'))
model2.add(Dropout(0.5))
model2.add(Dense(750, activation='ReLU'))
model2.add(Dropout(0.5))
model2.add(Dense(500, activation='ReLU'))
model2.add(Dropout(0.5))
model2.add(Dense(300, activation='ReLU')) 
model2.compile(optimizer='adam',
             loss='mse', 
             metrics=['accuracy'])

##reshape input data
X_train = np.array(X_train.tolist())
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
y_train = np.array(y_train.tolist())
y_train = y_train.reshape(y_train.shape[0], 300) # reshape the output to match the new dimension

##train model
model2.fit(X_train, y_train, epochs=25, batch_size=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7ffdd1a1df70>

In [146]:
##reshape test data
X_test = np.array(X_test.tolist())
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
y_test = np.array(y_test.tolist())
y_test = y_test.reshape(y_test.shape[0], 300) # reshape the output to match the new dimension

##evaluate model 
score = model2.evaluate(X_test, y_test, batch_size=5)
print(score[0])
print(score[1])

##save model
model2.save('my_model2')

0.02513277530670166
0.18802055716514587




INFO:tensorflow:Assets written to: my_model2/assets


INFO:tensorflow:Assets written to: my_model2/assets


In [206]:
##Set a random seed for reproducibility
np.random.seed(42)
tensorflow.random.set_seed(42)

##load saved model
model2 = keras.models.load_model('my_model2')

##plug in new restaurant name
restaurant_name = 'South End Pita'

def getPrediction(new_data):
    processed_data = preprocess(new_data)
    tokenized_data = tokenize(processed_data)
    vectorized_data = vectorize(tokenized_data)
    average_data = average(vectorized_data)
    if average_data is None:
        print("restaurant name not found in word2vec model")
        return
    else:
        input_data = tensorflow.reshape(average_data, (1, 300))
        ##process name to plug into the model 
        input_data = process(restaurant_name)

        ##get prediction from model
        prediction = model2.predict(input_data)

        ##process prediction to plug into word2vec model
        cuisine = tensorflow.reshape(prediction,(300,))
        cuisine = cuisine.numpy()

        ##get most similar cuisine
        similarities = y.apply(lambda y: np.dot(y, cuisine) / (np.linalg.norm(y) * np.linalg.norm(cuisine)))

        ##Print out the most similar word
        print(model.most_similar(y[similarities.idxmax()]))
        return

print(restaurant_name)
getPrediction(restaurant_name)        

South End Pita
[('mediterranean', 1.0), ('Mediterranean', 0.6283091902732849), ('Mediterrean', 0.582373321056366), ('Mediterannean', 0.5725836157798767), ('Mediteranean', 0.5462020635604858), ('Balkan_peninsula', 0.5421025156974792), ('Meditteranean', 0.5334994792938232), ('Mediterranean_basin', 0.527664303779602), ('Monemvasia', 0.5249188542366028), ('Tyrrhenian', 0.5243752598762512)]
