In [1]:
import pandas as pd 
import numpy as np

import tensorflow as tf
import os

from tensorflow.keras.preprocessing.text import Tokenizer   
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

### Loading and Cleaning

In [2]:
# Restaurant review CSVs, one per metro area, hosted on GitHub.
csv_urls = (
    "https://raw.githubusercontent.com/jacobh310/food_nlp/master/data_collection/Los%20Angeles_restaurant.csv",
    "https://raw.githubusercontent.com/jacobh310/food_nlp/master/data_collection/San%20Diego_restaurant.csv",
    "https://raw.githubusercontent.com/jacobh310/food_nlp/master/data_collection/San%20Francisco_restaurant.csv",
    "https://raw.githubusercontent.com/jacobh310/food_nlp/master/data_collection/Orange%20County_restaurant.csv",
)

# Files are read in the order listed above:
# Los Angeles, San Diego, San Francisco, Orange County.
la, sd, sf, oc = (pd.read_csv(url) for url in csv_urls)

In [3]:
def clean_name(col):
    """
    Extract a readable restaurant name from a URL-style cell value.

    When ``col`` is a string containing 'http', the text following the first
    'Reviews' marker is split on '-', the second-to-last piece is taken and
    its underscores are replaced by spaces.

    Parameters
    ----------
    col : object
        Raw value of the restaurant column (normally a string).

    Returns
    -------
    object
        The extracted name, or ``col`` unchanged when it is not a string,
        does not contain 'http', or does not follow the expected
        '...Reviews-<name>-<suffix>' layout.
    """
    # Non-string cells (e.g. NaN) and plain names pass through untouched.
    if not isinstance(col, str) or 'http' not in col:
        return col

    # A URL without the 'Reviews' marker cannot be parsed; keep it as-is
    # rather than failing with an IndexError.
    pieces = col.split('Reviews')
    if len(pieces) < 2:
        return col

    # The name is the second-to-last hyphen-separated piece; without at
    # least two pieces there is nothing to pick.
    parts = pieces[1].split('-')
    if len(parts) < 2:
        return col

    return parts[-2].replace('_', ' ')

In [4]:
# Stack the four city tables into one frame (original row labels are kept).
df = pd.concat([la, sd, oc, sf])

# Scale the raw rating column down by a factor of ten.
df['rating'] = df['rating'].div(10)

# Normalise restaurant names: URL-style entries are reduced by clean_name,
# and any name containing 'In N Out' is mapped to a single canonical label.
restaurant_names = df['restaurant'].map(clean_name)
df['restaurant'] = restaurant_names.map(
    lambda name: 'In-N-Out Burger' if 'In N Out' in name else name
)

# Keep only rows whose review does not parse as a number, then drop rows
# that are missing either the rating or the review.
review_as_number = pd.to_numeric(df['review'], errors='coerce')
df = df[review_as_number.isna()].dropna(subset=['rating', 'review'])

### Model with no pretrained embeddings

In [6]:
def train_dev_test_df(df, splits, seed=42):
    """
    Shuffle a dataframe and split it into train, dev and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review' column (features) and a 'rating' column
        (labels).
    splits : tuple of (float, float)
        Fractions of the rows assigned to the train and dev sets, in that
        order. Whatever remains becomes the test set.
    seed : int, default 42
        Value passed to ``np.random.seed`` before shuffling. Note that this
        reseeds NumPy's global random state.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_dev, y_dev, X_test, y_test)``. The ``X``
        values are NumPy arrays of reviews; the ``y`` values are ``tf.int16``
        constant tensors of ratings.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    train, dev = splits
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected because of
    # floating-point rounding.
    if train < 0 or dev < 0 or train + dev > 1 + 1e-9:
        raise ValueError(
            "splits must be non-negative fractions summing to at most 1, got %r"
            % (splits,)
        )

    # df.sample has no random_state here, so it draws from NumPy's global
    # RNG; seeding that RNG first makes the shuffle reproducible.
    np.random.seed(seed)
    shuffle_df = df.sample(frac=1)

    X = shuffle_df['review'].to_numpy()
    y = shuffle_df['rating'].to_numpy()

    n_rows = X.shape[0]
    train_cutoff = int(n_rows * train)
    dev_cutoff = int(n_rows * (train + dev))

    X_train, y_train = X[:train_cutoff], y[:train_cutoff]
    X_dev, y_dev = X[train_cutoff:dev_cutoff], y[train_cutoff:dev_cutoff]
    X_test, y_test = X[dev_cutoff:], y[dev_cutoff:]

    # NOTE(review): labels are stored as int16 -- confirm ratings are always
    # whole numbers, otherwise this dtype cannot represent them.
    y_train = tf.constant(y_train, dtype=tf.int16)
    y_dev = tf.constant(y_dev, dtype=tf.int16)
    y_test = tf.constant(y_test, dtype=tf.int16)

    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [7]:
# 70% of the rows go to train and 15% to dev; the remainder is the test set.
(X_train, y_train,
 X_dev, y_dev,
 X_test, y_test) = train_dev_test_df(df, (.7, .15))

In [8]:
# Text-preprocessing and embedding settings used by the cells below.

# Passed to Tokenizer(num_words=...) and used as the Embedding input size.
vocab_size = 20000
# Output dimension of the Embedding layers.
embedding_dim = 100
# Every review is padded / truncated to this many tokens (pad_sequences maxlen).
max_length = 200
# Truncation and padding modes handed to pad_sequences.
trunc_type='post'
padding_type='post'
# Token the Tokenizer uses for out-of-vocabulary words.
oov_tok = "<OOV>"

In [25]:
# Fit the word-level tokenizer on the training reviews only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index_token = tokenizer.word_index

# Shared padding settings so the train and dev sequences are shaped the same way.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Reviews -> lists of integer token ids.
X_train_token = tokenizer.texts_to_sequences(X_train)
X_dev_token = tokenizer.texts_to_sequences(X_dev)

# Token-id lists -> fixed-length arrays.
X_train_padded = pad_sequences(X_train_token, **pad_kwargs)
X_dev_padded = pad_sequences(X_dev_token, **pad_kwargs)

In [27]:
# Store the padded token ids as int16 tensors.
X_train_padded, X_dev_padded = (
    tf.cast(padded, dtype=tf.int16)
    for padded in (X_train_padded, X_dev_padded)
)

In [None]:
# TensorBoard logging (written to ./callbacks, with per-epoch histograms).
callback = tf.keras.callbacks.TensorBoard(log_dir='callbacks', histogram_freq=1)

tf.random.set_seed(42)

# Baseline: embeddings learned from scratch, two stacked bidirectional LSTMs
# with dropout, and a small dense head producing a single rating value.
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(10,activation='relu'),
    Dense(1,activation='relu')
])

model.compile(loss='MAE',optimizer=tf.keras.optimizers.Adam() ,metrics=['mae'])

# `callbacks` is documented as a list of Callback instances, so the single
# TensorBoard callback is wrapped in a list.
model.fit(
    X_train_padded,
    y_train,
    batch_size=128,
    epochs=30,
    validation_data=(X_dev_padded, y_dev),
    callbacks=[callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f7dae984f10>

In [None]:
# Dev-set evaluation; returns [loss, mae], which coincide because the model
# was compiled with MAE as both the loss and the metric.
model.evaluate(X_dev_padded, y_dev)



[0.5397411584854126, 0.5397411584854126]

In [None]:
# Print the layer-by-layer architecture and parameter counts of the baseline model.
model.summary()

### Model Using Pretrained GloVe Embeddings

In [28]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Reuse the notebook-wide vocab_size / max_length settings instead of
# repeating the literals, so this vectorizer cannot drift out of sync with
# the Tokenizer-based pipeline.
vectorizer = TextVectorization(max_tokens=vocab_size, output_sequence_length=max_length)

# Learn the vocabulary from the training reviews only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

# Map each vocabulary entry to its integer index in the vectorizer output.
voc = vectorizer.get_vocabulary()
word_index = {word: idx for idx, word in enumerate(voc)}

In [29]:
## Vectorizing the train and dev examples
## Changing the dtype of the samples to int16 for faster processing

X_train_vector, X_dev_vector = (
    tf.cast(vectorizer(texts), tf.int16)
    for texts in (X_train, X_dev)
)

In [30]:
# Size of the vocabulary learned by the TextVectorization layer (capped by max_tokens).
len(voc)

20000

#### Importing embeddings and making matrix

In [41]:
def import_embeddings(path):
    """
    Load word embeddings from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by its vector
    coefficients. Lines that are blank or carry no coefficients are skipped.

    Parameters
    ----------
    path : str
        Location of the embeddings file. A relative path is resolved against
        the user's home directory; an absolute path is used as given
        (``os.path.join`` discards the home prefix in that case).

    Returns
    -------
    dict
        Maps each word to a 1-D float32 NumPy array of its coefficients.
    """
    path_to_glove_file = os.path.join(os.path.expanduser("~"), path)

    embeddings_index = {}
    # Explicit UTF-8 so non-ASCII tokens load identically on every platform
    # instead of depending on the locale's default encoding.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            fields = line.split(maxsplit=1)
            if len(fields) < 2:
                # Blank line or a word with no vector: nothing to store.
                continue
            word, coefs = fields
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    print("Found %s word vectors." % len(embeddings_index))

    return embeddings_index



def make_embedding_matrix(voc, word_index, embedding_dim, embeddings_index):
    """
    Build the weight matrix for an Embedding layer from pretrained vectors.

    Parameters
    ----------
    voc : sized collection
        Vocabulary; only its length is used, to size the matrix.
    word_index : dict
        Maps each word to the row it occupies in the matrix.
    embedding_dim : int
        Number of columns, i.e. the length of each pretrained vector.
    embeddings_index : dict
        Maps words to their pretrained vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(voc) + 2, embedding_dim)``. Rows for words
        missing from ``embeddings_index`` stay all-zero.
    """
    # Two rows more than the vocabulary size; every row starts as zeros, so
    # any word without a pretrained vector keeps an all-zero embedding.
    matrix = np.zeros((len(voc) + 2, embedding_dim))

    found = 0
    for token, row in word_index.items():
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        matrix[row] = vector
        found += 1

    # Everything in word_index that was not found counts as a miss.
    missing = len(word_index) - found
    print("Converted %d words (%d misses)" % (found, missing))
    return matrix
    


### 100D Embeddings

In [42]:
# Load the 100-dimensional vectors once and build one matrix per vocabulary:
# first for the TextVectorization vocabulary, then for the Tokenizer word index.
embeddings_100 = import_embeddings('/content/glove.6B.100d.txt')

embedding_matrix_100 = make_embedding_matrix(
    voc=voc,
    word_index=word_index,
    embedding_dim=100,
    embeddings_index=embeddings_100,
)

embedding_matrix_100_token = make_embedding_matrix(
    voc=word_index_token,
    word_index=word_index_token,
    embedding_dim=100,
    embeddings_index=embeddings_100,
)

Found 400000 word vectors.
Converted 13237 words (6763 misses)
Converted 18169 words (6166 misses)


In [36]:
tf.random.set_seed(42)

# Frozen embedding table initialised from the 100-d pretrained matrix built
# for the TextVectorization vocabulary.
embedding_layer_100 = Embedding(
    input_dim=len(voc) + 2,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_100),
    trainable=False,
)

# Same recurrent stack as the baseline, assembled layer by layer.
model_2 = tf.keras.Sequential()
model_2.add(embedding_layer_100)
model_2.add(Bidirectional(LSTM(64, return_sequences=True)))
model_2.add(Dropout(0.5))
model_2.add(Bidirectional(LSTM(32)))
model_2.add(Dropout(0.5))
model_2.add(Dense(10, activation='relu'))
model_2.add(Dense(1, activation='relu'))

model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='MAE',
    metrics=['mae'],
)

model_2.fit(X_train_vector, y_train, epochs=30, batch_size=128)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f966ff34f10>

In [37]:
# Dev-set evaluation; returns [loss, mae], identical here because MAE is
# both the compiled loss and the metric.
model_2.evaluate(X_dev_vector, y_dev)



[0.4808960258960724, 0.4808960258960724]

In [38]:
# Print the layer-by-layer architecture and parameter counts of model_2.
model_2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         2000200   
_________________________________________________________________
bidirectional_6 (Bidirection (None, None, 128)         84480     
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                650       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                

In [44]:
tf.random.set_seed(42)

embedding_layer_100_token = Embedding(
    len(word_index_token)+2,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_100_token),
    trainable=False,
)

model_token = tf.keras.Sequential([
    embedding_layer_100_token,
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(10,activation='relu'),
    Dense(1,activation='relu')
])

model_token.compile(loss='MAE',optimizer=tf.keras.optimizers.Adam() ,metrics=['mae'])

model_token.fit(X_train_padded, y_train, batch_size=128, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f9649ecbc10>

In [45]:
# Dev-set evaluation; returns [loss, mae], identical here because MAE is
# both the compiled loss and the metric.
model_token.evaluate(X_dev_padded, y_dev)



[0.487623929977417, 0.487623929977417]

In [46]:
# Print the layer-by-layer architecture and parameter counts of model_token.
model_token.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 100)         2433700   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, None, 128)         84480     
_________________________________________________________________
dropout_12 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 64)                41216     
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 10)                650       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                

#### Using 200D word Embedding

In [None]:
# Load the 200-dimensional vectors and build the matrix for the
# TextVectorization vocabulary.
embeddings_200 = import_embeddings('/content/glove.6B.200d.txt')

embedding_matrix_200 = make_embedding_matrix(
    voc=voc,
    word_index=word_index,
    embedding_dim=200,
    embeddings_index=embeddings_200,
)

In [26]:
tf.random.set_seed(42)

# Size the embedding layer from the pretrained matrix itself. The previous
# `len_voc` / `embedding_dim_200` names were not defined anywhere in this
# notebook, so the cell failed on a fresh kernel; reading the shape also
# guarantees the layer matches the Constant initializer.
num_tokens_200, embedding_dim_200 = embedding_matrix_200.shape

# Frozen embedding table initialised from the 200-d pretrained matrix.
embedding_layer_200 = Embedding(
    num_tokens_200,
    embedding_dim_200,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_200),
    trainable=False,
)

# Wider recurrent stack than the 100-d models, with an L2-regularised
# dense layer ahead of the single-unit output.
model_3 = tf.keras.Sequential([
    embedding_layer_200,
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(5,activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2=0.05)),
    Dense(1,activation='relu')
])

model_3.compile(loss='MAE',optimizer=tf.keras.optimizers.Adam() ,metrics=['mae'])

model_3.fit(X_train_vector, y_train, batch_size=128, epochs=25)

24335