# Sentiment Classification of Yelp Reviews

## Loading libraries and datasets

In [1]:
import os
import re
import string
import numpy as np 
import tensorflow as tf

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Pull the Yelp polarity reviews from tensorflow_datasets
import tensorflow_datasets as tfds

# The provided test set is cut in two: its first half becomes the validation
# set and its second half is kept as the actual test set.
train_ds, info = tfds.load(
    'yelp_polarity_reviews',
    split='train',
    as_supervised=True,
    with_info=True,
)
valid_ds = tfds.load(
    'yelp_polarity_reviews',
    split='test[:50%]',
    as_supervised=True,
)
test_ds = tfds.load(
    'yelp_polarity_reviews',
    split='test[50%:]',
    as_supervised=True,
)

In [3]:
# Peek at the first two raw (review, label) pairs of the training split
raw_preview = train_ds.take(2)
for review, sentiment in raw_preview.as_numpy_iterator():
    print("Text - {}".format(review))
    print("Label - {}".format(sentiment))

Text - b"The Groovy P. and I ventured to his old stomping grounds for lunch today.  The '5 and Diner' on 16th St and Colter left me with little to ask for.  Before coming here I had a preconceived notion that 5 & Diners were dirty and nasty. Not the case at all.\\n\\nWe walk in and let the waitress know we want to sit outside (since it's so nice and they had misters).  We get two different servers bringing us stuff (talk about service) and I ask the one waitress for recommendations.  I didn't listen to her, of course, and ordered the Southwestern Burger w/ coleslaw and started with a nice stack of rings.\\n\\nThe Onion Rings were perfectly cooked.  They looked like they were prepackaged, but they were very crispy and I could actually bite through the onion without pulling the entire thing out (don't you hate that?!!!)\\n\\nThe Southwestern Burger was order Medium Rare and was cooked accordingly.  Soft, juicy, and pink with a nice crispy browned outer layer that can only be achieved on 

## Removing some common stop words from the given datasets

In [5]:
# Lower-case English stop words (including common contractions) that are
# stripped from the reviews before tokenization.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
]

In [6]:
# Function to eliminate stopwords
def remove_stopwords(text, label):
    """Lower-case a review and drop every space-delimited stop word from it.

    Meant to be passed to `Dataset.map` on `(text, label)` pairs.

    Args:
        text: string tensor holding the review text.
        label: label paired with the review; passed through untouched.

    Returns:
        A `(text, label)` tuple where `text` is lower-cased, carries one extra
        leading space, and has each stop word that was surrounded by single
        spaces replaced by a single space.
    """
    # The leading space lets a stop word at the very beginning of the review
    # match the " word " shape as well.
    text = tf.strings.lower(" " + text)

    # One pattern instead of one regex op per stop word: a space followed by
    # one or more "<stopword><space>" groups. Matching the whole run at once
    # also removes back-to-back repeats such as "had had", which a plain
    # " had " replacement misses because the first match consumes the space
    # the second occurrence needs.
    alternatives = "|".join(re.escape(word) for word in stopwords)
    stopword_run = " (?:(?:" + alternatives + ") )+"
    text = tf.strings.regex_replace(text, stopword_run, " ")
    return text, label

In [7]:
# Strip stop words from every split. The per-review regex work is independent
# across elements, so let tf.data run it in parallel; map() keeps the element
# order deterministic by default.
train_ds = train_ds.map(remove_stopwords, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.map(remove_stopwords, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(remove_stopwords, num_parallel_calls=tf.data.AUTOTUNE)

In [8]:
# Same two examples as before, now with the stop words filtered out
filtered_preview = train_ds.take(2)
for review, sentiment in filtered_preview.as_numpy_iterator():
    print("Text - {}".format(review))
    print("Label - {}".format(sentiment))

Text - b" groovy p. ventured old stomping grounds lunch today.  '5 diner' 16th st colter left little ask for.  coming preconceived notion 5 & diners dirty nasty. not case all.\\n\\nwe walk let waitress know want sit outside (since nice misters).  get two different servers bringing us stuff (talk service) ask one waitress recommendations.  didn't listen her, course, ordered southwestern burger w/ coleslaw started nice stack rings.\\n\\nthe onion rings perfectly cooked.  looked like prepackaged, crispy actually bite onion without pulling entire thing (don't hate that?!!!)\\n\\nthe southwestern burger order medium rare cooked accordingly.  soft, juicy, pink nice crispy browned outer layer can achieved well used grill.  creaminess chipotle mayo paired beautifully green chiles.  unfortunately, ate many onion rings, couldn't finish burger.  shame!\\n\\nthe coleslaw just like it.  hard find really good coleslaw.  prefer mine slightly sweet, not sour.  much vinegar slaw ruins opinion.  slaw pe

In [9]:
# The tokenizer only needs the review text, so drop the label from each pair
train_ds_text = train_ds.map(lambda review, _sentiment: review)

## Tokenizing the train data using the TextVectorization method

In [10]:
VOCAB_SIZE = 10000         # Maximum vocabulary list to consider
MAX_SEQUENCE_LENGTH = 120  # sentences are truncated to this length; padding is taken care of automatically

# Text -> fixed-length sequence of integer token indices
tokenize_layer = TextVectorization(
    output_mode='int',
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

In [11]:
# Tokenization using the train data only (DONT USE VALID/TEST DATA - DONT WANT DATA LEAKAGE)
# Feed adapt() batches of reviews rather than one review per step; this matches
# how the second vectorizer is adapted later in the notebook.
tokenize_layer.adapt(train_ds_text.batch(2048))

### Checking the tokenize_layer

In [12]:
# Map a single (text, label) pair to (token indices, label)
def tokenize_text(text, label):
    """Vectorize one review with `tokenize_layer`.

    A trailing axis is added to `text` before the layer is called, and the
    first row of the layer's output is returned together with the unchanged
    `label`.
    """
    expanded = tf.expand_dims(text, -1)
    token_ids = tokenize_layer(expanded)
    return token_ids[0], label

In [13]:
# Grab the first (text, label) pair of the training split as a worked example
first_pair = next(iter(train_ds))
example1, label1 = first_pair
print("Example_text--> ", example1)
print("Label--> ", label1)

Example_text-->  tf.Tensor(b" groovy p. ventured old stomping grounds lunch today.  '5 diner' 16th st colter left little ask for.  coming preconceived notion 5 & diners dirty nasty. not case all.\\n\\nwe walk let waitress know want sit outside (since nice misters).  get two different servers bringing us stuff (talk service) ask one waitress recommendations.  didn't listen her, course, ordered southwestern burger w/ coleslaw started nice stack rings.\\n\\nthe onion rings perfectly cooked.  looked like prepackaged, crispy actually bite onion without pulling entire thing (don't hate that?!!!)\\n\\nthe southwestern burger order medium rare cooked accordingly.  soft, juicy, pink nice crispy browned outer layer can achieved well used grill.  creaminess chipotle mayo paired beautifully green chiles.  unfortunately, ate many onion rings, couldn't finish burger.  shame!\\n\\nthe coleslaw just like it.  hard find really good coleslaw.  prefer mine slightly sweet, not sour.  much vinegar slaw rui

In [14]:
# Token-index form of the example above
vectorized_example = tokenize_text(example1, label1)[0]
print("vectorized example text:", vectorized_example)

vectorized example text: tf.Tensor(
[   1 3840 4951  170    1 3935  117  395  115 1659 7000 1712    1  128
   38  176  443  249    1    1  115 2279  545 1311    2  715    1  298
  217  200   40   58  372  270   86   28 6249    8   45  153  578 1647
   18  422  609   12  176    9  200 2194   24 1776  887  336   27 6937
  147  990 2790  323   28 3624    1  731 1498  613  305  175    6 6050
  688  152  588  731  234 3695  468  100   19  744 7498 6937  147   30
  740  943  305 5579  820 1208 1674   28  688 9745 6708 3184   21    1
   33  204  978    1 1613 1704 3352 2805  486 7115  608  401  101  731
 1498  231  779  147    1 2790    7    6], shape=(120,), dtype=int64)


In [15]:
# Checking a few of the indices to see if the mapping is as expected --->
# Fetch the vocabulary once and label each line with the index that is
# actually looked up (1 and 3840 both appear in the vector printed above).
vocabulary = tokenize_layer.get_vocabulary()

print("1 --> {}".format(vocabulary[1]))
print("3840 --> {}".format(vocabulary[3840]))
print("Vocabulary size --> {}".format(len(vocabulary)))

1 --> [UNK]
2 --> p
Vocabulary size --> 10000


One can, in principle, convert all the text data into integer vectors (as in the example above) before feeding it into the model. This can speed up training, but the token-to-index mapping then has to be saved separately for later inference, because it is specific to the trained model. To avoid that, we include the TextVectorization layer as part of the model itself. For anyone interested, the code that converts the data into indices before feeding it to the model is provided in the cell below, but it is commented out here.

In [None]:
# NOTE: everything in this cell sits inside a triple-quoted string, so none of
# it executes. It is kept only as a reference for the alternative workflow in
# which the text is converted to token indices before it reaches the model.
'''

# Converting texts into token indices learned from the training files

int_train_ds = train_ds.map(tokenize_text)
int_valid_ds = valid_ds.map(tokenize_text)
int_test_ds = test_ds.map(tokenize_text)


for text_vec, _ in int_train_ds.take(2).as_numpy_iterator():
  print("Text - {}".format(text_vec))
  

# cache the prepared dataset in memory
int_train_ds = int_train_ds.cache()
int_valid_ds = int_valid_ds.cache()
int_test_ds = int_test_ds.cache()

# Preparing the dataset to be loaded into the model 
BUFFER_SIZE = 300000
BATCH_SIZE = 128

# for model without vectorization layer 
train_dataset = int_train_ds.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = int_valid_ds.batch(4).prefetch(tf.data.AUTOTUNE)
test_dataset = int_test_ds.batch(32).prefetch(tf.data.AUTOTUNE)


model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = VOCAB_SIZE, 
                                                       output_dim = 64, input_length = MAX_SEQUENCE_LENGTH),
                             tf.keras.layers.Conv1D(128,5,activation='relu'),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(32, activation='relu'),
                             tf.keras.layers.Dense(1,activation='sigmoid')])
'''

In [16]:
# Keep the stop-word-filtered splits in memory so the filtering is not redone
# every time the data is iterated
train_ds, valid_ds, test_ds = (
    split.cache() for split in (train_ds, valid_ds, test_ds)
)

In [17]:
# Input pipelines for the model that contains the vectorization layer
BUFFER_SIZE = 300000
BATCH_SIZE = 128

# Training data: shuffle first, then batch the shuffled stream and prefetch
shuffled_train = train_ds.shuffle(BUFFER_SIZE)
train_dataset = shuffled_train.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Validation / test data are only batched and prefetched (no shuffling)
validation_dataset = valid_ds.batch(4).prefetch(tf.data.AUTOTUNE)
test_dataset = test_ds.batch(32).prefetch(tf.data.AUTOTUNE)

In [18]:
# Raw strings in, sentiment probability out: the vectorizer is the first layer
# so the saved model can consume text directly.
classifier_layers = [
    tf.keras.Input(shape=(1,), dtype=tf.string),
    tokenize_layer,
    tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64,
        input_length=MAX_SEQUENCE_LENGTH,
    ),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(classifier_layers)

In [19]:
# Print the layer-by-layer output shapes and parameter counts of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 120)               0         
_________________________________________________________________
embedding (Embedding)        (None, 120, 64)           640000    
_________________________________________________________________
conv1d (Conv1D)              (None, 116, 128)          41088     
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 685,249
Trainable params: 685,249
Non-trainable params: 0
__________________________________________________

In [20]:
# Binary cross-entropy goes with the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train for a fixed number of epochs, validating after each one; the returned
# history object is used for the learning-curve plots further down.
NUM_EPOCHS = 5
history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=validation_dataset,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Score the trained model on the held-out test split; the result is
# [loss, accuracy], following the loss and metric passed to compile().
model.evaluate(test_dataset)



[0.15592601895332336, 0.9408420920372009]

In [None]:
# Export the model (vectorization layer included) in the TensorFlow 'tf' save format
model_name = 'sentiment_classifier_withVectorizer'
saved_model_path = "./my_models/{}".format(model_name)

model.save(saved_model_path, save_format='tf')

In [None]:
# Reload the exported model from disk and inspect its architecture
reload_path = "./my_models/{}".format('sentiment_classifier_withVectorizer')
new_model = tf.keras.models.load_model(reload_path)

new_model.summary()

In [None]:
# Evaluate the reloaded model on the same test dataset, for comparison with
# the in-memory model's result
new_model.evaluate(test_dataset)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object whose `.history` mapping holds the per-epoch values
      (as returned by `model.fit`).
    string: key of the metric to plot, e.g. 'accuracy' or 'loss'; the
      validation curve is read from the key 'val_' + string.
  """
  validation_key = 'val_'+string
  for curve_key in (string, validation_key):
    plt.plot(history.history[curve_key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, validation_key])
  plt.show()

# Learning curves: accuracy first, then loss
for metric_name in ('accuracy', 'loss'):
  plot_graphs(history, metric_name)

Even though the accuracies are high, there is substantial overfitting after only the first 2 epochs if the stopwords are not removed (one can check this by keeping the stopwords in the splits). After removing the stopwords (which is what we do here), we still get overfitting, but at least it now appears only towards the end of training.

When we need to deploy the model, it is better to include as many of the preprocessing steps as possible in the model itself. In the earlier model we did part of this by making the TextVectorization layer part of the final model. We can go further by passing a custom callable to the 'standardize' parameter of the TextVectorization layer. This will significantly increase the training time, but it is probably worth it later when we do inference. Let's take a look at it.

## TextVectorization with a custom standardize

In [23]:
# Function to be fed to the 'standardize' parameter.
# Note the extra 'stopwords' removal step that was earlier outside the model

@tf.keras.utils.register_keras_serializable()
def operations(text):
  """Standardize review text: lower-case, drop stop words, strip punctuation.

  Used as the `standardize` callable of a TextVectorization layer, so the
  whole text clean-up travels with the model.

  Args:
    text: string tensor of review text.

  Returns:
    The string tensor after lower-casing, removal of space-delimited stop
    words, and deletion of every `string.punctuation` character.
  """
  # The list is kept inside the function so the registered callable does not
  # depend on any notebook-level variable.
  stopwords = [
      "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
      "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
      "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
      "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
      "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
      "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
      "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
      "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
      "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
      "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
      "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
      "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
      "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
      "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
      "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
      "yours", "yourself", "yourselves",
  ]

  # Leading space: lets a stop word at the very beginning of the text match.
  text = tf.strings.lower(" " + text)

  # One pattern instead of one regex op per stop word: a space followed by one
  # or more "<stopword><space>" groups. Matching the whole run at once also
  # removes back-to-back repeats such as "had had", which a plain " had "
  # replacement misses because the first match consumes the shared space.
  alternatives = "|".join(re.escape(word) for word in stopwords)
  stopword_run = " (?:(?:" + alternatives + ") )+"
  text = tf.strings.regex_replace(text, stopword_run, " ")

  # Punctuation is stripped only after the stop words, so contractions such as
  # "he's" are still intact when they are matched above. A stop word glued to
  # punctuation (e.g. "for.") is therefore not removed; only its punctuation is.
  remove_regex = f'[{re.escape(string.punctuation)}]'
  text = tf.strings.regex_replace(text, remove_regex, '') #removing punctuations
  return text

In [24]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 120

# Same vectorizer settings as before, except that the text clean-up is now
# done inside the layer by the custom `operations` callable.
tokenizer_layer = TextVectorization(
    output_mode='int',
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    standardize=operations,  # now with customized text preprocessing
)

The next steps are more or less the same as before.

In [25]:
import tensorflow_datasets as tfds

# Load the splits again from scratch: the earlier `train_ds` / `valid_ds` /
# `test_ds` were overwritten with stop-word-filtered text, while this model
# does its own clean-up and needs the raw reviews.
train_ds, info = tfds.load(
    'yelp_polarity_reviews',
    split='train',
    as_supervised=True,
    with_info=True,
)
valid_ds = tfds.load(
    'yelp_polarity_reviews',
    split='test[:50%]',
    as_supervised=True,
)
test_ds = tfds.load(
    'yelp_polarity_reviews',
    split='test[50%:]',
    as_supervised=True,
)

In [26]:
# Text-only view of the training split (labels dropped) for adapting the vectorizer
train_only_text = train_ds.map(lambda review, _sentiment: review)

In [27]:
# Build the vocabulary from the training text only, fed in batches of 2048
# reviews; the custom `operations` standardizer is applied by the layer itself.
tokenizer_layer.adapt(train_only_text.batch(2048))

In [28]:
# First (text, label) pair of the freshly loaded, unfiltered training split
first_pair = next(iter(train_ds))
example1, label1 = first_pair
print("Example_text--> ", example1)
print("Label--> ", label1)

Example_text-->  tf.Tensor(b"The Groovy P. and I ventured to his old stomping grounds for lunch today.  The '5 and Diner' on 16th St and Colter left me with little to ask for.  Before coming here I had a preconceived notion that 5 & Diners were dirty and nasty. Not the case at all.\\n\\nWe walk in and let the waitress know we want to sit outside (since it's so nice and they had misters).  We get two different servers bringing us stuff (talk about service) and I ask the one waitress for recommendations.  I didn't listen to her, of course, and ordered the Southwestern Burger w/ coleslaw and started with a nice stack of rings.\\n\\nThe Onion Rings were perfectly cooked.  They looked like they were prepackaged, but they were very crispy and I could actually bite through the onion without pulling the entire thing out (don't you hate that?!!!)\\n\\nThe Southwestern Burger was order Medium Rare and was cooked accordingly.  Soft, juicy, and pink with a nice crispy browned outer layer that can 

In [29]:
def tokenize_text2(text, label):
  """Vectorize one review with `tokenizer_layer` (custom standardization).

  A trailing axis is added to `text` before the layer is called, and the
  first row of the layer's output is returned together with the unchanged
  `label`.
  """
  expanded = tf.expand_dims(text, -1)
  token_ids = tokenizer_layer(expanded)
  return token_ids[0], label

In [30]:
# Show the (token-index vector, label) pair that the custom-standardized
# vectorizer produces for the raw example above
print(tokenize_text2(example1,label1))

(<tf.Tensor: shape=(120,), dtype=int64, numpy=
array([   1, 3840, 4951,  170,    1, 3935,  117,  395,  115, 1659, 7000,
       1712,    1,  128,   38,  176,  443,  249,    1,    1,  115, 2279,
        545, 1311,    2,  715,    1,  298,  217,  200,   40,   58,  372,
        270,   86,   28, 6249,    8,   45,  153,  578, 1647,   18,  422,
        609,   12,  176,    9,  200, 2194,   24, 1776,  887,  336,   27,
       6937,  147,  990, 2790,  323,   28, 3624,    1,  731, 1498,  613,
        305,  175,    6, 6050,  688,  152,  588,  731,  234, 3695,  468,
        100,   19,  744, 7498, 6937,  147,   30,  740,  943,  305, 5579,
        820, 1208, 1674,   28,  688, 9745, 6708, 3184,   21,    1,   33,
        204,  978,    1, 1613, 1704, 3352, 2805,  486, 7115,  608,  401,
        101,  731, 1498,  231,  779,  147,    1, 2790,    7,    6])>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [31]:
# Spot-check the learned vocabulary: which word is stored at index 4951
# (the third index in the vector printed above)?
tokenizer_layer.get_vocabulary()[4951]

'ventured'

In [32]:
# Preparing the dataset to be loaded into the model
BUFFER_SIZE = 300000
BATCH_SIZE = 32

# Batch and prefetch must be chained onto the *shuffled* dataset. Batching
# `train_ds` directly would throw the shuffle away and train on the data in
# its stored order every epoch.
train_dataset = train_ds.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = valid_ds.batch(4).prefetch(tf.data.AUTOTUNE)
test_dataset = test_ds.batch(4).prefetch(tf.data.AUTOTUNE)

In [33]:
# Same architecture as the first model, but fronted by the vectorizer that
# performs the custom standardization itself.
classifier_layers = [
    tf.keras.Input(shape=(1,), dtype=tf.string),
    tokenizer_layer,
    tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64,
        input_length=MAX_SEQUENCE_LENGTH,
    ),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(classifier_layers)

In [34]:
# Binary cross-entropy goes with the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [35]:
# Train for a fixed number of epochs, validating after each one
NUM_EPOCHS = 5
history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=validation_dataset,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
# Score the model with in-layer standardization on the held-out test split;
# the result is [loss, accuracy], following the loss and metric passed to compile().
model.evaluate(test_dataset)



[0.23269431293010712, 0.929263174533844]