# Sentiment Classification of Yelp Reviews

## Loading libraries and datasets

In [None]:
import os
import re
import string
import numpy as np 
import tensorflow as tf

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Obtaining the relevant data from tensorflow_datasets
import tensorflow_datasets as tfds

# The provided test split is halved: its first 50% becomes the validation set,
# the remaining 50% stays as the held-out test set.
train_ds, info = tfds.load(
    'yelp_polarity_reviews',
    split='train',
    as_supervised=True,   # elements come back as (text, label) pairs
    with_info=True,
)
valid_ds = tfds.load('yelp_polarity_reviews', split='test[:50%]', as_supervised=True)
test_ds = tfds.load('yelp_polarity_reviews', split='test[50%:]', as_supervised=True)

In [None]:
# Peek at the first two training reviews together with their labels
for text, label in train_ds.take(2).as_numpy_iterator():
  print(f"Text - {text}")
  print(f"Label - {label}")

## Removing some common stop words from the given datasets

In [None]:
# Lower-case English stopwords (including contractions) that are stripped from the reviews.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
]

In [None]:
# Function to eliminate stopwords
def remove_stopwords(text, label):
    """Lower-case a review and strip the space-delimited stopwords from it.

    Args:
        text: string tensor holding the review text.
        label: label paired with the review; passed through untouched.

    Returns:
        A ``(text, label)`` tuple. ``text`` is lower-cased, carries one extra
        leading space, and has every run of words from the module-level
        ``stopwords`` list collapsed into a single space.
    """
    # One alternation covering the whole list, so a single regex op replaces
    # the former chain of one regex op per stopword.
    alternatives = "|".join(re.escape(word) for word in stopwords)
    # A space, then one or more stopwords each followed by a space or the end
    # of the text. Matching the whole run at once also removes back-to-back
    # repeats ("very very"), which non-overlapping per-word matches missed, and
    # the `$` branch catches a stopword that is the last word of the review.
    pattern = " (?:(?:" + alternatives + ")(?: |$))+"

    text = " " + text        # lets a stopword at the very start match the leading space
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, pattern, " ")
    return text, label

In [None]:
# Strip stopwords from every split. Letting tf.data pick the parallelism speeds up
# the pass over the corpus; element order is unchanged because map is
# deterministic by default.
train_ds = train_ds.map(remove_stopwords, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.map(remove_stopwords, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(remove_stopwords, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Same two training reviews again, now with the stopwords stripped
for text, label in train_ds.take(2).as_numpy_iterator():
  print(f"Text - {text}")
  print(f"Label - {label}")

In [None]:
# Text-only view of the training split (labels dropped), used for tokenizing
train_ds_text = train_ds.map(lambda review, _label: review)

## Tokenizing the train data using the TextVectorization method

In [None]:
VOCAB_SIZE = 10000          # Maximum vocabulary list to consider
MAX_SEQUENCE_LENGTH = 120   # sentences are truncated to this length; padding is automatic

tokenize_layer = TextVectorization(
    output_mode='int',                           # tokens are indexed with integers
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

In [None]:
# Tokenization using the train data only (DONT USE VALID/TEST DATA - DONT WANT DATA LEAKAGE)
# The text is batched first: adapting on an unbatched dataset feeds the layer one
# review per step, which is far slower over the full training split.
tokenize_layer.adapt(train_ds_text.batch(2048))

### Checking the tokenize_layer

In [None]:
# function to convert the texts into token indices
def tokenize_text(text, label):
  """Turn one (text, label) pair into (token indices, label) via ``tokenize_layer``."""
  # Add a trailing axis before calling the layer, then keep element 0 of its output.
  expanded = tf.expand_dims(text, axis=-1)
  token_ids = tokenize_layer(expanded)
  return token_ids[0], label

In [None]:
# Pull the first (text, label) pair out of the training split and show it
train_iterator = iter(train_ds)
example1, label1 = next(train_iterator)
for caption, value in (("Example_text--> ", example1), ("Label--> ", label1)):
  print(caption, value)

In [None]:
# Tokenized form of the same example (only the token indices, not the label)
example1_token_ids = tokenize_text(example1, label1)[0]
print("vectorized example text:", example1_token_ids)

In [None]:
# Checking a few of the indices to see if the mapping is as expected --->

vocabulary = tokenize_layer.get_vocabulary()   # fetched once instead of once per print
print("1 --> {}".format(vocabulary[1]))
print("3840 --> {}".format(vocabulary[3840]))  # printed label now matches the index looked up
print("Vocabulary size --> {}".format(len(vocabulary)))

In principle, one can convert all the text data into token-index vectors, as in the example above, before feeding it to the model. This can speed up training, but the token-to-index mapping then has to be saved separately for later inference, because it is specific to the trained model. To keep the model self-contained, we instead include the TextVectorization layer as part of the model itself. For anyone interested, the code that converts the data into indices before feeding it to the model is provided in the cell below, but it is commented out here.

In [None]:
# Disabled alternative pipeline, kept for reference only: everything below sits
# inside a bare triple-quoted string literal, so none of this code is executed.
'''

# Converting texts into token indices learned from the training files

int_train_ds = train_ds.map(tokenize_text)
int_valid_ds = valid_ds.map(tokenize_text)
int_test_ds = test_ds.map(tokenize_text)


for text_vec, _ in int_train_ds.take(2).as_numpy_iterator():
  print("Text - {}".format(text_vec))
  

# cache the prepared dataset in memory
int_train_ds = int_train_ds.cache()
int_valid_ds = int_valid_ds.cache()
int_test_ds = int_test_ds.cache()

# Preparing the dataset to be loaded into the model 
BUFFER_SIZE = 300000
BATCH_SIZE = 128

# for model without vectorization layer 
train_dataset = int_train_ds.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = int_valid_ds.batch(4).prefetch(tf.data.AUTOTUNE)
test_dataset = int_test_ds.batch(32).prefetch(tf.data.AUTOTUNE)


model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = VOCAB_SIZE, 
                                                       output_dim = 64, input_length = MAX_SEQUENCE_LENGTH),
                             tf.keras.layers.Conv1D(128,5,activation='relu'),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(32, activation='relu'),
                             tf.keras.layers.Dense(1,activation='sigmoid')])
'''

In [None]:
# cache the prepared (stopword-free) splits in memory
train_ds, valid_ds, test_ds = (
    split.cache() for split in (train_ds, valid_ds, test_ds)
)

In [None]:
# Preparing the dataset to be loaded into the model
BATCH_SIZE = 128
BUFFER_SIZE = 300000

# for model with vectorization layer
train_dataset = train_ds.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

validation_dataset = valid_ds.batch(4)
validation_dataset = validation_dataset.prefetch(tf.data.AUTOTUNE)

test_dataset = test_ds.batch(32)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# String input -> tokenizer -> embedding -> 1D convolution -> pooled dense head
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    tokenize_layer,
    tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64,
        input_length=MAX_SEQUENCE_LENGTH,
    ),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the model built above
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
NUM_EPOCHS = 5
# `history` keeps the per-epoch metrics that are plotted further down
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=NUM_EPOCHS,
)

In [None]:
# Evaluate the trained model on the test batches
model.evaluate(test_dataset)

In [None]:
# Save the whole model (tokenizer layer included) in TensorFlow's 'tf' format
model_name = 'sentiment_classifier_withVectorizer'
saved_model_path = "./my_models/{}".format(model_name)

model.save(saved_model_path, save_format='tf')

In [None]:
# Reload the saved model from disk and inspect it
new_model = tf.keras.models.load_model(
    "./my_models/{}".format('sentiment_classifier_withVectorizer')
)

new_model.summary()

In [None]:
# Evaluate the reloaded model on the same test batches
new_model.evaluate(test_dataset)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against the epoch index.

  Args:
    history: object whose ``.history`` mapping holds one series per metric name.
    string: metric name; the series under ``string`` and ``'val_' + string`` are drawn.
  """
  val_key = 'val_' + string
  for series_key in (string, val_key):
    plt.plot(history.history[series_key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

# One figure per tracked metric: accuracy first, then loss
for metric_name in ('accuracy', 'loss'):
  plot_graphs(history, metric_name)

Even though the accuracies are high, there is substantial overfitting after only the first 2 epochs if the stopwords are not removed (one can check this by keeping the stopwords in the splits). After removing the stopwords (which is what we do here), we still see overfitting, but now at least only towards the end of training.

When the model needs to be deployed, it is better to include the TextVectorization layer in the model itself, since it contains the tokenizer and the same indexing must be applied to any unseen data. This will significantly increase the training time, but it is probably worth it. Let's take a look.

## TextVectorization with a custom standardize

In [None]:
@tf.keras.utils.register_keras_serializable()
def operations(text):
  """Standardize text: lower-case it, drop stopwords, then strip punctuation.

  Args:
    text: string tensor of raw text.

  Returns:
    A string tensor that is lower-cased, has every run of space-delimited
    stopwords collapsed to a single space, and has all characters in
    ``string.punctuation`` removed. One extra leading space is kept.
  """
  # The list lives inside the function so the function does not depend on any
  # module-level state.
  stopwords = [
      "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
      "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
      "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
      "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
      "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
      "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
      "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
      "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
      "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
      "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
      "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
      "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
      "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
      "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
      "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
      "yours", "yourself", "yourselves",
  ]
  # One alternation for the whole list: a single regex op instead of one per word.
  alternatives = "|".join(re.escape(word) for word in stopwords)
  # A space, then one or more stopwords each followed by a space or the end of
  # the text. Matching the whole run also removes back-to-back repeats
  # ("very very") and a stopword that is the final word, both of which the
  # per-word " word " replacement left behind.
  stopword_run = " (?:(?:" + alternatives + ")(?: |$))+"

  text = " " + text   # lets a stopword at the very start match the leading space
  text = tf.strings.lower(text)
  text = tf.strings.regex_replace(text, stopword_run, " ")
  # Punctuation is stripped only after stopword removal, so contractions such as
  # "i'm" are still matched with their apostrophe intact.
  remove_regex = f'[{re.escape(string.punctuation)}]'
  text = tf.strings.regex_replace(text, remove_regex, '')
  return text

In [None]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 120

# Same tokenizer settings as before, but standardization is delegated to `operations`
tokenizer_layer = TextVectorization(
    standardize=operations,
    output_mode='int',
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

In [None]:
import tensorflow_datasets as tfds

# Reload the raw splits (stopwords intact); first half of 'test' is validation,
# second half is the held-out test set.
train_ds, info = tfds.load(
    'yelp_polarity_reviews',
    split='train',
    as_supervised=True,
    with_info=True,
)
valid_ds = tfds.load('yelp_polarity_reviews', split='test[:50%]', as_supervised=True)
test_ds = tfds.load('yelp_polarity_reviews', split='test[50%:]', as_supervised=True)

In [None]:
# Text-only view of the training split (labels dropped)
train_only_text = train_ds.map(lambda review, _label: review)

In [None]:
# Fit the tokenizer on the training text only, fed in batches of 2048 reviews
tokenizer_layer.adapt(train_only_text.batch(2048))

In [None]:
# Pull the first (text, label) pair out of the reloaded training split and show it
train_iterator = iter(train_ds)
example1, label1 = next(train_iterator)
for caption, value in (("Example_text--> ", example1), ("Label--> ", label1)):
  print(caption, value)

In [None]:
def tokenize_text2(text, label):
  """Turn one (text, label) pair into (token indices, label) via ``tokenizer_layer``."""
  # Add a trailing axis before calling the layer, then keep element 0 of its output.
  expanded = tf.expand_dims(text, axis=-1)
  token_ids = tokenizer_layer(expanded)
  return token_ids[0], label

In [None]:
# Show the (token indices, label) pair produced for the example above
print(tokenize_text2(example1,label1))

In [None]:
# Look up the vocabulary entry stored at index 4951
tokenizer_layer.get_vocabulary()[4951]

In [None]:
# Preparing the dataset to be loaded into the model
BUFFER_SIZE = 300000
BATCH_SIZE = 32

# batch/prefetch must be chained onto the *shuffled* dataset: batching `train_ds`
# directly discards the shuffle, so every epoch would see the data in file order.
train_dataset = train_ds.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
validation_dataset = valid_ds.batch(4).prefetch(tf.data.AUTOTUNE)
test_dataset = test_ds.batch(4).prefetch(tf.data.AUTOTUNE)

In [None]:
# Same architecture as before, but the tokenizer now uses the custom standardization
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    tokenizer_layer,
    tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64,
        input_length=MAX_SEQUENCE_LENGTH,
    ),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy matches the single sigmoid output unit
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
NUM_EPOCHS = 5
# `history` keeps the per-epoch training and validation metrics
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=NUM_EPOCHS,
)

In [None]:
# Evaluate the trained model on the test batches
model.evaluate(test_dataset)