# Sentiment analysis from scratch
Using modern NLP techniques on the IMDB movie-review dataset to classify review text as expressing positive or negative sentiment.

### Setup

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import utils, optimizers, layers, models

### Loading the data

In [2]:
# Download the IMDB sentiment archive (~80 MB) into the working directory;
# `-O` keeps the remote file name. Then unpack it, which creates ./aclImdb.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  32.5M      0  0:00:02  0:00:02 --:--:-- 32.5M


In [3]:
# Inspect what the archive put under the training split.
!ls aclImdb/train

labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [4]:
# Inspect what the archive put under the test split.
!ls aclImdb/test

labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt


In [5]:
# Remove the `unsup` folder so that `pos` and `neg` are the only
# sub-directories left under aclImdb/train before the datasets are built.
!rm -r aclImdb/train/unsup

In [6]:
# Creating `tf.data.Dataset` instances from the directory structure

BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2  # fraction of aclImdb/train held out for validation
SPLIT_SEED = 42069      # shared by the training and validation subsets

# The "training" and "validation" subsets are two views of one split of
# aclImdb/train. Keras needs the same `validation_split` and `seed` in both
# calls to keep the subsets from overlapping, so those arguments are defined
# once here instead of being repeated as literals.
train_val_kwargs = dict(
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SPLIT_SEED,
)

raw_train_ds = utils.text_dataset_from_directory(
    "aclImdb/train", subset="training", **train_val_kwargs
)

raw_val_ds = utils.text_dataset_from_directory(
    "aclImdb/train", subset="validation", **train_val_kwargs
)

# The test reviews live in their own directory and are loaded without a split.
raw_test_ds = utils.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=BATCH_SIZE
)

# cardinality() is the number of batches, not the number of reviews.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


In [7]:
# Peek at the first three raw (review, label) pairs of one training batch,
# i.e. the text exactly as it is before any standardization or tokenization.
for text, label in raw_train_ds.take(1):
  reviews = text.numpy()      # convert the batch once instead of per example
  sentiments = label.numpy()
  for i in range(3):
    print(reviews[i])
    print(sentiments[i])
    print("-------")

b'Well, maybe not immediately before the Rodney King riots, but even a few months before was timely enough. My parents said that they saw it and the next thing you know, the police got acquitted and LA got burned to the ground. It just goes to show the state of race relations in America. The plot has white Mack (Kevin Kline) and African-American Simon (Danny Glover) becoming friends after Simon saves Mack\'s life in the black ghetto. Meanwhile, movie producer Davis (Steve Martin in a serious role) thinks that gratuitous violence is really cool...until he gets shot. There\'s also some existentialism in the movie: Mack and his family come to realize that they aren\'t living as they really want.<br /><br />It seems that "Crash" has somewhat renewed people\'s interest in race relations, but this one came out much earlier. Maybe we\'ll never be able to have stable race relations in this country. But either way, "Grand Canyon" is a great movie. It affirms Kevin Kline as my favorite actor. Al

### Preparing data
Cleaning and standardizing data

In [8]:
import string
import re

In [10]:
def standardization(input_data):
  """Normalize raw review text before it is tokenized.

  Three steps are applied in order: lower-casing, replacing each literal
  `<br />` tag with a single space, and deleting every character listed in
  `string.punctuation`.

  Args:
    input_data: string tensor accepted by the `tf.strings` ops.

  Returns:
    The result of the final `tf.strings.regex_replace` call.
  """
  # Character class matching any one punctuation character; re.escape keeps
  # characters such as `]` and `\` literal inside the brackets.
  punctuation_class = f"[{re.escape(string.punctuation)}]"

  cleaned = tf.strings.lower(input_data)
  cleaned = tf.strings.regex_replace(cleaned, "<br />", " ")
  return tf.strings.regex_replace(cleaned, punctuation_class, "")

# Text-pipeline hyper-parameters
MAX_FEATURES = 20000  # passed to the vectorizer as `max_tokens`
EMBED_DIM = 128       # embedding width; not consumed by the vectorizer itself
SEQ_LENGTH = 500      # passed to the vectorizer as `output_sequence_length`

# Turns strings into integer token ids, cleaning them with `standardization`.
vectorize_layer = layers.TextVectorization(
    max_tokens=MAX_FEATURES,
    standardize=standardization,
    output_sequence_length=SEQ_LENGTH,
    output_mode="int",
)

# adapt() is fed the review strings only, so the labels are dropped first.
text_ds = raw_train_ds.map(lambda review, _label: review)
vectorize_layer.adapt(text_ds)

In [11]:
def vectorize_text(text, label):
  """Map one (text, label) element to (token ids, label).

  A trailing axis is appended to `text` before it is handed to
  `vectorize_layer`; `label` is passed through untouched.
  """
  text_column = tf.expand_dims(text, axis=-1)
  token_ids = vectorize_layer(text_column)
  return token_ids, label

def _vectorize_and_cache(raw_ds):
  """Tokenize a raw (text, label) dataset, then cache and prefetch it."""
  # Prefetching only overlaps input preparation with model execution; it does
  # not change the elements produced. AUTOTUNE lets tf.data pick the buffer
  # size at runtime instead of a hard-coded single batch.
  return (
      raw_ds
      .map(vectorize_text)
      .cache()
      .prefetch(buffer_size=tf.data.AUTOTUNE)
  )

# Same pipeline for every split, so it is defined once above.
train_ds = _vectorize_and_cache(raw_train_ds)
val_ds = _vectorize_and_cache(raw_val_ds)
test_ds = _vectorize_and_cache(raw_test_ds)

### Building the model

In [13]:
# Variable-length sequences of integer token ids.
inputs = layers.Input(shape=(None,), dtype="int64")

# Hidden layers, listed in the order they are applied to the input.
hidden_stack = [
    layers.Embedding(MAX_FEATURES, EMBED_DIM),
    layers.Dropout(0.5),
    # Two strided 1D convolutions followed by a global max over the time axis.
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
]

x = inputs
for layer in hidden_stack:
  x = layer(x)

# Single sigmoid unit: one score in [0, 1] per review.
output = layers.Dense(1, activation="sigmoid", name="output_layer")(x)
model = models.Model(inputs, output, name="model_0_conv1d")

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

### Training the model

In [15]:
# Number of passes over the training dataset.
EPOCHS = 3

# Kept as the cell's last expression so the returned History object is shown.
model.fit(
    x=train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fdf6041d150>

### Evaluating on test data

In [16]:
# Score the trained model on the held-out test split. The result is a list of
# [loss, accuracy], matching the loss and metric given to model.compile().
model.evaluate(x=test_ds)



[0.4010775685310364, 0.8628399968147278]

### Saving model to disk

In [17]:
import pickle

# The TextVectorization layer is not one of the layers inside `model`, so its
# config and weights are written to a separate pickle file.
vectorizer_state = {
    'config': vectorize_layer.get_config(),
    'weights': vectorize_layer.get_weights(),
}

# `with` closes (and therefore flushes) the file even if pickling fails; the
# handle used to be opened inline and was never closed.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open('model_0_conv1dvectorization.pkl', 'wb') as vectorizer_file:
  pickle.dump(vectorizer_state, vectorizer_file)

print("[INFO] Vectorizer saved")

# Persist the trained Keras model itself in HDF5 format.
model.save("model_0_conv1d.h5")

[INFO] Vectorizer saved
