# Feb. 2nd- Text Classification from Scratch

Learned the basics of `TextVectorization` and `Embedding`, more on preprocessing with `tf.data.Dataset`, and got more practice with the `keras.Model` functional API.

In [1]:
import tensorflow as tf
import numpy as np

2023-02-02 17:49:25.905545: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# download data

!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  1457k      0  0:00:56  0:00:56 --:--:-- 1756k 1066k      0  0:01:16  0:00:17  0:00:59 1998k0:00:44  0:00:36  0:00:08 2351k   0  0:00:51  0:00:43  0:00:08  131k0:00:51  0:00:06  745k434k      0  0:00:57  0:00:52  0:00:05  934k


In [9]:
# remove subfolders we don't need 
!rm -r aclImdb/train/unsup

In [10]:
# Build the raw tf.data datasets: an 80/20 training/validation split of
# aclImdb/train, plus the separate aclImdb/test directory.

# The training and validation subsets are carved from the same directory, so
# both calls are given the same split fraction and the same seed.
split_options = dict(validation_split=0.2, seed=1337)

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train", subset="training", **split_options
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train", subset="validation", **split_options
)
raw_test_ds = tf.keras.utils.text_dataset_from_directory("aclImdb/test")

# cardinality() counts batches, not individual reviews.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2023-02-02 17:57:33.346658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


In [12]:
# Print the first five reviews and their labels from a single batch.
# take(1) yields a dataset containing just one batch.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once rather than on every iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(5):
        print(texts[i])
        print(labels[i])
# Observation: the raw reviews contain HTML <br /> tags.

b'I am very disappointed with "K-911." The original "good" quality of "K-9" doesn\'t exist any more. This is more like a sitcom! Some of casts from original movie returned and got some of my memory back. The captain of Dooley now loves to hit him like a scene from old comedy show. That was crazy. What\'s the deal with the change of Police? It seems like they are now LAPD! Not San Diego PD. It is a completely different movie from "'
0
b"Giallo fans, seek out this rare film. It is well written, and full of all sorts of the usual low lifes that populate these films. I don't want to give anything away, so I wont even say anything about the plot. The whole movie creates a very bizarre atmosphere, and you don't know what to expect or who to suspect. Recommended! The only place I've seen to get this film in english is from European Trash Cinema, for $15."
1
1
b"We expected something great when we went to see this bomb. It is basically a Broadway play put on film. The music is plain terrible. 

In [14]:
from tensorflow.keras.layers import TextVectorization
import string
import re


def custom_standardization(input_data):
    """Normalize raw review text before it is tokenized.

    Applies three steps in order: lowercase the input, replace every literal
    "<br />" with a single space, then delete every character listed in
    ``string.punctuation``.

    Args:
        input_data: the text to clean; passed straight to ``tf.strings.lower``
            (presumably a string tensor supplied by TextVectorization --
            TODO confirm).

    Returns:
        The cleaned text, as returned by the final ``tf.strings.regex_replace``.
    """
    # Character class matching any single punctuation character.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_class, "")


embedding_dim = 128
sequence_length = 500

# TextVectorization turns each review into a sequence of integer token ids.
# Our custom_standardization function is applied to the text first.
vectorize_layer = TextVectorization(
    # Maximum vocabulary size.
    max_tokens=20000,
    standardize=custom_standardization,
    # Emit integer token ids.
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Drop the labels: the vocabulary is built from the review text alone.
text_ds = raw_train_ds.map(lambda text, label: text)

# Build the vocabulary from the training text.
vectorize_layer.adapt(text_ds)

In [15]:
def vectorize_text(text, label):
    """Turn the text of one dataset element into vectorized form.

    Args:
        text: the text component of a dataset element; a trailing axis is
            appended before it is passed to ``vectorize_layer``.
        label: the matching label, returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize the data.
# num_parallel_calls lets tf.data vectorize several batches concurrently, and
# prefetch prepares the next batches while the model is busy with the current
# one. Neither changes the elements or their order; they only keep the
# training loop from waiting on text preprocessing.
train_ds = raw_train_ds.map(
    vectorize_text, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
val_ds = raw_val_ds.map(
    vectorize_text, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
test_ds = raw_test_ds.map(
    vectorize_text, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [17]:
from tensorflow.keras import layers

# Size of the embedding table; matches the max_tokens=20000 given to the
# TextVectorization layer so that every token id has an embedding row.
max_features = 20000
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map each vocabulary index to a dense vector, then apply dropout.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Two identically configured strided Conv1D layers, then global max pooling.
conv_kwargs = dict(padding="valid", activation="relu", strides=3)
x = layers.Conv1D(128, 7, **conv_kwargs)(x)
x = layers.Conv1D(128, 7, **conv_kwargs)(x)
x = layers.GlobalMaxPooling1D()(x)

# Plain fully connected hidden layer with dropout.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Single sigmoid unit producing the binary prediction.
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

# Assemble the graph with the functional API.
model = tf.keras.Model(inputs=inputs, outputs=predictions)

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [18]:
epochs = 3

# Fit the model on the training dataset, using the validation dataset
# (not the test dataset) for validation.
model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19ebf0820>

In [19]:
# Evaluate the trained model on the test dataset. Given the compile() call
# (binary cross-entropy loss plus the accuracy metric), the displayed result
# is [loss, accuracy].
model.evaluate(test_ds)



[0.44944557547569275, 0.8636400103569031]