# Sentence Classifier using 1D CNN

In [1]:
import matplotlib.pyplot as plt
import os
import re
import string
import shutil
import tensorflow as tf
from tensorflow.keras import layers, losses, callbacks, Sequential

2024-04-16 18:38:13.205253: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1" , url,
    untar=True, cache_dir='',
    cache_subdir=''
)
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 1us/step


In [3]:
os.listdir(dataset_dir)

['test', 'train', 'imdbEr.txt', 'imdb.vocab', 'README']

In [4]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'labeledBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [5]:
shutil.rmtree(os.path.join(train_dir, "unsup"))

In [6]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size,
)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2024-04-16 18:39:21.631096: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-16 18:39:21.631392: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [7]:
for x in iter(raw_test_ds):
    print(x)
    break

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b"Unlike some of the former commentators, I was (and am) an avid fan of the Carpenters. Face it, Christmas would never be Christmas without The Carpenters. That said, I believe the movie did a good, not excellent, job at depicting Karen's life. The movie was enjoyable to see on primetime TV, but the content fell a little short. I suggest that you all look into getting some of the Carpenters specials that were shown in the 70's. You cannot believe how awesome a drummer Karen was. Cynthia did not capture the extent of Karen's talent. Also, Karen was beautiful but had a bad hairdresser. My choice for playing Karen is Hilary Swank. I would love to see a more substantive story, because there was more to Karen than meets the eyes when listening to We've Only Just Begun. I have tons of unreleased Carpenters' music, and it is absolutely excellent. (Her singing of California Dreamin is to die for).",
       b'Jimmy Stewart was a real life pi

In [8]:
def custom_standardization(input_data):
    l_case = tf.strings.lower(input_data)
    stp_html = tf.strings.regex_replace(
        l_case, '<br/>', ' '
    )
    return tf.strings.regex_replace(
        stp_html,
        f'[{re.escape(string.punctuation)}]',
        ''
    )

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [9]:
max_features = 10000
seq_len = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=seq_len 
)

In [10]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)


2024-04-16 18:39:25.634845: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


In [12]:
emb_dims = 128

model = Sequential([
    layers.Embedding(max_features+1, emb_dims),
    layers.Conv1D(16, 5, activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.summary()

In [13]:
model.fit(
    train_ds, 
    validation_data=val_ds,
    epochs=10,
    callbacks=[
        callbacks.TensorBoard(log_dir="logs/1dcnn"),
    ]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 26ms/step - accuracy: 0.6505 - loss: 0.5562 - val_accuracy: 0.8612 - val_loss: 0.3060
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.8857 - loss: 0.2763 - val_accuracy: 0.8700 - val_loss: 0.2870
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.9180 - loss: 0.2116 - val_accuracy: 0.8708 - val_loss: 0.2980
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 24ms/step - accuracy: 0.9396 - loss: 0.1630 - val_accuracy: 0.8768 - val_loss: 0.3330
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 25ms/step - accuracy: 0.9521 - loss: 0.1295 - val_accuracy: 0.8674 - val_loss: 0.3557
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 25ms/step - accuracy: 0.9648 - loss: 0.0981 - val_accuracy: 0.8618 - val_loss: 0.4117
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x7f7af6931c40>

In [14]:
%load_ext tensorboard
%tensorboard --logdir logs

In [15]:
model.evaluate(test_ds)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8314 - loss: 0.6873


[0.6865408420562744, 0.8310400247573853]

In [16]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"]
)


export_model.evaluate(raw_test_ds)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8355 - loss: 0.6830


[0.6869987845420837, 0.8350399732589722]

In [28]:
reviews = tf.constant(['The movie is very boring', 'A Good Movie' , 'very bad worst movie',  'Worst movie, boring' ])
print("## Inference")
res = export_model(reviews)
for review, val in zip(reviews, res):
    review = review.numpy().decode()
    val = val.numpy().squeeze()
    print(f"{review:<30}:{val:>.3f}")

## Inference
The movie is very boring      :0.059
A Good Movie                  :0.634
very bad worst movie          :0.018
Worst movie, boring           :0.116


In [None]:
export_model.save('sentence_classificatoin_model.keras')