# Sentence Classifier using 1D CNN

In [3]:
import matplotlib.pyplot as plt
import os
import re
import string
import shutil
import tensorflow as tf
from tensorflow.keras import layers, losses, callbacks, Sequential

In [4]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1" , url,
    untar=True, cache_dir='',
    cache_subdir=''
)
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [5]:
os.listdir(dataset_dir)

['test', 'imdbEr.txt', 'imdb.vocab', 'README', 'train']

In [6]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['neg',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt',
 'unsup']

In [7]:
shutil.rmtree(os.path.join(train_dir, "unsup"))

In [8]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size,
)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2024-04-03 21:03:26.939663: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-03 21:03:26.940103: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [9]:
for x in iter(raw_test_ds):
    print(x)
    break

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'A town in Japan is being taken over by a horribly brutal abstract shape: the spiral. It\'s becoming a theme in everything from animals to clouds to people and twisting them, mentally and literally. This film shows it happening to several groups of people. Some demonic possession is implied, but nothing is entirely sure except that the best bet is to get the heck out of dodge. The film progresses really well from normal life to abnormal phenomena (giant snails and crazy people) to the truly supernatural (walking dead).<br /><br />As a jaded American horror movie fan, this was just what I needed. Maybe it was just the novelty of a different culture\'s film, but it seemed to have a very original progression, set of characters, and the premise was definitely new. The Japanese may think "horror shapes" (uzumaki means "spiral," I\'m told) are old by now, but it was nice for me because I\'m used to monster/alien/virus/disaster/undead fil

In [10]:
def custom_standardization(input_data):
    l_case = tf.strings.lower(input_data)
    stp_html = tf.strings.regex_replace(
        l_case, '<br/>', ' '
    )
    return tf.strings.regex_replace(
        stp_html,
        f'[{re.escape(string.punctuation)}]',
        ''
    )

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [11]:
max_features = 10000
seq_len = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=seq_len 
)

In [12]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)


2024-04-03 21:03:32.778431: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [13]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


In [20]:
emb_dims = 128

model = Sequential([
    layers.Embedding(
        len(vectorize_layer.get_vocabulary()), 64, mask_zero=True),
    layers.Bidirectional(
        layers.LSTM(64, return_sequences=True)
    ),
    layers.Bidirectional(
        layers.LSTM(32),
    ),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.summary()

In [21]:
model.fit(
    train_ds, 
    validation_data=val_ds,
    validation_steps=10,
    epochs=10,
    callbacks=[
        callbacks.TensorBoard(log_dir="logs/ex7"),
    ]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 198ms/step - accuracy: 0.6524 - loss: 0.5911 - val_accuracy: 0.8344 - val_loss: 0.3420
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 196ms/step - accuracy: 0.8590 - loss: 0.3208 - val_accuracy: 0.8281 - val_loss: 0.3817
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 203ms/step - accuracy: 0.9159 - loss: 0.2117 - val_accuracy: 0.8156 - val_loss: 0.3990
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 193ms/step - accuracy: 0.9431 - loss: 0.1499 - val_accuracy: 0.8719 - val_loss: 0.3425
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 292ms/step - accuracy: 0.9523 - loss: 0.1217 - val_accuracy: 0.8500 - val_loss: 0.5449
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 278ms/step - accuracy: 0.9595 - loss: 0.1029 - val_accuracy: 0.8281 - val_loss: 0.4595
Epoc

2024-04-03 21:47:29.754741: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


<keras.src.callbacks.history.History at 0x7f23a8e9bca0>

In [17]:
%load_ext tensorboard
%tensorboard --logdir logs

In [22]:
model.evaluate(test_ds)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 69ms/step - accuracy: 0.8085 - loss: 0.8425


[0.8463099598884583, 0.8087999820709229]

In [23]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"]
)


export_model.evaluate(raw_test_ds)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 59ms/step - accuracy: 0.8126 - loss: 0.8441


[0.8468451499938965, 0.8138399720191956]

In [24]:
export_model(tf.constant(['The movie is very boring', 'A Good Movie' , 'very bad worst movie',  'Worst movie, boring' ]))

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[9.1967854e-04],
       [6.6537935e-01],
       [5.3871681e-05],
       [3.1485815e-05]], dtype=float32)>

In [25]:
export_model.save('sentence_classificatoin_model.keras')