In [None]:
import re
import numpy as np
import tensorflow as tf



In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyperparameters shared by data loading, model construction and training.
VOCAB_SIZE = 20_000  # num_words for imdb.load_data and input_dim of the Embedding
MAX_LEN = 250        # maxlen handed to pad_sequences
EMBED_DIM = 32       # output_dim of the Embedding layer
LSTM_UNITS = 32      # units of the LSTM layer
BATCH_SIZE = 64      # batch_size used by model.fit
EPOCHS = 3           # epochs used by model.fit


In [None]:
def load_imdb():
    """Load the IMDB dataset and pad every review to ``MAX_LEN`` entries.

    The data is fetched with ``imdb.load_data(num_words=VOCAB_SIZE)``; the
    review sequences of both splits are then passed through ``pad_sequences``.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where each ``x`` is the
        ``pad_sequences`` output for that split and each ``y`` is returned
        exactly as ``imdb.load_data`` produced it.
    """
    train_split, test_split = imdb.load_data(num_words=VOCAB_SIZE)
    train_reviews, train_labels = train_split
    test_reviews, test_labels = test_split

    train_padded = pad_sequences(train_reviews, maxlen=MAX_LEN)
    test_padded = pad_sequences(test_reviews, maxlen=MAX_LEN)
    return (train_padded, train_labels), (test_padded, test_labels)



In [None]:
def build_model():
    """Build and compile the Embedding -> LSTM -> Dense sentiment classifier.

    The network embeds integer token ids (``VOCAB_SIZE`` rows, ``EMBED_DIM``
    columns), runs them through a single LSTM with ``LSTM_UNITS`` units and
    finishes with one sigmoid unit. It is compiled with binary cross-entropy
    loss, the Adam optimizer and an accuracy metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # Declare the sequence length on an explicit Input instead of passing
        # ``input_length`` to Embedding: that argument is deprecated in Keras 3
        # and only triggers a warning there.
        tf.keras.Input(shape=(MAX_LEN,), dtype="int32"),
        tf.keras.layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=EMBED_DIM,
        ),
        tf.keras.layers.LSTM(LSTM_UNITS),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return model

In [None]:
def get_word_index():
    """Return the IMDB word index, loading it only on the first call.

    ``imdb.get_word_index()`` is invoked once; its result is remembered on the
    function object so that repeated callers (for example one call per encoded
    sentence) do not reload the mapping every time.

    Returns:
        The mapping returned by ``imdb.get_word_index()``. The same object is
        handed back on every call, so callers should treat it as read-only.
    """
    cached = getattr(get_word_index, "_cache", None)
    if cached is None:
        cached = imdb.get_word_index()
        get_word_index._cache = cached
    return cached

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)
model = build_model()

In [None]:
# Fetch the padded train/test splits produced by load_imdb().
(x_train, y_train), (x_test, y_test) = load_imdb()

In [None]:
# Train on the training split; validation_split=0.2 reserves a fifth of it
# for the per-epoch validation metrics.
model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 117ms/step - accuracy: 0.6711 - loss: 0.5804 - val_accuracy: 0.8558 - val_loss: 0.3431
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 113ms/step - accuracy: 0.9036 - loss: 0.2541 - val_accuracy: 0.8774 - val_loss: 0.3062
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 112ms/step - accuracy: 0.9348 - loss: 0.1782 - val_accuracy: 0.8680 - val_loss: 0.3242


<keras.src.callbacks.history.History at 0x7d827ba20920>

In [None]:
# Score the trained model on the test split returned by load_imdb().
loss, acc = model.evaluate(x_test, y_test)


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - accuracy: 0.8613 - loss: 0.3444


In [None]:
def clean_text(text: str) -> str:
    """Normalize raw text before tokenization.

    The text is lower-cased, every character other than ``a-z``, ``0-9``,
    whitespace and the apostrophe is replaced by a space, and runs of
    whitespace are then collapsed to a single space with the ends trimmed.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_symbols = re.sub(r"[^a-z0-9\s']", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_symbols)
    return collapsed.strip()

In [None]:
def encode_text(text, word_index=None):
    """Encode raw text into a padded id sequence suitable for the trained model.

    The text is normalized with ``clean_text`` and split on whitespace. The
    sequence starts with id 1, unknown words become id 2 and known words are
    mapped to ``word_index[word] + 3``, mirroring the default encoding of
    ``imdb.load_data`` (start marker 1, out-of-vocabulary marker 2, offset 3).

    Args:
        text: Raw input text.
        word_index: Optional word -> rank mapping. When omitted it is obtained
            from ``get_word_index()``.

    Returns:
        The result of ``pad_sequences`` applied to the single encoded sequence
        with ``maxlen=MAX_LEN``.
    """
    START_ID, OOV_ID, INDEX_OFFSET = 1, 2, 3

    if word_index is None:
        word_index = get_word_index()

    encoded = [START_ID]
    for token in clean_text(text).split():
        rank = word_index.get(token)
        token_id = OOV_ID if rank is None else rank + INDEX_OFFSET
        # The training data was loaded with num_words=VOCAB_SIZE and the
        # Embedding has input_dim=VOCAB_SIZE, so an id at or above VOCAB_SIZE
        # was never seen in training and would index past the embedding table.
        # Map such rare words to the out-of-vocabulary id instead.
        if token_id >= VOCAB_SIZE:
            token_id = OOV_ID
        encoded.append(token_id)

    return pad_sequences([encoded], maxlen=MAX_LEN)

In [None]:
# Example with mixed sentiment: encode the sentence, then score it.
text = "It was okay, not great but not bad either."
x = encode_text(text)
model.predict(x)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
['it', 'was', 'okay', 'not', 'great', 'but', 'not', 'bad', 'either']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step


array([[0.26814768]], dtype=float32)

In [None]:
# Example with clearly positive wording: encode the sentence, then score it.
text = "It was  great ."
x = encode_text(text)
model.predict(x)


['it', 'was', 'great']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


array([[0.9393951]], dtype=float32)