<a href="https://colab.research.google.com/github/kk412027247/nlp/blob/main/NLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install tensorflow_datasets

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing import sequence


# Load the IMDB reviews. `as_supervised=True` makes each element a
# (review text, label) pair; `with_info=True` additionally returns the dataset
# metadata, kept in `ds_info`.
imdb_train, ds_info = tfds.load(name='imdb_reviews', split='train', with_info=True, as_supervised=True)
imdb_test = tfds.load(name='imdb_reviews', split='test', as_supervised=True)


tokenizer = tfds.deprecated.text.Tokenizer()


# Reuse the vocabulary stored under the 'reviews_vocab' prefix when it exists.
# On a fresh runtime (for example a new Colab session) that file is missing and
# loading raises NotFoundError; in that case build the vocabulary from the
# training split once and save it so later runs take the fast path.
try:
  imdb_encoder = tfds.deprecated.text.TokenTextEncoder.load_from_file('reviews_vocab')
except tf.errors.NotFoundError:
  vocabulary_set = set()
  MAX_TOKENS = 0  # token count of the longest training review
  for example, _ in imdb_train:
    some_tokens = tokenizer.tokenize(example.numpy())
    MAX_TOKENS = max(MAX_TOKENS, len(some_tokens))
    vocabulary_set.update(some_tokens)

  # Sorted so that a rebuilt vocabulary assigns the same ids on every run;
  # iteration order of a set of strings can differ between interpreter sessions.
  imdb_encoder = tfds.deprecated.text.TokenTextEncoder(sorted(vocabulary_set), tokenizer=tokenizer)
  imdb_encoder.save_to_file('reviews_vocab')

# Optional sanity checks (MAX_TOKENS is only defined when the vocabulary was
# rebuilt above):
# message = imdb_encoder.decode(imdb_encoder.encode('Good case. Excellent value'))
# print(message)
# vocab_size = imdb_encoder.vocab_size
# print(vocab_size, MAX_TOKENS)


# for example, label in imdb_train.take(1):
#   print(example)
#   encoded = imdb_encoder.encode(example.numpy())
#   print('encoded', encoded)
#   print(imdb_encoder.decode(encoded))

def encode_pad_transform(sample):
  """Turn one review into a fixed-length vector of token ids.

  Args:
    sample: An eager tensor holding the review text; its `.numpy()` value is
      passed to the module-level `imdb_encoder`.

  Returns:
    A 1-D `np.int64` array of length 150. Shorter reviews are zero-padded at
    the end (`padding='post'`); longer ones are cut down to 150 ids using
    `pad_sequences`' default truncation, which drops ids from the front.
  """
  token_ids = imdb_encoder.encode(sample.numpy())
  # pad_sequences works on a batch, so wrap the single review in a list and
  # take the only row back out.
  padded_batch = sequence.pad_sequences([token_ids], maxlen=150, padding='post')
  only_row = padded_batch[0]
  return np.array(only_row, dtype=np.int64)

def encode_tf_fn(sample, label):
  """Dataset-map wrapper around `encode_pad_transform`.

  `encode_pad_transform` needs the concrete value of the review (it calls
  `.numpy()`), so it is run through `tf.py_function`.

  Args:
    sample: Tensor with the review text.
    label: Tensor with the review's label.

  Returns:
    A `(token_ids, label)` pair, where `token_ids` is the int64 tensor
    produced by `encode_pad_transform`.
  """
  token_ids = tf.py_function(func=encode_pad_transform, inp=[sample], Tout=tf.int64)
  # Declare the static shapes explicitly: a rank-1 sequence of unspecified
  # length for the ids, and a scalar for the label.
  token_ids.set_shape([None])
  label.set_shape([])
  return token_ids, label

# Debugging aid: encode a handful of training reviews and print one
# (review, label) pair together with its decoded text.
# subset = imdb_train.take(10)
# tst = subset.map(encode_tf_fn)
# for review, label in tst.take(1):
#   print('review', review)
#   print('label', label)
#   print(imdb_encoder.decode(review))

# Run every example of both splits through encode_tf_fn, so each dataset
# yields (token ids, label) pairs instead of (raw text, label) pairs.
encoded_train = imdb_train.map(encode_tf_fn)
encoded_test = imdb_test.map(encode_tf_fn)

def build_model_lstm(vocab_size, embedding_dim, rnn_unites, batch_size):
  """Assemble the review classifier: Embedding -> BiLSTM -> sigmoid unit.

  Args:
    vocab_size: Number of rows in the embedding table.
    embedding_dim: Width of each embedding vector.
    rnn_unites: Units of the LSTM that is wrapped in `Bidirectional`.
    batch_size: Batch dimension fixed in the embedding layer's
      `batch_input_shape`; the sequence dimension is left as `None`.

  Returns:
    An uncompiled `tf.keras.Sequential` model with a single sigmoid output.
  """
  # mask_zero=True makes id 0 act as padding that downstream layers skip.
  embedding = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    mask_zero=True,
    batch_input_shape=[batch_size, None],
  )
  # A plain tf.keras.layers.LSTM(rnn_unites) can be used here instead for a
  # unidirectional variant.
  recurrent = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_unites))
  classifier = tf.keras.layers.Dense(1, activation='sigmoid')
  return tf.keras.Sequential([embedding, recurrent, classifier])

# Hyper-parameters. BATCH_SIZE is used both for batching the datasets and as
# the batch dimension passed to build_model_lstm.
vocab_size = imdb_encoder.vocab_size
embedding_dim = 64
rnn_unites = 64
BATCH_SIZE = 100

model = build_model_lstm(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  rnn_unites=rnn_unites,
  batch_size=BATCH_SIZE,
)
model.summary()

# Binary sentiment target: cross-entropy loss, tracking accuracy, precision
# and recall.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy', 'Precision', 'Recall'],
)

# Train for 10 passes over the batched training split.
encoded_train_batched = encoded_train.batch(BATCH_SIZE)
model.fit(encoded_train_batched, epochs=10)

# Report loss and metrics on the held-out test split.
model.evaluate(encoded_test.batch(BATCH_SIZE))

