In [1]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout

In [2]:
def prepareData(dir):
  """Load a (text, label) dataset from `dir` and swap '<br />' tags for spaces.

  Args:
    dir: Directory handed unchanged to `text_dataset_from_directory`.

  Returns:
    The loaded dataset, mapped so that each element is
    (text with '<br />' replaced by ' ', label).
  """
  def replace_line_breaks(text, label):
    # Only the text is rewritten; the label passes through untouched.
    return regex_replace(text, '<br />', ' '), label

  dataset = text_dataset_from_directory(dir)
  return dataset.map(replace_line_breaks)

# Build the training pipeline first, then the test pipeline.
train_data, test_data = map(prepareData, ('./train', './test'))

# Peek at the first review of a single training batch, followed by its label.
for text_batch, label_batch in train_data.take(1):
  print(
    text_batch.numpy()[0],
    label_batch.numpy()[0],  # 0 = negative, 1 = positive
    sep='\n',
  )

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
b'I got to see this film at a preview and was dazzled by it. It\'s not the typical romantic comedy. I can\'t remember laughing so hard at a film and yet being moved by it. The laughs aren\'t gags here--they\'re observations, laughs of recognition, little shocks of "Oh, my God, I thought I was the only one who felt that way!" I won\'t give away the plot, which is more than just "Guy falls in love with his brother\'s girlfriend." The whole family plays a part in the relationship here. Probably the best blend of laughter and warmth since "While You Were Sleeping."   Steve Carell goes much deeper than he\'s gone before, and for the first time I really liked him. The cast is amazing, a list of veteran theater actors whom I\'ve loved in other roles, but they blend to make a convincing family. Dianne Wiest is lovely as the mother, Juliette Binoche is luminous and hilarious (who knew she was funny?), and even t

In [3]:
model = Sequential()

# 1. INPUT
# Each sample is a single string.
text_input = Input(shape=(1,), dtype="string")
model.add(text_input)

In [4]:
# TEXT VECTORIZATION
max_tokens = 1000  # vocabulary size cap handed to the vectorizer
max_len = 100      # length of the integer sequence produced per review
vectorize_layer = TextVectorization(
  max_tokens=max_tokens,
  output_mode="int",
  output_sequence_length=max_len,
)

# Fit the vocabulary on the review texts only; labels are dropped.
train_texts = train_data.map(lambda review, _label: review)
vectorize_layer.adapt(train_texts)

model.add(vectorize_layer)

In [5]:
# EMBEDDING
# Look-up table with max_tokens + 1 rows of 128-dimensional vectors.
model.add(Embedding(input_dim=max_tokens + 1, output_dim=128))

In [6]:
# RECURRENT LAYER
# A single LSTM with 64 units.
model.add(LSTM(units=64))

In [7]:
# DENSE HIDDEN LAYER
# Fully connected layer of 64 ReLU units.
model.add(Dense(units=64, activation="relu"))

In [8]:
# OUTPUT
# One sigmoid unit: a single score between 0 and 1 per review.
model.add(Dense(units=1, activation="sigmoid"))

In [9]:
# Display the last entry of model.layers, i.e. the most recently added layer.
model.layers[-1]

<tensorflow.python.keras.layers.core.Dense at 0x642cd23d0>

In [10]:
# Compile and train the model
model.compile(
  optimizer="adam",
  loss="binary_crossentropy",
  metrics=["accuracy"],
)
model.fit(train_data, epochs=10)

# Write the trained weights under the 'rnn' prefix, then read them back.
model.save_weights('rnn')
model.load_weights('rnn')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x642ce0fd0>

In [11]:
# Print the per-layer overview of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 128)          128128    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 181,761
Trainable params: 181,761
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Score the model on the test dataset; the result lists the loss followed by
# the "accuracy" metric requested in model.compile.
model.evaluate(test_data)



[0.5209300518035889, 0.7858399748802185]

In [36]:
# Score one clearly positive and one clearly negative review, one call each.
for review in (
  "i loved it! highly recommend it to everyone looking for a great movie to watch.",
  "this was awful and stupid! i hated it so much, it was just the absolute worst.",
):
  print(model.predict([review]))

[[0.986112]]
[[0.04109952]]


In [37]:
# Two mixed-sentiment sentences that differ only in "and" versus "but".
for review in (
  "this movie is awful and i love it.",
  "this movie is awful but i love it.",
):
  print(model.predict([review]))

[[0.15412727]]
[[0.45177472]]
