# Import necessary libraries

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

# Vocabulary cap: only the 10,000 most frequent words are kept
vocab_size = 10000
# Handle to the IMDB dataset loader bundled with Keras
imdb = tf.keras.datasets.imdb

# load_data yields two (features, labels) pairs: the train and the test split
train_split, test_split = imdb.load_data(num_words=vocab_size)
X_train, y_train = train_split
X_test, y_test = test_split

# Function to preprocess text data (optional for more advanced preprocessing)
def preprocess_text(texts):
    """Clean a collection of tokenised texts into plain alphabetic strings.

    Args:
        texts: Iterable of token sequences. Each token is converted with
            ``str`` before the tokens of one text are joined by spaces.

    Returns:
        A list with one string per input text. Every non-alphabetic
        character is treated as a separator, and the remaining words are
        joined by single spaces (no leading or trailing whitespace).
    """
    cleaned = []
    for text in texts:
        joined = ' '.join(str(token) for token in text)
        # Replace non-alphabet characters with spaces
        letters_only = re.sub(r'[^a-zA-Z]', ' ', joined)
        # Split into words before joining: joining the string directly would
        # iterate it character by character and space out every letter.
        cleaned.append(' '.join(letters_only.split()))
    return cleaned

# Decode reviews for interpretability (optional)
word_index = imdb.get_word_index()
# Invert the word -> index mapping so an index can be looked up as a word
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of integer word indices back into a readable string.

    Each index is shifted down by 3 before being looked up in
    ``reverse_word_index`` (presumably to skip reserved low indices -- confirm
    against the dataset loader); indices with no entry are rendered as '?'.
    The resulting words are joined with single spaces.
    """
    words = []
    for index in text:
        words.append(reverse_word_index.get(index - 3, '?'))
    return ' '.join(words)

# Show one decoded training review to illustrate what the dataset looks like
first_review = X_train[0]
print(decode_review(first_review))

# Every review is padded or cut to exactly this many tokens
maxlen = 500

# Shared settings: both padding and truncation happen at the end of a sequence
pad_options = {'maxlen': maxlen, 'padding': 'post', 'truncating': 'post'}
X_train_padded = pad_sequences(X_train, **pad_options)
X_test_padded = pad_sequences(X_test, **pad_options)

# Build the RNN model using LSTM layers
model = Sequential()
# Declare the input shape explicitly so the model is built before summary()
# is called; the Embedding `input_length` argument is deprecated in Keras 3.
model.add(tf.keras.Input(shape=(maxlen,)))
# Embedding layer. The reviews are post-padded with zeros, so mask_zero=True
# is needed for the LSTM to skip the padded timesteps; without it the final
# LSTM state is computed after a long run of padding and the model stays at
# chance-level accuracy.
model.add(Embedding(vocab_size, 128, mask_zero=True))
model.add(LSTM(128, return_sequences=False))  # LSTM layer; only the last state is passed on
model.add(Dropout(0.5))  # Dropout for regularization to prevent overfitting
model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification

# Compile the model with appropriate loss function and optimizer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary to understand the structure
model.summary()

# Hold out 20% of the padded training data for validation (fixed seed 42)
split = train_test_split(
    X_train_padded,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split

# Fit on the training portion for 10 full passes over the data, reporting
# validation metrics after each epoch
epochs = 10
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=epochs,
    verbose=1,
)

# Measure loss and accuracy on the padded test set
test_metrics = model.evaluate(X_test_padded, y_test, verbose=1)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc}")

# Persist the trained model to disk
model.save("imdb_rnn_model.h5")

# Visualise how the training and validation metrics evolved per epoch
# (optional, for better understanding of the training process)
import matplotlib.pyplot as plt

# One row per figure:
# (train key, validation key, train label, validation label, title, y-axis label)
curve_specs = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Model Accuracy over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Model Loss over Epochs', 'Loss'),
)

for train_key, val_key, train_label, val_label, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


17464789/17464789 ━━━━━━━━━━━━━━━━━━━━ 2s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1641221/1641221 ━━━━━━━━━━━━━━━━━━━━ 1s 1us/step
? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? o



Epoch 1/10
625/625 ━━━━━━━━━━━━━━━━━━━━ 627s 999ms/step - accuracy: 0.4936 - loss: 0.6944 - val_accuracy: 0.5114 - val_loss: 0.6929
Epoch 2/10
218/625 ━━━━━━━━━━━━━━━━━━━━ 6:16 925ms/step - accuracy: 0.4991 - loss: 0.6934