# Movie Sentiment Analyzer

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources that clean_text relies on: the English stop-word
# list and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

2025-04-21 20:29:32.993393: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745267373.409190      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745267373.530738      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Text cleaning function
# Patterns are compiled once here rather than looked up on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
# Lazily-filled cache for the NLTK stop-word set and lemmatizer. Building them
# per call means re-reading the stop-word corpus for every single review.
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text):
    """Normalize a raw review into a space-separated string of lemmatized words.

    Steps, in order: strip HTML-like tags, drop every character that is not an
    ASCII letter or whitespace, lowercase, split on whitespace, remove English
    stop words, and lemmatize each remaining word.

    Args:
        text: The raw review string. ``None`` and float NaN (the missing-value
            markers pandas produces) are treated as an empty review.

    Returns:
        The cleaned text as a single string; ``''`` when nothing survives
        cleaning or when the input is missing.
    """
    # Missing values would otherwise make the regex substitution raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Load the NLTK resources on first real use only, then reuse them.
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    stop_words = _CLEAN_TEXT_RESOURCES['stop_words']
    lemmatizer = _CLEAN_TEXT_RESOURCES['lemmatizer']

    text = _HTML_TAG_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    text = text.lower()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [3]:
# Load and preprocess data
DATA_PATH = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
SENTIMENT_LABELS = {'positive': 1, 'negative': 0}

df = pd.read_csv(DATA_PATH)
df['cleaned_review'] = df['review'].apply(clean_text)

# Series.map yields NaN for any value missing from the mapping, so an
# unexpected or misspelled label would silently become a NaN training target.
# Fail loudly here instead.
mapped_sentiment = df['sentiment'].map(SENTIMENT_LABELS)
unmapped_rows = mapped_sentiment.isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels in dataset: {unexpected}")
df['sentiment'] = mapped_sentiment.astype('int64')

In [4]:
# Preview the first five rows to sanity-check the cleaned text and the labels
df.head(n=5)

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


In [5]:
# Split data: hold out 20% of the reviews for testing, with a fixed seed
review_texts = df['cleaned_review']
sentiment_targets = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    sentiment_targets,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Tokenization: the vocabulary is fitted on the training reviews only
vocab_limit = 10000
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_limit)
tokenizer.fit_on_texts(X_train)

# Persist the fitted tokenizer to disk
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Padding
max_len = 200


def _encode_and_pad(texts):
    """Convert texts to integer sequences, then pad/truncate at the end to max_len."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


X_train_pad = _encode_and_pad(X_train)
X_test_pad = _encode_and_pad(X_test)

In [8]:
# Enhanced LSTM Model
# Architecture: Embedding -> BatchNorm -> one wide LSTM -> BatchNorm -> sigmoid.
# No Dropout or hidden Dense layers are used.
vocab_size = 10000
embedding_dim = 128
lstm_units = 1024
l2_strength = 0.01

model = Sequential()
# NOTE(review): the Embedding does not set mask_zero, so padded positions are
# fed to the LSTM like real tokens -- TODO confirm whether masking is wanted.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(BatchNormalization())
model.add(LSTM(lstm_units, kernel_regularizer=l2(l2_strength)))
model.add(BatchNormalization())
# Single sigmoid unit: probability of the positive class
model.add(Dense(1, activation='sigmoid'))

# Compile with adjusted learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Callbacks
# NOTE(review): early stopping tracks val_loss while the checkpoint tracks
# val_accuracy, so the restored weights and best_model.keras may come from
# different epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=7,
    restore_best_weights=True,
    verbose=1,
)
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    min_lr=1e-6,
)
best_checkpoint = ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)
callbacks = [early_stopping, lr_on_plateau, best_checkpoint]

I0000 00:00:1745267432.221044      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745267432.221686      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [9]:
# Train
# 200 is an upper bound on epochs; a callback in `callbacks` may stop earlier.
fit_options = dict(
    epochs=200,
    batch_size=64,
    validation_split=0.1,
    callbacks=callbacks,
)
history = model.fit(X_train_pad, y_train, **fit_options)

Epoch 1/200


I0000 00:00:1745267439.407238      90 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 146ms/step - accuracy: 0.4994 - loss: 2.3187 - val_accuracy: 0.4963 - val_loss: 1.2251 - learning_rate: 1.0000e-04
Epoch 2/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 158ms/step - accuracy: 0.5385 - loss: 1.0555 - val_accuracy: 0.4963 - val_loss: 1.2604 - learning_rate: 1.0000e-04
Epoch 3/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 158ms/step - accuracy: 0.5589 - loss: 0.8532 - val_accuracy: 0.6913 - val_loss: 1.3733 - learning_rate: 1.0000e-04
Epoch 4/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 158ms/step - accuracy: 0.6065 - loss: 0.7771 - val_accuracy: 0.5255 - val_loss: 0.8708 - learning_rate: 1.0000e-04
Epoch 5/200
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 158ms/step - accuracy: 0.6283 - loss: 0.7428 - val_accuracy: 0.5735 - val_loss: 1.3824 - learning_rate: 1.0000e-04
Epoch 6/200
[1m563/563[0m [32m━━━━━━━━━

In [10]:
# Evaluate on the held-out test split (X_test_pad / y_test)
test_metrics = model.evaluate(X_test_pad, y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 36ms/step - accuracy: 0.8663 - loss: 0.4639
Test Loss: 0.46459683775901794
Test Accuracy: 0.8675000071525574


In [11]:
# Save final model
final_model_path = 'final_lstm_model.keras'
model.save(final_model_path)

# Save metadata
# NOTE(review): pickle stores a function as a reference to its module and
# name, not its code, so loading this file presumably requires `clean_text`
# to be defined in the loading program's main module -- verify against the
# consumer of metadata.pickle.
metadata = {}
metadata['max_len'] = max_len
metadata['clean_text_func'] = clean_text
with open('metadata.pickle', mode='wb') as metadata_file:
    pickle.dump(metadata, metadata_file, protocol=pickle.HIGHEST_PROTOCOL)