In [1]:
# Import required libraries

import pandas as pd    # for loading and handling dataset
import numpy as np     # for mathematical operations
import re              # for regex text cleaning
from nltk.corpus import stopwords   # stopwords for text preprocessing
from sklearn.model_selection import train_test_split       # split dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # encode text to integers
from tensorflow.keras.preprocessing.sequence import pad_sequences   # padding/truncating
from tensorflow.keras.models import Sequential     # sequential model
from tensorflow.keras.layers import Embedding, LSTM, Dense # model layers
from tensorflow.keras.callbacks import ModelCheckpoint     # save best model
from tensorflow.keras.models import load_model             # load saved model

# Load dataset
data = pd.read_csv('Dataset/IMDB Dataset.csv')
print(data.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [2]:
# Define English stopwords

english_stops = set(stopwords.words('english'))

# Function to load and preprocess dataset
def load_dataset():
    df = pd.read_csv('Dataset/IMDB Dataset.csv')
    x_data = df['review']       # reviews (input)
    y_data = df['sentiment']    # sentiment (output)

    # Remove HTML tags
    x_data = x_data.replace({'<.*?>': ''}, regex=True)

    # Remove non-alphabet characters
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)

    # Remove stopwords
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])

    # Convert to lowercase
    x_data = x_data.apply(lambda review: [w.lower() for w in review])

    # Encode labels (positive → 1, negative → 0)
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

# Load preprocessed data
x_data, y_data = load_dataset()

print("Sample reviews:\n", x_data.head())
print("\nSample sentiments:\n", y_data.head())


Sample reviews:
 0    [one, reviewers, mentioned, watching, oz, epis...
1    [a, wonderful, little, production, the, filmin...
2    [i, thought, wonderful, way, spend, time, hot,...
3    [basically, family, little, boy, jake, thinks,...
4    [petter, mattei, love, time, money, visually, ...
Name: review, dtype: object

Sample sentiments:
 0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64


  y_data = y_data.replace('negative', 0)


In [3]:
# Train Test Split

x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42
)

print("Train size:", len(x_train))
print("Test size:", len(x_test))


Train size: 40000
Test size: 10000


In [4]:
# Function to calculate max review length (average)

def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    return int(np.ceil(np.mean(review_length)))

max_length = get_max_length()
print("Maximum review length:", max_length)


Maximum review length: 130


In [5]:
# Tokenize words
token = Tokenizer(lower=False)    # no need lower, already lowercase
token.fit_on_texts(x_train)

# Convert text to sequences
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

# Pad sequences to fixed length
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

# Total words in vocabulary
total_words = len(token.word_index) + 1   # +1 for padding

print("Vocabulary size:", total_words)
print("Example padded review:\n", x_train[0])


Vocabulary size: 92546
Example padded review:
 [  145     1   702  2078    38  1815  1945  4247  6378   698  4753 21764
   135     2  6062    22   680    30     5  1922    31    48  1031  2286
  1095   361   458   111   820   309 28145    70  2900 35698  4284  2861
   129    17    53  3064   141   339     1   110  4855     1    14     5
 14365    18   269     4   110   212 56913 39814  1772  9974    45   324
 45914 45915  1598 11127   480   330   286     1    46    44   272 11735
     1     1  6717   268 12160  1076 23833   969 56914 56915   373   125
   920  4090    86   345   598   525   143    41   750   287   128   335
   271    10  3660   184 10125    73  1619    98 11519 11324   629  1859
  1110  2803 11325  3635  1599 32614   379     1   136   115   231     1
   980  2755  8590     1  9082  3940     3   903 16627   401]


In [6]:
# Model architecture

EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))  # binary output

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.build(input_shape=(None, max_length))

# Show model summary
print(model.summary())




None


In [7]:
# Save best model 

checkpoint = ModelCheckpoint(
    'models/LSTM.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

# Train model
history = model.fit(
    x_train, y_train,
    batch_size=128,
    epochs=5,
    callbacks=[checkpoint]
)


Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.5124 - loss: 0.6921
Epoch 1: accuracy improved from -inf to 0.51858, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 119ms/step - accuracy: 0.5124 - loss: 0.6921
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.6485 - loss: 0.6326
Epoch 2: accuracy improved from 0.51858 to 0.68415, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 110ms/step - accuracy: 0.6486 - loss: 0.6325
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.6396 - loss: 0.6211
Epoch 3: accuracy did not improve from 0.68415
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 117ms/step - accuracy: 0.6396 - loss: 0.6212
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/ste

In [8]:
# Predict on test data
y_pred = (model.predict(x_test, batch_size=128) > 0.5).astype("int32")

# Calculate accuracy manually
true = np.sum(y_test.values == y_pred.flatten())
print("Correct Predictions:", true)
print("Wrong Predictions:", len(y_pred) - true)
print("Accuracy:", true / len(y_pred) * 100)


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step
Correct Predictions: 7642
Wrong Predictions: 2358
Accuracy: 76.42


In [10]:
# Load best saved model
from tensorflow.keras.models import load_model  
loaded_model = load_model('models/LSTM.keras')

# Input custom review
review = str(input("Movie Review: "))

# Clean input review
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print("Cleaned:", review)

# Remove stopwords
words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]
print("Filtered:", filtered)

# Convert to sequence
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print("Encoded input:", tokenize_words)

# Predict sentiment
result = loaded_model.predict(tokenize_words)

if result >= 0.7:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")


Cleaned: I Love Mountains
Filtered: ['i love mountains']
Encoded input: [[   1   42 4076    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
Sentiment: Positive
