In [137]:
#prerequisite
#!pip install kaggle

**Importing the Libraries**

In [138]:
import os
import json

from zipfile import ZipFile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,Dropout, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

**Data using Kaggle API**

In [139]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [140]:
# unzip the dataset file
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", "r") as zip_ref:
  zip_ref.extractall()

**Loading the Dataset**

In [141]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [142]:
max_features = 10000  # Number of words to consider as features (vocabulary size)
max_len = 200  # Maximum length of each review (after padding)

In [143]:
data.shape

(50000, 2)

In [144]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [145]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [146]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [147]:
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

In [148]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [149]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [150]:
# split data into training data and test data
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

In [151]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


**Data Preprocessing**

In [152]:
# Tokenize text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])
#Preprocess the data (padding sequences to ensure uniform length)

x_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [153]:
print(x_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [154]:
print(x_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [155]:
y_train = train_data["sentiment"]
y_test = test_data["sentiment"]

In [156]:
print(y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


**Build LSTM Model - Long Short Term Memory**

In [157]:
from tensorflow.keras.layers import BatchNormalization

model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=max_len))
model.add(LSTM(units=128, return_sequences=False))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(units=1, activation='sigmoid'))



In [158]:
model.summary()

In [159]:
# compile the model
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

**Train the Model**

In [160]:
from tensorflow.keras.callbacks import EarlyStopping

model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_test, y_test), validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.7589 - loss: 0.4718 - val_accuracy: 0.8664 - val_loss: 0.3228
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.8935 - loss: 0.2647 - val_accuracy: 0.8594 - val_loss: 0.3244
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9117 - loss: 0.2220 - val_accuracy: 0.8771 - val_loss: 0.3539
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.9267 - loss: 0.1848 - val_accuracy: 0.8498 - val_loss: 0.3594
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.9374 - loss: 0.1671 - val_accuracy: 0.8829 - val_loss: 0.3328
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.9440 - loss: 0.1474 - val_accuracy: 0.8670 - val_loss: 0.3880


<keras.src.callbacks.history.History at 0x7d9ca1f64b20>

**Model Evaluation**

In [161]:
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {test_loss}")
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8657 - loss: 0.3880
Test Loss: 0.3879960775375366
Test Accuracy: 0.8670


**Building a Predictive System**

In [162]:
def predict_sentiment(text):
    if not model or not tokenizer:
        return "Model or tokenizer not loaded correctly."

    # Tokenize and pad the input text
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=model.input_shape[1])

    # Make prediction
    prediction = model.predict(padded_sequences)
    sentiment = np.argmax(prediction, axis=1)

    # Convert prediction to label
    labels = ['Negative', 'Positive']
    sentiment = np.argmax(prediction, axis=1)
    return labels[sentiment[0]]


In [163]:
# example usage
new_review = "This movie was pathetic. Never going to watch the movie again"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
The sentiment of the review is: Negative


In [164]:
model.save('sentiment_lstm_model.h5')



In [165]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# Example text data
texts = ['I love programming', 'Streamlit is great']

# Create and fit the tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)

# Save the tokenizer using pickle
with open('sentiment_lstm_tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
