# Movie Review Sentiment Analyzer

## Importing the required Libraries and Dependencies

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pickle

## Loading the Dataset

In [2]:
data = pd.read_csv("IMDB_reviews_dataset.csv")

In [3]:
data.shape

(150000, 4)

In [4]:
data.head()

Unnamed: 0,Ratings,Reviews,Movies,Resenhas
0,1.0,*Disclaimer: I only watched this movie as a co...,Disaster Movie,* Isenção de responsabilidade: eu só assisti e...
1,1.0,I am writing this in hopes that this gets put ...,Disaster Movie,Estou escrevendo isso na esperança de que isso...
2,1.0,"Really, I could write a scathing review of thi...",Disaster Movie,"Realmente, eu poderia escrever uma crítica con..."
3,1.0,If you saw the other previous spoof movies by ...,Disaster Movie,Se você viu os outros filmes falsificados ante...
4,1.0,This movie I saw a day early for free and I st...,Disaster Movie,Este filme eu vi um dia cedo de graça e ainda ...


In [5]:
data.tail()

Unnamed: 0,Ratings,Reviews,Movies,Resenhas
149995,10.0,GoldenEye (1995) is my number 1 personal favor...,GoldenEye,GoldenEye (1995) é o meu filme favorito pessoa...
149996,10.0,"*** 1/2Starring: Pierce Brosnan, Izabella Scor...",GoldenEye,"Estrelando: Pierce Brosnan, Izabella Scorupco,..."
149997,10.0,"I've given this film a 10, not just because I ...",GoldenEye,"Eu dei 10 a este filme, não apenas porque eu g..."
149998,10.0,I absolutely adore this movie. What a comeback...,GoldenEye,Eu absolutamente adoro este filme. Que retorno...
149999,10.0,I rented Boogie Nights last week and I could t...,Boogie Nights,Aluguei o Boogie Nights na semana passada e pu...


## Data Cleaning

In [6]:
data.isnull().sum()

Ratings      0
Reviews      0
Movies      25
Resenhas     0
dtype: int64

## Label Encoding

In [7]:
data['Label'] = data['Ratings'].apply(lambda x: '1' if x >= 7 else ('0' if x < 4 else '2'))
data = data[['Reviews', 'Label']]

In [8]:
data.head()

Unnamed: 0,Reviews,Label
0,*Disclaimer: I only watched this movie as a co...,0
1,I am writing this in hopes that this gets put ...,0
2,"Really, I could write a scathing review of thi...",0
3,If you saw the other previous spoof movies by ...,0
4,This movie I saw a day early for free and I st...,0


In [9]:
data["Label"].value_counts()

Label
1    60000
0    45000
2    45000
Name: count, dtype: int64

### The Distribution of your labels is Unbalanced

In [10]:
# Sample 45,000 data points from each label category
positive = data[data['Label'] == '1'].sample(n=45000, random_state=42)
negative = data[data['Label'] == '0']
neutral = data[data['Label'] == '2']

In [11]:
# Concatenate the sampled datasets
balanced_data = pd.concat([positive, negative, neutral])

In [12]:
# Shuffle the final dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new distribution of labels
print(balanced_data['Label'].value_counts())

Label
0    45000
1    45000
2    45000
Name: count, dtype: int64


## Splitting dataset into train and test datasets

In [13]:
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)

In [14]:
print(train_data.shape)
print(test_data.shape)

(108000, 2)
(27000, 2)


## Data Preprocessing

In [15]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(balanced_data['Reviews'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["Reviews"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["Reviews"]), maxlen=200)

In [16]:
print(X_train)

[[   0    0    0 ...  102   93  270]
 [   0    0    0 ...   77  842  438]
 [   2 1246   67 ...  222  211  157]
 ...
 [   0    0    0 ...   26 2993   44]
 [   0    0    0 ...  231  927    2]
 [ 252   11   99 ...   26  581  770]]


In [17]:
print(X_test)

[[   0    0    0 ... 1830    1 2857]
 [   0    0    0 ...    2  914  351]
 [   0    0    0 ...   21   10   12]
 ...
 [   0    0    0 ...   61   23  296]
 [   4    1  241 ...   90   10 1139]
 [   0    0    0 ...    7  672  145]]


In [18]:
y_train = train_data['Label'].astype('int')
y_test = test_data['Label'].astype('int')

In [19]:
print(y_train)

62354     0
130705    1
36666     2
3354      0
60070     2
         ..
110268    0
119879    1
103694    0
131932    1
121958    2
Name: Label, Length: 108000, dtype: int32


In [20]:
print(y_test)

73478     1
83763     1
62776     0
88781     1
86013     0
         ..
5093      2
112967    2
43643     0
78903     0
101207    1
Name: Label, Length: 27000, dtype: int32


## Building the Long Short Term Memory(LSTM) Model

In [21]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))



In [22]:
## Compiling the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## Training the Model

In [25]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m661s[0m 487ms/step - accuracy: 0.5795 - loss: 0.8824 - val_accuracy: 0.6706 - val_loss: 0.7412
Epoch 2/20
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m647s[0m 479ms/step - accuracy: 0.6952 - loss: 0.6933 - val_accuracy: 0.7123 - val_loss: 0.6504
Epoch 3/20
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m645s[0m 478ms/step - accuracy: 0.7459 - loss: 0.5994 - val_accuracy: 0.7375 - val_loss: 0.6074
Epoch 4/20
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m699s[0m 518ms/step - accuracy: 0.7698 - loss: 0.5468 - val_accuracy: 0.7420 - val_loss: 0.6041
Epoch 5/20
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m785s[0m 582ms/step - accuracy: 0.7866 - loss: 0.5127 - val_accuracy: 0.7410 - val_loss: 0.5979
Epoch 6/20
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m780s[0m 578ms/step - accuracy: 0.8034 - loss: 0.4787 - val_accuracy: 0.7384 - val_loss:

<keras.src.callbacks.history.History at 0x18ae59103e0>

## Model Evaluation

In [26]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m844/844[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 40ms/step - accuracy: 0.7287 - loss: 0.6558
Test Loss: 0.6591445803642273
Test Accuracy: 0.7285555601119995


## Building a Predictive System

In [27]:
def predict_sentiment(review):
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = model.predict(padded_sequence)
  sentiment = ['negative', 'positive', 'neutral'][prediction.argmax()]
  return sentiment 

### Example Usage

In [28]:
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 376ms/step
The sentiment of the review is: positive


In [29]:
new_review = "Movie was ok it is a one time watch."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
The sentiment of the review is: neutral


In [30]:
new_review = "This movie is worse."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
The sentiment of the review is: negative


In [31]:
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
The sentiment of the review is: neutral


In [32]:
new_review = "This movie is not good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
The sentiment of the review is: negative


In [33]:
new_review = "The movie graphics and VFX are good but movie is just satisfactory"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
The sentiment of the review is: neutral


## Saving the Model and Tokenizer

In [34]:
# Saving the LSTM Model
model.save('model.h5')

# Saving the Tokenizer
with open('token.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)



## Example Usage on saved model

In [35]:
# Loading the LSTM Model
loaded_model = load_model('model.h5')

# Loading the Tokenizer
with open('tokenizer.pkl', 'rb') as file:
    loaded_tokenizer = pickle.load(file)

# Now use `loaded_model` and `loaded_tokenizer` for predictions
def predict_the_sentiment(review, model=loaded_model, tokenizer=loaded_tokenizer):
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(padded_sequence)
    sentiment = ['negative', 'positive', 'neutral'][prediction.argmax()]
    return sentiment



In [36]:
new_review = "The movie graphics and VFX are good but movie is just satisfactory"
sentiment = predict_the_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 362ms/step
The sentiment of the review is: neutral


In [37]:
new_review = "This movie is not good."
sentiment = predict_the_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
The sentiment of the review is: negative
