<a href="https://colab.research.google.com/github/harshitabhambhani/ML-DL-models/blob/main/Basic_NLP_Project_Sentiment_Analysis_(IMDb_movie_reviews).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMDb movie reviews: Sentiment analysis using deep learning models**

## **Objective:**
To build a simple sentiment analysis model that predicts whether a movie review is positive or negative.

### **Import necessary libraries:**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # for regex
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
import pickle

### **Load and preprocess the dataset:**

In [2]:
imdb_data=pd.read_csv('https://raw.githubusercontent.com/SK7here/Movie-Review-Sentiment-Analysis/master/IMDB-Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### **Exploratery data analysis:**

In [3]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


### **Sentiment count:**

In [4]:
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### **Spliting the training dataset:**

In [5]:
#train dataset
train_reviews=imdb_data.review[:40000]
train_sentiments=imdb_data.sentiment[:40000]

#test dataset
test_reviews=imdb_data.review[40000:]
test_sentiments=imdb_data.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


### **Data labeling:**

In [6]:
labeling = {
    'positive':1,
    'negative':0
}

imdb_data['sentiment'] = imdb_data['sentiment'].apply(lambda x : labeling[x])
# Output first ten rows
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


### **Separating labels and features:**

In [7]:
label=imdb_data['sentiment']
imdb_data=imdb_data.drop(['sentiment'],axis=1)
label=label.tolist()
print(type(label))

<class 'list'>


### **Extracting text reviews:**

In [8]:
imdb_data=imdb_data['review'].tolist()
print(type(imdb_data))

<class 'list'>


### **Variable assignment:**

In [9]:
data=imdb_data
labels=label

### **Text tokenization and padding:**

In [10]:
MAX_SEQUENCE_LENGTH = 500
EMBEDDING_DIM = 10
# https://keras-cn-docs.readthedocs.io/zh_CN/latest/blog/word_embedding/
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# tokenizer
texts = data
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# sequences
sequences = tokenizer.texts_to_sequences(data)

# padding
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Found %s unique tokens.' % len(word_index))
print('Shape of data tensor:', data.shape)

Found 162 unique tokens.
Shape of data tensor: (50000, 500)


### **Shuffling and splitting data for training and testing:**

In [11]:
import random

index = [i for i in range(len(data))]
random.shuffle(index)
data = np.array(data)[index]
labels = np.array(labels)[index]

TRAIN_SPLIT = 0.8
TRAIN_SIZE = int(len(data) * TRAIN_SPLIT)

X_train, X_test = data[0:TRAIN_SIZE], data[TRAIN_SIZE:]
Y_train, Y_test = labels[0:TRAIN_SIZE], labels[TRAIN_SIZE:]

## **Building a neural network model using CNN+BiLSTM:**

In [12]:
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization
from keras.layers import Dense, LSTM, Convolution1D, MaxPooling1D
from keras.layers import Embedding
from keras.layers import Bidirectional


QA_EMBED_SIZE = 64
DROPOUT_RATE = 0.3

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Convolution1D(filters=128, kernel_size=3, padding='valid', activation='relu'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(4))
model.add(Bidirectional(LSTM(QA_EMBED_SIZE, return_sequences=False, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE)))

model.add(Dense(QA_EMBED_SIZE))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.add(BatchNormalization())
model.add(Activation("sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 10)           1630      
                                                                 
 conv1d (Conv1D)             (None, 498, 128)          3968      
                                                                 
 batch_normalization (Batch  (None, 498, 128)          512       
 Normalization)                                                  
                                                                 
 activation (Activation)     (None, 498, 128)          0         
                                                                 
 max_pooling1d (MaxPooling1  (None, 124, 128)          0         
 D)                                                              
                                                                 
 bidirectional (Bidirection  (None, 128)               9

### **Custom evaluation metrics**

In [13]:
import tensorflow as tf
from keras import backend as K

def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))

### **Model training with callbacks**

In [14]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from keras.utils import multi_gpu_model
#from evaluate import *

EPOCHS = 3
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.3

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('model/model-cnn-blstm.h5', save_best_only=True, save_weights_only=True)
tensor_board = TensorBoard('log/tflog-cnn-blstm', write_graph=True, write_images=True)

# model = multi_gpu_model(model)

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', precision, recall, f1])

model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
          validation_split=VALIDATION_SPLIT, shuffle=True,
          callbacks=[early_stopping, model_checkpoint, tensor_board])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7850ac29ea10>

### **Model evaluation on test data:**

In [15]:
model.evaluate(X_test, Y_test, verbose=1, batch_size=BATCH_SIZE)



[0.7038489580154419,
 0.5856000185012817,
 0.9630365967750549,
 0.18367817997932434,
 0.3034539818763733]

## **Modified neural network model (using CNN+BiLSTM):**

In [16]:
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization, Flatten
from keras.layers import Dense, LSTM, Convolution1D, MaxPooling1D
from keras.layers import Embedding
from keras.layers import Bidirectional

QA_EMBED_SIZE = 64
DROPOUT_RATE = 0.3

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Convolution1D(filters=128, kernel_size=3, padding='valid', activation='relu'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(4))
model.add(Bidirectional(LSTM(QA_EMBED_SIZE, return_sequences=True, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE)))
model.add(Convolution1D(filters=128, kernel_size=3, padding='valid', activation='relu'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(4))
model.add(Flatten())
model.add(Dense(QA_EMBED_SIZE))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.add(BatchNormalization())
model.add(Activation("sigmoid"))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 10)           1630      
                                                                 
 conv1d_1 (Conv1D)           (None, 498, 128)          3968      
                                                                 
 batch_normalization_3 (Bat  (None, 498, 128)          512       
 chNormalization)                                                
                                                                 
 activation_3 (Activation)   (None, 498, 128)          0         
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 124, 128)          0         
 g1D)                                                            
                                                                 
 bidirectional_1 (Bidirecti  (None, 124, 128)         

### **Model training with callbacks (Modified model)**

In [17]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from keras.utils import multi_gpu_model
#from evaluate import *

EPOCHS = 3
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.3

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('model/model-cnn-blstm-cnn.h5', save_best_only=True, save_weights_only=True)
tensor_board = TensorBoard('log/tflog-cnn-blstm-cnn', write_graph=True, write_images=True)

# model = multi_gpu_model(model)

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', precision, recall, f1])

model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
          validation_split=VALIDATION_SPLIT, shuffle=True,
          callbacks=[early_stopping, model_checkpoint, tensor_board])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7850a5049300>

### **Model evaluation on test data (Modified model)**

In [18]:
model.evaluate(X_test, Y_test, verbose=1, batch_size=BATCH_SIZE)



[1.9736465215682983,
 0.5058000087738037,
 0.40764331817626953,
 0.016331851482391357,
 nan]

## **Simplified neural network model (BiLSTM)**

In [19]:
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization
from keras.layers import Dense, LSTM
from keras.layers import Embedding
from keras.layers import Bidirectional

QA_EMBED_SIZE = 64
DROPOUT_RATE = 0.3

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(QA_EMBED_SIZE, return_sequences=False, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE)))

model.add(Dense(QA_EMBED_SIZE))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.add(BatchNormalization())
model.add(Activation("sigmoid"))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 10)           1630      
                                                                 
 bidirectional_2 (Bidirecti  (None, 128)               38400     
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_7 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 activation_7 (Activation)   (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                

### **Model training with callbacks (Simplified model)**

In [20]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from keras.utils import multi_gpu_model
#from evaluate import *

EPOCHS = 3
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.3

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('model/model-blstm.h5', save_best_only=True, save_weights_only=True)
tensor_board = TensorBoard('log/tflog-blstm', write_graph=True, write_images=True)

# model = multi_gpu_model(model)

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', precision, recall, f1])

model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
          validation_split=VALIDATION_SPLIT, shuffle=True,
          callbacks=[early_stopping, model_checkpoint, tensor_board])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7850a3ee6e00>

### **Model evaluation on test data:**

In [21]:
model.evaluate(X_test, Y_test, verbose=1, batch_size=BATCH_SIZE)



[0.656335711479187,
 0.5968999862670898,
 0.5902595520019531,
 0.6518038511276245,
 0.6160317063331604]

## **Neural network model with convolutional layer (sequential LSTM):**

In [22]:
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization, Flatten
from keras.layers import Dense, LSTM, Convolution1D, MaxPooling1D
from keras.layers import Embedding
from keras.layers import Bidirectional

QA_EMBED_SIZE = 64
DROPOUT_RATE = 0.3

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(QA_EMBED_SIZE, return_sequences=True, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE)))
model.add(Convolution1D(filters=128, kernel_size=3, padding='valid', activation='relu'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPooling1D(4))
model.add(Flatten())

model.add(Dense(QA_EMBED_SIZE))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
model.add(BatchNormalization())
model.add(Activation("sigmoid"))

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 10)           1630      
                                                                 
 bidirectional_3 (Bidirecti  (None, 500, 128)          38400     
 onal)                                                           
                                                                 
 conv1d_3 (Conv1D)           (None, 498, 128)          49280     
                                                                 
 batch_normalization_9 (Bat  (None, 498, 128)          512       
 chNormalization)                                                
                                                                 
 activation_9 (Activation)   (None, 498, 128)          0         
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, 124, 128)         

### **Model training with callbacks (LSTM-CNN model):**

In [23]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from keras.utils import multi_gpu_model
#from evaluate import *

EPOCHS = 3
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.3

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('model/model-blstm-cnn.h5', save_best_only=True, save_weights_only=True)
tensor_board = TensorBoard('log/tflog-blstm-cnn', write_graph=True, write_images=True)

# model = multi_gpu_model(model)

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', precision, recall, f1])

model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
          validation_split=VALIDATION_SPLIT, shuffle=True,
          callbacks=[early_stopping, model_checkpoint, tensor_board])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x78509c1b6560>

### **Model evaluation on test data (LSTM-CNN Model):**

In [24]:
model.evaluate(X_test, Y_test, verbose=1, batch_size=BATCH_SIZE)



[1.3729488849639893,
 0.5098000168800354,
 0.506460428237915,
 0.997734785079956,
 0.6693865656852722]