In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


# **Reading the movie review data in a dataframe df.**

In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

# **Having a look at the dataframe.**

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# **We see that we have enough data to train. We have 50,000 review texts and their corresponding sentiments, but, these texts have to be pre-processed using natural language processing techniques as they contain unwanted elements like html tags, hashtags and special characters, that have no use for our model to train on.**

# **Importing necessary libraries for pre-processing text and creation of the recurrent neural network.**

In [4]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

# **Creating functions to pre-process the review text using natural language processing techniques as they contain unwanted elements like html tags, hashtags, special characters, numbers, etc. and applying them to the review text column.**

In [5]:
def remove_special_chars(tweets): # it unrolls the hashtags to normal words
        alphanumeric=""
        for character in tweets:
            if character.isalpha() or character==" ":
                alphanumeric += character
        return alphanumeric
def remove_tags(text):
     return re.compile(r"<[^>]+>").sub(" ", text)
def remove_num(text):
     return "".join(re.sub(r"([0–9]+)"," ",text))
#data = pd.read_csv('/content/IMDB Dataset.csv')
df.review=df.review.apply(lambda x : remove_tags(x))
df.review=df.review.apply(lambda x : remove_num(x))
df.review=(df.review).apply(remove_special_chars)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming te...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


# **We will use the Tokenizer class from the keras.preprocessing.text module to create a word-to-index dictionary. In the word-to-index dictionary, each word in the corpus is used as a key, while a corresponding unique index is used as the value for the key. As we want each review to be of the same length so that it can be fit in the model, we pad it to get all reviews of the same length set the maximum size of each list to 500. The lists with a size greater than 500 will be truncated to 500. This process is called padding.**

# **Since the sentiment is in the form of text ("positive" amd "negative"), converting them to numerical values using label encoder for our model to understand the sentiment in numerical form.**

In [6]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df.review)
X = tokenizer.texts_to_sequences(df.review)
X = pad_sequences(X,maxlen=500)
Y = df.sentiment
vocab_size = len(tokenizer.word_index) + 1
# We can then create our train and test sets:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 24)
#We store this tokenizer in a file to use later in web app
import pickle
# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
from sklearn.preprocessing import LabelEncoder
def prepare_targets(y_train, y_test):
     le = LabelEncoder()
     le.fit(y_train)
     y_train_enc = le.transform(y_train)
     y_test_enc = le.transform(y_test)
     return y_train_enc, y_test_enc
ytrain,ytest = prepare_targets(Y_train,Y_test)

# Now we design a sequential model using Keras. We add different layers such as embedding layer, Bi-LSTM layer and a dense layer.
# Bidirectional Long Short-Term Memory (BiLSTM) is a type of recurrent neural networks. It processes data in two directions since it works with two hidden layers. This is the main point of divergence with LSTM. BiLSTM has proven good results in natural language processing. Adding dropout layer to reduce overfitting.

In [7]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=500))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           8134400   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 8,317,953
Trainable params: 8,317,953
Non-trainable params: 0
_________________________________________________________________


# Training the model for 10 epochs with appropriate hyperparameters.

In [9]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
history=model.fit(X_train, ytrain,
 batch_size=128,
 epochs=10,
 callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# **Our model achieves good accuracy on the training set (95.41 %).**

# Testing our model for new reviews, typed by me or taken from the internet. 

In [99]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
Between the Lovecraftian overtones and Liberato’s performance,  The Beach House offers up beautifully shot terror and will make you think before opening your door.



It is a positive review, with sentiment score [[0.9779469]]


In [80]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
suprised to see the extensive bad reviews, I decided to see the movie for myself and judge it. It was a nice watch, brought out some social issues and challenged some beliefs like trust on godmen, blind faith, parents undying love for children...thus while it may not sit well with our preconcieved notions, it challeneges us to think why we have these beliefs in the first place? And whether they are actually universal truths or just more of generational beliefs that are passed on to us.....forming sort of an archetype in out psyche. At the very basic level the movie tries to drill into you the difference between following God and following Godmen, especially giving people the status of God, and taking their word as word of God....it's made it amply clear that protagonist is not against God but against people using name of God for their benefit... it's a realisation and awareness the socio-political situation perhaps really needs....The movie has good twists which come

In [82]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
No joke but Sadak 2 was really my most awaited film of 2020. As a school kid, I had bunked classes to watch the first part five times in its first week at Gemini cinema Bandra, my unbeaten record till date!! Featuring Sanjay Dutt as the sanki taxi driver Ravi who rescues his lady love Pooja Bhatt from clutches of the eunuch pimp Maharani, Sadak had terrific performances, Nadeem Shravan's extraordinary music and pulsating action in the climax which made it one of Mahesh Bhatt's most successful as well as memorable film till date. It is a perfect dose of entertainment for me - and I just love it.So naturally, I started watching Sadak 2 with high expectations- after all it had the legendary Bhatt Saahab returning to direction after twenty odd years, Sanju Baba and Pooja reprising their iconic roles with the current A lister Alia Bhatt added to the cast. And what did I get....Alia Bhatt screaming or hamming in every scene she appears, Sanju Baba largely ineffective with 

In [83]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
Such foolish reasoning & pointless opinions made no sense. The director really had no idea what he was trying to make.Performance wise only Sanjay Dutt impresses. The villains hardly have much to do & Alia Bhatt irritates at times. Aditya Roy Kapur was rightfully a mere spectator



It is a negative review, with sentiment score [[0.27764797]]


In [85]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:



It is a negative review, with sentiment score [[0.00284904]]


In [86]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
Direction is average we did not expect this type of execution from Bhatt. Dialogues avoidable.Bad VFX for showing  Kailash mansarover at the end



It is a negative review, with sentiment score [[0.01568869]]


In [87]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
"Drishyam is an outsanding thriller. Drishyam is a well directed movie, which keeps you hooked on to the screen. Mesmerizing at every move with classical end



It is a positive review, with sentiment score [[0.99414897]]


In [90]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
sadak-2, the way it started, it looked very promising, it did live up to its expectation till interval. Post interval, you are gasping for breath as you do not know where the movie is heading to. At the end, you will feel like going to the box office and closing their shutters so that you can save other people's lives by not letting them buy the ticket of this horrible movie.



It is a negative review, with sentiment score [[0.2821839]]


In [91]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
Three Idiots is a remarkable ahead of its time Bollywood blockbuster. This film is a comedy movie with strong acting, memorable characters, a perplexing storyline and most importantly, highly motivational movie to choose the right path in your life.



It is a positive review, with sentiment score [[0.9743082]]


In [92]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1))) 

Enter your review:
Over the course of five years, Vince Gilligan and his friends have constructed a world piece by piece, with attention to detail worthy of a Faberge egg. They created a compelling protagonist, a deeply flawed yet charismatic genius. They built a business at which he had savant-like skills, and depicted the family that often drove him crazy. Then blurred lines between the two. And in that way created a life for Walter White that many of us can relate to. But other great and groundbreaking TV dramas had done something similar, most notably David Chase’s The Sopranos, David Simon’s The Wire, and David Milch’s Deadwood. But Breaking Bad did something those iconic shows didn’t do. Show runner Vince Gilligan set his protagonist in motion. Television had always been about a kind of inertia. After every episode of M*A*S*H or The Rockford Files there’d be a cosmic reset button that would allow the characters to return to exactly where they started at the beginning of the episo

In [97]:
print("Enter your review:")
string11=str(input())
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
a=model.predict(x_1)
print()
print()
print()
if a>=0.5:
    print("It is a positive review, with sentiment score {}".format(model.predict(x_1)))
elif a<0.5:
    print("It is a negative review, with sentiment score {}".format(model.predict(x_1)))

Enter your review:
I will forever remember and keep this movie in my heart for the rest of my life.



It is a positive review, with sentiment score [[0.9001584]]


# This model can also be used to get to know the average sentiment of a list of many reviews, like thousands or millions of review texts, where it is not possible to read each and every comment and now how the audience feels about the movie.

In [102]:
review_list=["I really hated this movie!!","This movie was the worst experience of my life!!","I wish I skipped watching this movie.","This movie had absolutely no plot, no direction, poor dialogues","I liked this movie","I absolutely enjoyed this movie from the start to the end!!","I will forever remember this movie in my heart for the rest of my life."]
x_2=tokenizer.texts_to_sequences(review_list)
x_2 = pad_sequences(x_2,maxlen=500)
a=model.predict(x_2)
print("Average sentiment score for the above list of comments : "+str(np.average(a)))

Average sentiment score for the above list of comments : 0.4105387
