In [0]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")


In [0]:
# Colab-only: prompt for a file upload. The recorded output shows
# IMDBdataset.csv being saved into the working directory.
from google.colab import files
uploaded = files.upload()

Saving IMDBdataset.csv to IMDBdataset.csv


In [0]:
# NOTE(review): `io` is not referenced by this cell.
import io
# Load the uploaded reviews; the file is decoded as latin-1.
imdbdata = pd.read_csv('IMDBdataset.csv',encoding = 'latin-1')

In [0]:
# Preview the first rows to check the column layout of the raw data.
imdbdata.head()

Unnamed: 0,SentimentText,Sentiment
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1


In [0]:
# (rows, columns) of the raw frame.
imdbdata.shape

(25000, 2)

# Preprocessing

In [0]:
# Drop rows with a missing label, cast the label to int, drop rows with a
# missing review text, and renumber the index from 0 without keeping the old
# index as a column.
imdbdata = (
    imdbdata
    .dropna(subset=['Sentiment'])
    .assign(Sentiment=lambda frame: frame['Sentiment'].map(int))
    .dropna(subset=['SentimentText'])
    .reset_index(drop=True)
)

In [0]:
# Summary statistics of the numeric column; the recorded mean of 0.5 on the
# 0/1 label shows the two classes are balanced.
imdbdata.describe()

Unnamed: 0,Sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [0]:
import re

# "@something" mentions and http(s):// links, up to the next whitespace.
pat_1 = r"(?:\@|https?\://)\S+"
# Hashtags, together with one optional trailing space.
pat_2 = r'#\w+ ?'
combined_pat = r'|'.join((pat_1, pat_2))
# Bare "www." links. The dot is escaped so that only a literal "www." starts a
# match; unescaped, the pattern also swallowed ordinary text containing "www"
# followed by any character (e.g. "awwwesome").
www_pat = r'www\.[^ ]+'
# Any HTML tag such as <br />.
html_tag = r'<[^>]+>'
# Lower-case contractions and their expanded forms.
negations_ = {"isn't":"is not", "can't":"can not","couldn't":"could not", "hasn't":"has not",
                "hadn't":"had not","won't":"will not",
                "wouldn't":"would not","aren't":"are not",
                "haven't":"have not", "doesn't":"does not","didn't":"did not",
                 "don't":"do not","shouldn't":"should not","wasn't":"was not", "weren't":"were not",
                "mightn't":"might not",
                "mustn't":"must not"}
# One alternation over all contractions, bounded by word boundaries. The keys
# are escaped so they are always matched literally.
negation_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, negations_)) + r')\b')

In [0]:
def data_cleaner(text):
    """Normalise one review into lower-case, letters-only text.

    Steps, in order: remove @mentions, http(s) links and hashtags
    (``combined_pat``); remove ``www`` links (``www_pat``); remove HTML tags
    (``html_tag``); lower-case; expand contractions via ``negations_``;
    replace every non-letter character with a space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text, or the placeholder ``'NC'`` when ``text`` is not a
        string (for example a missing value).
    """
    # Only non-string input is mapped to the placeholder. A blanket `except`
    # would also hide real failures (e.g. the pattern cell not having been
    # run) and silently turn every review into 'NC'.
    if not isinstance(text, str):
        return 'NC'
    stripped = re.sub(combined_pat, '', text)
    stripped = re.sub(www_pat, '', stripped)
    # NOTE(review): tags are replaced by the empty string, so words separated
    # only by a tag are joined together — confirm this is intended.
    cleantags = re.sub(html_tag, '', stripped)
    lower_case = cleantags.lower()
    neg_handled = negation_pattern.sub(lambda x: negations_[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    return letters_only

In [0]:
# Clean every review; entries data_cleaner cannot process come back as 'NC'.
imdbdata['SentimentText'] = imdbdata['SentimentText'].apply(data_cleaner)

In [0]:
# Preview the cleaned text: lower-cased, with punctuation replaced by spaces.
imdbdata.head()

Unnamed: 0,SentimentText,Sentiment
0,first think another disney movie might good ...,1
1,put aside dr house repeat missed desperate h...,0
2,big fan stephen king s work film made even gr...,1
3,watched horrid thing tv needless say one movi...,0
4,truly enjoyed film acting terrific plot jeff...,1


The review text is now cleaned.

In [0]:
MAX_SEQUENCE_LENGTH = 200  # every review is padded/truncated to this many tokens
MAX_VOCAB_SIZE = 20000     # passed to Tokenizer(num_words=...) and caps the embedding rows
EMBEDDING_DIM = 50         # width of each word vector (the GloVe file used is the 50d one)
# NOTE(review): the three values below are not referenced by the model.fit
# call in this notebook, which passes literals instead (batch_size=32,
# epochs=10, validation_split=0.1).
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 5


In [0]:
# Cleaned review strings as an array, in row order.
sentences = imdbdata['SentimentText'].values

In [0]:
# Build the vocabulary from all reviews, then encode each review as a list of
# integer word indices. `num_words` limits the encoding to the most frequent
# words; rarer words are left out of the sequences.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [0]:
# Show the first review next to its integer encoding: each word is replaced
# by its index in the tokenizer vocabulary.

print(sentences[0])
print(sequences[0])

first think another disney movie  might good  it s kids movie  watch it  can not help enjoy it  ages love movie  first saw movie      years later still love it  danny glover superb could play part better  christopher lloyd hilarious perfect part  tony danza believable mel clark  can not help  enjoy movie  give       
[25, 31, 79, 745, 2, 150, 9, 5, 1, 269, 2, 35, 5, 93, 4, 258, 277, 5, 2019, 45, 2, 25, 133, 2, 75, 220, 57, 45, 5, 1570, 3182, 817, 20, 212, 89, 55, 1282, 3233, 558, 322, 89, 1034, 11095, 783, 3656, 2377, 93, 4, 258, 277, 2, 118]


In [0]:
# Word -> integer index mapping learned by the tokenizer.

word2idx = tokenizer.word_index
print(word2idx['disney'])

# 'disney' maps to 745, the same value that appears in the fourth position of
# the encoded first review printed earlier.

745


In [0]:
# Pad or truncate every encoded review to exactly MAX_SEQUENCE_LENGTH (200)
# tokens so the reviews can be stacked into one 2-D array. With the Keras
# defaults, both padding and truncation happen at the start of the sequence.

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (25000, 200)


In [0]:
# Longest review in whitespace-separated words (1497 in the recorded run);
# sequences are limited to MAX_SEQUENCE_LENGTH (200) tokens.
max(len(s.split()) for s in sentences)

1497

In [0]:
# Number of rows in the embedding matrix: the vocabulary cap, or the full
# vocabulary size plus one if that is smaller (word indices start at 1, so
# one extra row is needed for index 0).
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
num_words

20000

In [0]:
# The dataset contains 74119 distinct words, but only MAX_VOCAB_SIZE (20000)
# indices are used by the model.
len(word2idx)

74119

In [0]:
# Zero-initialised matrix with one row per word index (num_words rows) and
# EMBEDDING_DIM (50) columns.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
embedding_matrix.shape

(20000, 50)

In [0]:
# Colab-only: mount Google Drive under /content/drive.
# NOTE(review): the files opened by this notebook are read through relative
# paths in the working directory, not from under the mount point — confirm
# the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# List the working directory to confirm the GloVe file and the dataset are present.
os.listdir()

['glove.6B.50d.txt', 'IMDBdataset.csv']

In [0]:
# Load the pretrained GloVe vectors. Each line of the file holds a token
# followed by its vector components, separated by whitespace.
word2vec = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

# Copy the pretrained vector of every in-vocabulary word into its row of
# embedding_matrix. Without this step the matrix stays all zeros and the
# frozen embedding layer gives the model no information about the words.
# Words that GloVe does not know keep their all-zero row; indices at or above
# num_words are outside the matrix and are skipped.
for word, i in word2idx.items():
  if i < num_words:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector




Found 400000 word vectors.


Each word is represented by a vector of 50 numbers.

In [0]:
# Display the embedding matrix (numpy elides the middle rows and columns).
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.52339995e-01,  9.80849981e-01,  1.00650001e+00, ...,
         3.78430001e-02,  7.17440009e-01,  3.24349999e-01],
       [ 3.08239996e-01,  1.72230005e-01, -2.33390003e-01, ...,
        -9.81769979e-01, -3.21469992e-01,  9.98229980e-01],
       ...,
       [-2.17510000e-01, -2.45059997e-01,  2.56850006e-04, ...,
        -7.20620006e-02,  4.96069998e-01,  2.94990003e-01],
       [-1.82750002e-01,  4.45710011e-02,  2.57620007e-01, ...,
        -4.38789994e-01, -5.78729995e-03, -8.69099975e-01],
       [ 6.61360025e-01, -5.67149997e-01, -5.53359985e-01, ...,
         4.40559983e-02,  1.55670000e-02,  6.01760030e-01]])

Each row of the matrix now represents one word from the word2idx dictionary (the row number is the word's index), and the 50 columns hold that word's 50-dimensional vector representation.

In [0]:
# Embedding layer that maps each word index to its row of embedding_matrix.
# It is frozen, so the vectors are not updated while the model trains.

embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False                   # keep the pretrained vectors fixed
)


In [0]:
print('Shape of the input: ',data.shape)  # (number of reviews, sequence length) = (25000, 200)

Shape of the input:  (25000, 200)


In [0]:
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))  ## Input tensor of the network
print('Shape :',input_.shape)     ## Every sample must have 200 entries (word indices); the leading "?" is the batch dimension

Shape : (?, 200)


In [0]:
## Apply the embedding layer to the input tensor

x = embedding_layer(input_)                 ## Word indices -> word vectors
print('Shape :',x.shape)   ## Each of the 200 positions is now a 50-dimensional vector, hence (batch, 200, 50)

Shape : (?, 200, 50)


In [0]:
## First Bidirectional LSTM, applied to the embedding output

x = Bidirectional(LSTM(50, return_sequences=True))(x) ## 50 units per direction; return_sequences=True keeps one output per time step
print('Shape :',x.shape)                     ## Last dimension is 100: the forward and backward outputs (50 each) are combined

Shape : (?, ?, 100)


In [0]:
## Second Bidirectional LSTM, stacked on the output of the first one

x = Bidirectional(LSTM(25, return_sequences=True))(x) ## 25 units per direction; still one output per time step
print('Shape :',x.shape)                     ## Last dimension is 50: the forward and backward outputs (25 each) are combined

Shape : (?, ?, 50)


In [0]:
## Global max pooling over the time axis

x = GlobalMaxPool1D()(x)         ## Collapses (batch, steps, 50) to (batch, 50)
print('Shape :',x.shape)         ## The maximum is taken along axis 1, so the Dense layers that follow receive a 2-D tensor

Shape : (?, 50)


In [0]:
## Dense head: three ReLU layers of decreasing width (the first one takes the
## 50 pooled features), followed by a single sigmoid unit for the binary label.

for units in (64, 32, 16):
    x = Dense(units, activation='relu')(x)
    print('Shape :', x.shape)                # (batch, units)
output = Dense(1, activation="sigmoid")(x)   # Output layer
print('Shape :', output.shape)               # (batch, 1)


Shape : (?, 64)
Shape : (?, 32)
Shape : (?, 16)
Shape : (?, 1)


In [0]:
model = Model(inputs = input_, outputs = output)  # Model from the input tensor to the sigmoid output

# NOTE(review): newer Keras releases name the Adam argument `learning_rate`
# instead of `lr` — confirm against the installed version.
model.compile(
  loss='binary_crossentropy',     ## Loss for a 0/1 label with a sigmoid output
  optimizer=Adam(lr=0.01),        ## Adam optimizer with learning rate 0.01
  metrics=['accuracy']            ## Accuracy is reported during training
)

In [0]:
# Integer 0/1 labels, in the same row order as `data`.
y = imdbdata['Sentiment'].values
y

array([1, 0, 1, ..., 0, 0, 1])

In [0]:
# Train for 10 epochs with batches of 32 samples, holding out 10% of the rows
# for validation (22500 train / 2500 validation in the recorded run).
# NOTE(review): these literals differ from the BATCH_SIZE, EPOCHS and
# VALIDATION_SPLIT constants defined earlier in the notebook, which are not
# used here.
r = model.fit(
  data,
  y,
  batch_size=32,
  epochs=10,
  validation_split = 0.1
)

Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# Predicted probabilities for every row of `data`, i.e. for the training rows
# and the validation rows alike.
target = model.predict(data)

In [0]:
# Turn probabilities into hard 0/1 predictions using a 0.5 threshold.
target = (target >= 0.5).astype(int)

In [0]:
# Flatten the (n_samples, 1) prediction array to one dimension so it can be
# compared element-wise with `y`. reshape(-1) infers the length, so this keeps
# working when the number of rows is not exactly 25000.
target2 = target.reshape(-1)
target2

array([1, 0, 1, ..., 0, 0, 0])

In [0]:
# Fraction of rows whose thresholded prediction equals the label, computed
# over every row that was passed to predict.
(y == target2).mean()

0.91672

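An accuracy of about 91.7% over the full dataset. Note that this figure is computed on all 25,000 rows — the 22,500 training rows plus the 2,500 validation rows — so it is mostly a training accuracy and not an estimate of performance on unseen data.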