In [1]:
import imdb

In [2]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters


In [3]:
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

In [4]:
imdb.data_dir = "data/IMDB/"

In [5]:
tf.keras.__version__

'2.1.6-tf'

# Downloading the IMDB dataset 

In [6]:
import os
import download
import glob

########################################################################

# Directory where you want to download and save the data-set.
# Set this before you start calling any of the functions below.
data_dir = "data/IMDB/"

# URL for the data-set on the internet.
data_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"


########################################################################
# Private helper-functions.

def _read_text_file(path):
    """
    Read and return all the contents of the text-file with the given path.
    It is returned as a single string where all lines are concatenated.
    """

    with open(path, encoding='utf8') as file:
        # Read a list of strings.
        lines = file.readlines()

        # Concatenate to a single string.
        text = " ".join(lines)

    return text


########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.


def maybe_download_and_extract():
    """
    Download and extract the IMDB Review data-set if it doesn't already exist
    in data_dir (set this variable first to the desired directory).
    """

    download.maybe_download_and_extract(url=data_url, download_dir=data_dir)


def load_data(train=True):
    """
    Load all the data from the IMDB Review data-set for sentiment analysis.
    :param train: Boolean whether to load the training-set (True)
                  or the test-set (False).
    :return:      A list of all the reviews as text-strings,
                  and a list of the corresponding sentiments
                  where 1.0 is positive and 0.0 is negative.
    """

    # Part of the path-name for either training or test-set.
    train_test_path = "train" if train else "test"

    # Base-directory where the extracted data is located.
    dir_base = os.path.join(data_dir, "aclImdb", train_test_path)

    # Filename-patterns for the data-files.
    path_pattern_pos = os.path.join(dir_base, "pos", "*.txt")
    path_pattern_neg = os.path.join(dir_base, "neg", "*.txt")

    # Get lists of all the file-paths for the data.
    paths_pos = glob.glob(path_pattern_pos)
    paths_neg = glob.glob(path_pattern_neg)

    # Read all the text-files.
    data_pos = [_read_text_file(path) for path in paths_pos]
    data_neg = [_read_text_file(path) for path in paths_neg]

    # Concatenate the positive and negative data.
    x = data_pos + data_neg

    # Create a list of the sentiments for the text-data.
    # 1.0 is a positive sentiment, 0.0 is a negative sentiment.
    y = [1.0] * len(data_pos) + [0.0] * len(data_neg)

    return x, y


In [7]:
maybe_download_and_extract()

Data has apparently already been downloaded and unpacked.


# Loading data-

In [8]:
x_train_text, y_train=load_data(train=True)
x_test_text, y_test=load_data(train=False)

In [9]:
data_text=x_train_text+x_test_text #total data

In [10]:
tokenizer=Tokenizer(num_words=10000)  #Taking top 10000 words


In [11]:
tokenizer.fit_on_texts(data_text) #converted top 10000 textual words to number

In [12]:
x_train_tokens=tokenizer.texts_to_sequences(x_train_text) #applying on x_train words and converting it into number

In [13]:
np.array(x_train_tokens[1]) #view of x_train review 1 in number format

array([  38,   14,  744, 3506,   45,   75,   32, 1771,   15,  153,   18,
        110,    3, 1344,    5,  343,  143,   20,    1,  920,   12,   70,
        281, 1228,  395,   35,  115,  267,   36,  166,    5,  368,  158,
         38, 2058,   15,    1,  504,   88,   83,  101,    4,    1, 4339,
         14,   39,    3,  432, 1148,  136, 8697,   42,  177,  138,   14,
       2791,    1,  295,   20, 5276,  351,    5, 3029, 2310,    1,   38,
       8697,   43, 3611,   26,  365,    5,  127,   53,   20,    1, 2032,
          7,    7,   18,   48,   43,   22,   70,  358,    3, 2343,    5,
        420,   20,    1, 2032,   15,    3, 3346,  208,    1,   22,  281,
         66,   36,    3,  344,    1,  728,  730,    3, 3864, 1320,   20,
          1, 1543,    3, 1293,    2,  267,   22,  281, 2734,    5,   63,
         48,   44,   37,    5,   26, 4339,   12,    6, 2079,    7,    7,
       3425, 2891,   35, 4446,   35,  405,   14,  297,    3,  986,  128,
         35,   45,  267,    8,    1,  181,  366, 69

In [14]:
x_test_tokens=tokenizer.texts_to_sequences(x_test_text)

In [15]:
np.array(x_test_tokens[1])

array([ 291,  663,  164,  988, 6162, 1108,   53,   24, 2413, 2084,    1,
       3368,  182,   16,   11,  236, 2845, 2378,  449,   42,    1, 1154,
        580,  849,  117,    3,  186,  283, 5582,   36,   24, 4980,  952,
          5,  288,  450,   24, 6506,    8,   48,   13, 2199,   14,    1,
        811,  465,  123,  253,  145,   54,  326,    4, 5250,    2,  132,
       6826, 2378, 1473,   23,    3, 9865,    3, 2579,   88, 1022,  221,
          5, 1769,  928,   16, 5114,    2, 3673,  128,   18,   47,   85,
         11,   19,   13, 8928,   29,    1,  167,    7,    7,    1,   19,
        537,   16,   47, 1528,  615,  897,  816,    3,  318,    4,    1,
       1270,  615,  897,    4,    2, 5049,   18,  341, 1371,   15,   92,
         86,   31,    1, 1579,  527,  281,    1,  205, 1122,    5,    1,
       1154,  580,  849,  177, 1185,   53,   52,   69, 6162,  124,    3,
        331,  293,    2,  276,    3, 9513,   15, 1168, 3694,   10,  431,
          1, 2790, 9673, 4106,    4,    1,  205,   

In [16]:
#since the length of each review is not same so we have to pad and truncate them in order to get the same length 
#of the review so that it can be applied on the rnn and the embedding layer.
#Let us pad and truncate both the train and test data

In [17]:
num_tokens=[len(token) for token in x_train_tokens+x_test_tokens]
num_tokens=np.array(num_tokens)

In [18]:
num_tokens

array([127, 401, 134, ..., 253, 119, 352])

In [19]:
print(np.mean(num_tokens))
print(np.max(num_tokens))
#Such a large difference  ! 

221.27716
2209


In [20]:
max_tokens=np.mean(num_tokens)+2*np.std(num_tokens)
max_tokens=int(max_tokens)
max_tokens

544

In [21]:
#lets check how many reviews has word textlength<max tokens

count=0
for token in num_tokens:
    if((token)<max_tokens):
        count+=1
    else:
        continue
print(count)
token_whose_length_less_than_max_tokens=count/len(num_tokens)
token_whose_length_less_than_max_tokens

47265


0.9453

In [22]:
#95 percent of the document has the length < max_tokens
#let us pad the tokens to the fixed size input vecor for the GRU

In [23]:
x_train_pad=pad_sequences(x_train_tokens,maxlen=max_tokens,padding='pre',truncating='pre')
x_test_pad=pad_sequences(x_test_tokens,maxlen=max_tokens,padding='pre',truncating='pre')

In [24]:
x_train_pad[23457]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [25]:
model=Sequential()

In [26]:
model.add(Embedding(input_dim=10000,output_dim=8,input_length=max_tokens,name='layer_embedding'))

In [27]:
model.add(GRU(units=16,return_sequences=True))
model.add(GRU(units=16,return_sequences=True))
model.add(GRU(units=32,return_sequences=True))
model.add(GRU(units=64))
model.add(Dense(1,activation='sigmoid'))

In [28]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 544, 16)           1200      
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 16)           1584      
_________________________________________________________________
gru_2 (GRU)                  (None, 544, 32)           4704      
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                18624     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 106,177
Trainable params: 106,177
Non-trainable params: 0
_________________________________________________________________


In [32]:
%%time
model.fit(x_train_pad,y_train,validation_split=0.1,epochs=6,batch_size=64)
accuracy_calculated=model.evaluate(x_test_pad,y_test)
print("Accuracy: {0:.2%}".format(accuracy_calculated[1]))

Train on 22500 samples, validate on 2500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 86.79%
Wall time: 45min 43s


In [None]:
#Due to i3 processor it took a lot of time in my cpu .Will execute in the shorter span of time depending on the processor and ram 