In [2]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

# Notebook display limits: wide text cells, but only a few rows and columns
# per DataFrame preview.
for _option, _value in (
    ('display.max_colwidth', 500),
    ('display.max_rows', 10),
    ('display.max_columns', 4),
):
    pd.set_option(_option, _value)

## Importing Dataset

In [3]:
# Read the train / validation / test splits, in that order, from the Kaggle
# input directory.
data_train, data_valid, data_test = map(
    pd.read_csv,
    (
        "../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv",
        "../input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv",
        "../input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv",
    ),
)

## Data Preprocessing

In [10]:
def preprocess_data(data = None, stopwords = None, rm_stopwords = True, max_word_len = 30):
    """
    Clean, tokenize and lemmatize the 'text' column of a DataFrame.

    Each text goes through these steps, in order: lower-casing, URL removal,
    removal of over-long character blobs, optional stopword removal,
    punctuation removal, collapsing of repeated whitespace, tokenization with
    `word_tokenize` and lemmatization with `WordNetLemmatizer`.

    Parameters
    ----------
    data : pandas DataFrame
        Must contain a 'text' column. The frame is only read, never modified,
        so the call can be repeated on the same frame with identical results.
    stopwords : iterable of str, optional
        Words dropped when `rm_stopwords` is True. `None` is treated as an
        empty collection.
    rm_stopwords : bool
        Whether to drop the words listed in `stopwords`.
    max_word_len : int
        Whitespace-free runs of at least this many characters are removed.

    Returns
    -------
    list of str
        One cleaned string per row: the lemmatized tokens joined by single
        spaces.

    Raises
    ------
    TypeError
        If `data` is None.
    ValueError
        If `max_word_len` is smaller than 1.
    """
    if data is None:
        raise TypeError("preprocess_data() needs a DataFrame with a 'text' column, got None")
    if max_word_len < 1:
        raise ValueError("max_word_len must be a positive integer")

    # Compiled once and reused for every row.
    # The part before www/http is optional, so a URL that starts its token
    # (e.g. "http://site.com") is removed as well as one embedded in a token.
    url_pattern = re.compile(r'([^\s]*www[^\s]+)|([^\s]*https?[^\s]+)')
    long_word_pattern = re.compile(r'[^\s]{%d,}' % max_word_len)
    punctuation_pattern = re.compile(r'[^\w\s]+')
    space_run_pattern = re.compile(r'\s{2,}')

    # A set gives constant-time membership tests for every word of every row.
    stopword_set = frozenset(stopwords) if stopwords is not None else frozenset()

    def clean_text(raw):
        """Apply the regex/stopword cleaning steps to one raw text value."""
        text = str(raw).lower()
        text = url_pattern.sub(' ', text)
        text = long_word_pattern.sub(' ', text)
        if rm_stopwords:
            # Runs before punctuation is stripped, so stopwords that contain
            # an apostrophe still match, while a stopword glued to
            # punctuation (e.g. "the,") is kept.
            text = " ".join(word for word in text.split() if word not in stopword_set)
        text = punctuation_pattern.sub('', text)
        return space_run_pattern.sub(' ', text)

    lemmatizer = WordNetLemmatizer()
    processed = []
    # Iterating the column reads the values without writing back to `data`.
    for raw in data['text']:
        tokens = word_tokenize(clean_text(raw))
        processed.append(" ".join(lemmatizer.lemmatize(token) for token in tokens))

    return processed

In [11]:
# English stopword list from the NLTK stopwords corpus; handed to
# preprocess_data for every split.
stopword_list = stopwords.words('english')

In [29]:
# Run the same cleaning pipeline over the three splits with one shared
# stopword list.
X, X_valid, X_test = (
    preprocess_data(split_frame, stopword_list)
    for split_frame in (data_train, data_valid, data_test)
)

In [30]:
# Keep references to the cleaned text lists: X, X_valid and X_test are rebound
# to integer sequences further down.
Xd, X_validd, X_testd = X, X_valid, X_test

In [None]:
"""
X = Xd
X_valid = X_validd
X_test = X_testd
"""

In [31]:
# Vocabulary cap shared by the tokenizer and, later, the embedding layer.
size_of_vocabulary = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=size_of_vocabulary,
    split=' ',
)
# The word index is built from the training text only.
tokenizer.fit_on_texts(X)

In [32]:
# Replace each split's cleaned text with integer sequences from the fitted
# tokenizer. The names are rebound, so this cell expects text, not sequences.
X, X_valid, X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X, X_valid, X_test)
)

In [34]:
# Pad the training sequences first: with no maxlen given, the resulting width
# is taken from the training data itself.
X = tf.keras.preprocessing.sequence.pad_sequences(X, dtype='int', padding='post')

# Validation and test sequences are then forced to the training width.
X_valid, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=X.shape[1],
        dtype='int',
        padding='post',
    )
    for sequences in (X_valid, X_test)
)

In [35]:
# Display the padded training matrix; trailing zeros come from padding='post'.
X

array([[1919,  763, 8093, ...,    0,    0,    0],
       [ 143,    2,  171, ...,    0,    0,    0],
       [  19,   32,  684, ...,    0,    0,    0],
       ...,
       [ 214,  188,    2, ...,    0,    0,    0],
       [  23,  188, 2476, ...,    0,    0,    0],
       [  12,    3,    7, ...,    0,    0,    0]])

In [36]:
# Shape of the padded training matrix; its second entry is reused as the
# sequence length for the other splits and for the model.
X.shape

(40000, 1165)

In [37]:
# Pull the 'label' column of each split into a float64 NumPy array.
y, y_valid, y_test = (
    np.array(split_frame['label'], dtype='float64')
    for split_frame in (data_train, data_valid, data_test)
)

In [42]:
def get_bidirectional_rnn(input_dim = None,output_dim = None,input_shape = None,bidirectional_layers = 3):
    """
    Build and compile a stacked bidirectional SimpleRNN binary classifier.

    Architecture: Embedding -> `bidirectional_layers` x Bidirectional(SimpleRNN)
    -> GlobalMaxPooling1D -> Dense(64, relu) -> Dense(32, relu)
    -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int
        Vocabulary size given to the Embedding layer.
    output_dim : int
        Size of each embedding vector.
    input_shape : sequence
        Shape of the padded input matrix; only input_shape[1] (the sequence
        length) is used.
    bidirectional_layers : int
        Number of stacked Bidirectional(SimpleRNN) layers. Defaults to 3.

    Returns
    -------
    The compiled tf.keras.Sequential model (Adam with gradient value clipping,
    binary cross-entropy loss, binary accuracy metric).
    """
    model = tf.keras.Sequential()
    model.add(
        tf.keras.layers.Embedding(
            input_dim = input_dim,
            output_dim = output_dim,
            input_length = input_shape[1],
        )
    )

    # Every recurrent layer returns its full output sequence: the next stacked
    # recurrent layer needs a sequence as input, and the pooling layer below
    # reduces over the time axis.
    for _ in range(bidirectional_layers):
        model.add(
            tf.keras.layers.Bidirectional(
                tf.keras.layers.SimpleRNN(
                    units = 16,
                    # tanh is kept on purpose; the original author's note
                    # here advises against relu.
                    activation = "tanh",
                    dropout = 0.2,
                    return_sequences = True,
                ),
                merge_mode = "concat",
            )
        )

    model.add(tf.keras.layers.GlobalMaxPooling1D())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(
        # Gradient clipping by value.
        optimizer = tf.keras.optimizers.Adam(clipvalue=0.005),
        loss = tf.keras.losses.BinaryCrossentropy(),
        # Given as a list, the documented form of the `metrics` argument.
        metrics = [tf.keras.metrics.BinaryAccuracy()],
    )

    return model

In [43]:
# The tokenizer's vocabulary cap sets the embedding input size; X.shape
# supplies the padded sequence length.
model_bidirectional_rnn = get_bidirectional_rnn(
    input_dim=size_of_vocabulary,
    output_dim=128,
    input_shape=X.shape,
)

In [44]:
# Print the per-layer output shapes and parameter counts.
model_bidirectional_rnn.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1165, 128)         1280000   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 1165, 32)          4640      
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 1165, 32)          1568      
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 1165, 32)          1568      
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_10 (Dense)             (None, 32)               

In [45]:
# Stop training once val_loss has gone `patience` epochs without improving by
# at least `min_delta`.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.1,
    patience=3,
    mode="min"
)
# NOTE(review): X_valid / y_valid are prepared earlier in the notebook but are
# not used here; validation data is carved out of the training set through
# validation_split instead. Consider validation_data=(X_valid, y_valid).
history_bidir_rnn = model_bidirectional_rnn.fit(
    x = X,
    y = y,
    batch_size = 256,
    validation_split = 0.2,
    epochs = 5,
    # The callback only takes effect when it is handed to fit().
    callbacks = [early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Observations
1. Model Architecture  
  * Using more than 3 Bidirectional layers almost always leads to a stagnant loss, with accuracy stuck at or below 50%.
  * No amount of gradient clipping was able to fix this.
  * Another possible solution is to use fewer units in the RNN layer, but then the model does not learn enough features for a large vocabulary size.

2. Overfit
  * Model Overfits the training dataset.
  * Even with dropout, the model is unable to learn all the features of the dataset.
  * A small amount of error can be attributed to the ambiguity of some comments, where it is difficult to assess the sentiment of the text, even by humans.
  * Future work could experiment with a larger architecture and with recurrent dropout.
3. Gradient Clipping
  * Gradient clipping is important when training RNNs.
  * Configured on the optimizer, it clips gradients that are too large, which guards against exploding gradients; it does not address vanishing gradients.