<a href="https://colab.research.google.com/github/iamdsc/deep_learning/blob/master/9_rnn_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sentiment Analysis on IMDb movie reviews using Multilayer RNN

In [4]:
# Downloading the data from github
!wget https://github.com/iamdsc/sentiment-analysis/raw/master/movie_data.csv

--2019-02-27 01:10:41--  https://github.com/iamdsc/sentiment-analysis/raw/master/movie_data.csv
Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/iamdsc/sentiment-analysis/master/movie_data.csv [following]
--2019-02-27 01:10:41--  https://raw.githubusercontent.com/iamdsc/sentiment-analysis/master/movie_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65862309 (63M) [text/plain]
Saving to: ‘movie_data.csv.3’


2019-02-27 01:10:42 (192 MB/s) - ‘movie_data.csv.3’ saved [65862309/65862309]



In [5]:
# Preparing the data
!pip install pyprind
import pyprind
import pandas as pd
from string import punctuation
import re
import numpy as np

df=pd.read_csv('movie_data.csv',encoding='utf-8')
df.head()

Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.2


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [6]:
# Preprocessing the data:
# Separate words and count each word's occurrence
from collections import Counter

counts=Counter()
pbar=pyprind.ProgBar(len(df['review']),title='Counting word occurences')
punct=set(punctuation)  # constant-time membership test per character
cleaned_reviews=[]
for review in df['review']:
  # Surround every punctuation mark with spaces so it becomes its own token,
  # then lowercase the whole review.
  text = ''.join([' '+c+' ' if c in punct else c for c in review]).lower()
  cleaned_reviews.append(text)
  pbar.update()
  counts.update(text.split())

# Write the cleaned text back with a single positional column assignment.
# A per-row df.loc[i,'review'] write is far slower and is only correct when
# the index labels happen to be exactly 0..n-1.
df['review']=cleaned_reviews


Counting word occurences
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:22


In [7]:
## Create a mapping
## Map each unique word to an integer
# Vocabulary ordered from the most to the least frequent token
word_counts=sorted(counts,key=lambda word: counts[word],reverse=True)
print(word_counts[:5])
# Ids start at 1 and follow the frequency ranking
word_to_int=dict(zip(word_counts,range(1,len(word_counts)+1)))

mapped_reviews=[]
pbar=pyprind.ProgBar(len(df['review']),title='Map reviews to ints')
for review in df['review']:
  tokens=review.split()
  # Replace every token of the review by its integer id
  mapped_reviews.append([word_to_int[token] for token in tokens])
  pbar.update()

Map reviews to ints


['the', '.', ',', 'and', 'a']


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


In [0]:
## Define same length sequences
## if sequence length < 200: left-pad with zeros
## if sequence length > 200: use last 200 elements

sequence_length = 200 # known as T in our RNN formulas
sequences = np.zeros((len(mapped_reviews),sequence_length),dtype=int)

for i, row in enumerate(mapped_reviews):
  if not row:
    # An empty review stays all zeros. Without this guard the slice
    # sequences[i,-0:] selects the whole row and assigning an empty array
    # to it raises a broadcasting ValueError.
    continue
  # Keep at most the last `sequence_length` ids and right-align them
  review_arr=np.array(row[-sequence_length:])
  sequences[i,-len(review_arr):] = review_arr


In [9]:
# Preview the first 15 padded rows
sequences[:15]

array([[   37,  1956,  1801, ...,    85,    34,  1309],
       [    4,  7431,   256, ...,     1,  1980,    41],
       [  113,   587,     7, ...,   175,    29,     2],
       ...,
       [    3,  3353,     4, ...,  4019,    29,     2],
       [ 1636,     7,    92, ...,     5, 14785,     2],
       [    0,     0,     0, ...,   473,    29,     2]])

In [0]:
# perform train-test split
# The first 25000 reviews are used for training, the remainder for testing.
# Labels are sliced positionally on the underlying array: a label-based
# df.loc[:25000] slice includes its end label and would return 25001 labels
# for the 25000 training rows.
n_train = 25000
labels = df['sentiment'].values
X_train = sequences[:n_train,:]
y_train = labels[:n_train]
X_test = sequences[n_train:,:]
y_test = labels[n_train:]

In [0]:
np.random.seed(123)

# Define a function to generate mini batches:
def create_batch_generator(x,y=None,batch_size=64):
  """Yield consecutive, equally sized mini-batches of `x` (and `y`).

  Only whole batches are produced: trailing samples that do not fill a
  complete batch are dropped.

  Args:
    x: sliceable sequence of samples.
    y: optional sliceable sequence of labels aligned with `x`.
    batch_size: number of samples per batch.

  Yields:
    `x_batch` when `y` is None, otherwise the tuple `(x_batch, y_batch)`.
  """
  usable=(len(x)//batch_size)*batch_size
  x=x[:usable]

  if y is None:
    # Unlabelled data: emit the feature batches only
    for start in range(0,len(x),batch_size):
      yield x[start:start+batch_size]
    return

  y=y[:usable]
  for start in range(0,len(x),batch_size):
    stop=start+batch_size
    yield x[start:stop], y[start:stop]

## Building an RNN Model

In [0]:
import tensorflow as tf

class SentimentRNN(object):
  """Stacked-LSTM binary sentiment classifier built in its own TF1 graph.

  The graph is created once in the constructor. `train` runs optimisation and
  writes checkpoints under 'model/'; `predict` restores the latest checkpoint
  from './model/' and scores new sequences.
  """

  def __init__(self,n_words,seq_len=200,lstm_size=256,num_layers=1,batch_size=64,learning_rate=0.0001,embed_size=200):
    """Store the hyper-parameters and build the computation graph.

    Args:
      n_words: number of rows of the embedding matrix (vocabulary size).
      seq_len: length of every input sequence.
      lstm_size: number of hidden units per LSTM layer.
      num_layers: number of stacked LSTM layers.
      batch_size: fixed batch size baked into the placeholders.
      learning_rate: learning rate passed to the Adam optimizer.
      embed_size: dimensionality of the word embeddings.
    """
    self.n_words=n_words
    self.seq_len=seq_len
    self.lstm_size=lstm_size  #number of hidden units
    self.num_layers=num_layers
    self.batch_size=batch_size
    self.learning_rate=learning_rate
    self.embed_size=embed_size

    self.g=tf.Graph()
    with self.g.as_default():
      tf.set_random_seed(123)
      self.build()
      self.saver=tf.train.Saver()
      self.init_op=tf.global_variables_initializer()

  # the build method
  def build(self):
    """Create placeholders, embedding, LSTM stack, output layer, loss and train op.

    Tensors and ops are given explicit names ('tf_x', 'tf_y', 'tf_keepprob',
    'cost', 'train_op', 'probabilities', 'labels') because `train` and
    `predict` fetch and feed them by name.
    """
    # Define the placeholders
    tf_x=tf.placeholder(tf.int32,shape=(self.batch_size,self.seq_len),name='tf_x')
    # One label per sample; the shape is written as a real 1-tuple
    tf_y=tf.placeholder(tf.float32,shape=(self.batch_size,),name='tf_y')
    tf_keepprob=tf.placeholder(tf.float32,name='tf_keepprob')

    # Create the embedding layer
    embedding=tf.Variable(tf.random_uniform((self.n_words,self.embed_size),minval=-1,maxval=1),name='embedding')
    embed_x=tf.nn.embedding_lookup(embedding,tf_x,name='embeded_x')

    # Define LSTM cell and stack them together; dropout is applied to each layer's output
    cells=tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(self.lstm_size),output_keep_prob=tf_keepprob) for i in range(self.num_layers)])

    # Define the initial state
    self.initial_state=cells.zero_state(self.batch_size,tf.float32)
    print(' << initial state >> ',self.initial_state)

    lstm_outputs, self.final_state=tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)

    # lstm_outputs shape: [batch_size,max_time,cells.output_size]
    print('\n <<lstm_output >> ',lstm_outputs)
    print('\n <<final_state >> ',self.final_state)

    # Classify from the output of the last time step only
    logits=tf.layers.dense(inputs=lstm_outputs[:,-1],units=1,activation=None,name='logits')
    # Drop only the trailing unit axis; an unrestricted squeeze would also
    # remove the batch axis when batch_size is 1.
    logits=tf.squeeze(logits, axis=1, name='logits_squeezed')
    print('\n << logits >> ',logits)

    y_proba=tf.nn.sigmoid(logits,name='probabilities')
    predictions={'probabilties':y_proba,'labels':tf.cast(tf.round(y_proba),tf.int32,name='labels')}
    print('\n << predictions >> ',predictions)

    # Define the cost function
    cost=tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y,logits=logits),name='cost')

    # Define the optimizer
    optimizer=tf.train.AdamOptimizer(self.learning_rate)
    train_op=optimizer.minimize(cost,name='train_op')

  def train(self, X_train, y_train, num_epochs):
    """Train the network and checkpoint it after every 10th epoch.

    Args:
      X_train: integer sequences, sliced into batches of `self.batch_size`.
      y_train: labels aligned with `X_train`.
      num_epochs: number of passes over the training data.

    The LSTM state is reset to zeros at the start of every epoch and carried
    from one batch to the next within an epoch. The loss is printed every
    20 iterations.
    """
    with tf.Session(graph=self.g) as sess:
      sess.run(self.init_op)
      iteration=1
      for epoch in range(num_epochs):
        state=sess.run(self.initial_state)
        for batch_x,batch_y in create_batch_generator(X_train,y_train,self.batch_size):
          feed={'tf_x:0':batch_x,'tf_y:0':batch_y,'tf_keepprob:0':0.5,self.initial_state:state}
          loss,_,state=sess.run(['cost:0','train_op',self.final_state],feed_dict=feed)
          if iteration % 20 == 0:
            print('Epoch: %d/%d Iteration: %d | Train loss: %.5f' % (epoch+1,num_epochs,iteration,loss))
          iteration+=1
        # Checkpoint once per qualifying epoch, after its last batch; saving
        # inside the batch loop rewrote the checkpoint on every single batch.
        if (epoch+1)%10 == 0:
          self.saver.save(sess,'model/sentiment-%d.ckpt'%epoch)

  def predict(self,X_data,return_proba=False):
    """Score `X_data` with the latest checkpoint found in './model/'.

    Args:
      X_data: integer sequences; only complete batches of `self.batch_size`
        are evaluated, so trailing samples that do not fill a batch are
        not scored.
      return_proba: when True return sigmoid probabilities, otherwise the
        rounded integer class labels.

    Returns:
      A 1-D numpy array concatenating the per-batch results.
    """
    # Pick the tensor to fetch once instead of branching inside the loop
    fetch_name='probabilities:0' if return_proba else 'labels:0'
    preds=[]
    with tf.Session(graph=self.g) as sess:
      self.saver.restore(sess,tf.train.latest_checkpoint('./model/'))
      test_state=sess.run(self.initial_state)
      for batch_x in create_batch_generator(X_data, None, batch_size=self.batch_size):
        # Dropout is disabled at inference time (keep probability 1.0)
        feed = {'tf_x:0' : batch_x,'tf_keepprob:0' : 1.0,self.initial_state : test_state}
        pred,test_state=sess.run([fetch_name,self.final_state],feed_dict=feed)
        # Collect the result for both modes; previously probabilities were
        # computed but never appended, so return_proba=True always failed.
        preds.append(pred)

    return np.concatenate(preds)

In [15]:
# Instantiating the SentimentRNN Class
# Vocabulary size: largest word id plus one
n_words=max(word_to_int.values())+1

rnn=SentimentRNN(
  n_words=n_words,
  seq_len=sequence_length,
  embed_size=256,
  lstm_size=128,
  num_layers=1,
  batch_size=100,
  learning_rate=0.001,
)

# Train the model
rnn.train(X_train, y_train, num_epochs=40)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
 << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `

In [24]:
# Performing prediction

preds=rnn.predict(X_test)
# Compare against as many labels as predictions were returned
y_true=y_test[:len(preds)]
n_correct=np.sum(preds==y_true)
print('Test Acc.: %.3f'%(100*n_correct/len(y_true)))


INFO:tensorflow:Restoring parameters from ./model/sentiment-39.ckpt
Test Acc.: 85.716
