In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf

from string import punctuation
from collections import Counter
import re

In [2]:
# Location of the movie-review CSV that the next cell loads.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# unchanged where the file exists at exactly this location; consider a
# configurable/relative data directory.
reviews_path = 'C:/Users/LENOVO/Desktop/movie_data.csv'

In [3]:
# Read the review CSV at `reviews_path` into a DataFrame; later cells use its
# 'review' column (text) and its column at position 1 (labels).
movie_data = pd.read_csv(reviews_path)

In [4]:
# Preview the first rows of the loaded data.
movie_data.head()

Unnamed: 0,review,sentiment
0,Nothing is fantastic! Simple as that! It's a f...,1
1,This really was a waste of time...the movie ha...,0
2,"This might sound weird, but I only got to see ...",0
3,Considering this film was released 8 years bef...,1
4,Very rarely does one come across an indie come...,1


In [5]:
# Preview the last rows of the loaded data (also reveals the row count via the index).
movie_data.tail()

Unnamed: 0,review,sentiment
49995,"I Sell the Dead is a big, sloppy horror comedy...",1
49996,I know this sounds odd coming from someone bor...,1
49997,OK I had higher hopes for this Carnosaur movie...,0
49998,"Laurence Fishburne is a fine actor, and deserv...",1
49999,I lived in Tokyo for 7 months. Knowing the rea...,1


In [17]:
# Tokenise every review and count how often each token occurs.
#
# Cleaning rule: each punctuation character is surrounded by spaces so that it
# becomes a token of its own, and the whole text is lower-cased.
counter = Counter()

cleaned_reviews = []
for review in movie_data['review']:
    text = "".join(c if c not in punctuation else " {} ".format(c) for c in review).lower()
    cleaned_reviews.append(text)

    counter.update(text.split())

# Write the cleaned texts back in one assignment, addressing the column by name:
# this does not depend on the column's position in the frame and avoids
# per-row writes into the column that is being iterated.
movie_data['review'] = cleaned_reviews

In [21]:
# word2int dictionary
# Map each token to an integer id by descending frequency: the most frequent
# token gets id 1 (id 0 is left unused here).
word2int = {
    token: rank
    for rank, (token, _count) in enumerate(counter.most_common(), start=1)
}

In [23]:
# int-list reviews
# Encode every review as the list of integer ids of its whitespace-separated tokens.
mapped_reviews = [
    [word2int[token] for token in text.split()]
    for text in movie_data['review']
]

In [27]:
# Every review is stored as exactly `sequence_length` word ids:
#   * fewer ids than that  -> left-padded with zeros
#   * more ids than that   -> only the last `sequence_length` ids are kept
sequence_length = 200

# Zero-filled (number of reviews, sequence_length) integer matrix that the
# next cell fills in row by row.
sequences = np.zeros((len(movie_data), sequence_length), dtype=int)

In [28]:
# Fill each row of `sequences` with the (at most) last `sequence_length` ids of
# the corresponding review, right-aligned so shorter reviews are left-padded
# with zeros.
for i, mapped_review in enumerate(mapped_reviews):
    tail = mapped_review[-sequence_length:]
    # An empty review must be skipped: `-0:` would address the whole row and
    # assigning an empty list to it raises a broadcast error. The row simply
    # stays all zeros.
    if tail:
        sequences[i, -len(tail):] = tail

In [31]:
# First 25000 rows -> training set; all remaining rows -> held-out set.
# Labels are read from the column at position 1 of `movie_data`.
X_train = sequences[:25000, :]
y_train = movie_data.iloc[:25000, 1].values

X_test = sequences[25000:, :]
y_test = movie_data.iloc[25000:, 1].values

# Show the shapes of all four arrays (features and labels of both splits).
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000, 200), (25000,), (25000, 200), (25000,))

In [33]:
# Split the held-out rows (25000 onward) into a validation set (first 12500 of
# them) and a test set (the rest).
# Both are sliced directly from `sequences` / `movie_data` instead of from
# X_test / y_test, so re-running this cell yields the same split rather than
# shrinking the test set again.
X_val = sequences[25000:37500]
y_val = movie_data.iloc[25000:37500, 1].values
X_test = sequences[37500:]
y_test = movie_data.iloc[37500:, 1].values

X_val.shape, y_val.shape, X_test.shape, y_test.shape

((12500, 200), (12500,), (12500, 200), (12500,))

In [49]:
def batch_generator(X, y=None, batch_size=64):
    """Yield consecutive, full-size mini-batches of ``X`` (and ``y`` if given).

    Only complete batches are produced: when ``len(X)`` is not a multiple of
    ``batch_size`` the trailing remainder is not yielded.

    Parameters
    ----------
    X : sliceable sequence
        Samples; anything supporting ``len`` and slice indexing.
    y : sliceable sequence, optional
        Targets aligned with ``X``; must have the same length.
    batch_size : int
        Number of samples per batch.

    Yields
    ------
    ``X[a:b]`` when ``y`` is None, otherwise the pair ``(X[a:b], y[a:b])``.

    Raises
    ------
    AssertionError
        On first iteration, if ``y`` is given and its length differs from ``X``.
    """
    with_targets = y is not None
    if with_targets:
        assert len(X) == len(y)

    full_batches = len(X) // batch_size
    # step through the start offsets of every complete batch
    for start in range(0, full_batches * batch_size, batch_size):
        stop = start + batch_size
        if with_targets:
            yield X[start:stop], y[start:stop]
        else:
            yield X[start:stop]

In [69]:
class SentimentRNN:
    """Many-to-one LSTM classifier for binary sentiment (TensorFlow 1.x graph API).

    The constructor builds the complete graph (embedding -> stacked LSTM cells
    with output dropout -> single-unit dense layer) inside its own ``tf.Graph``.
    ``train`` and ``predict`` each open a fresh session on that graph;
    ``predict`` restores weights from the latest checkpoint under ``./model/``.

    Parameters
    ----------
    word_size : int
        Number of rows of the embedding matrix; must be larger than the
        largest word id that will be fed in.
    embed_size : int
        Width of each embedding vector.
    lstm_size : int
        Number of units in every LSTM cell.
    num_layer : int
        Number of stacked LSTM cells.
    seq_length : int
        Number of word ids per input sequence.
    learning_rate : float
        Learning rate handed to the Adam optimizer.
    batch_size : int
        Fixed mini-batch size; the input placeholders are built with this
        exact leading dimension, so only full batches can be fed.
    """

    def __init__(self, word_size, embed_size=200, lstm_size=256, num_layer=1,
                 seq_length=200, learning_rate=1e-4, batch_size=32):
        # model hyper parameters
        self.word_size = word_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.num_layer = num_layer
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # build model graph
        self.g = tf.Graph()
        with self.g.as_default():
            #tf.set_random_seed(123)
            self.build()
            # saver and initializer are created after build() so that they
            # cover every variable in the graph
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        """Add placeholders, model, loss and optimizer to the current default graph.

        Tensors and ops are later addressed by name from ``train``/``predict``
        ('tf_x:0', 'tf_y:0', 'tf_keepprob:0', 'cost:0', 'train_op',
        'probabilities:0', 'labels:0'), so those names must not change.
        Sets ``self.initial_state`` and ``self.final_state``.
        """
        # placeholders for inputs (fixed batch size)
        tf_x = tf.placeholder(dtype=tf.int32,
                              shape=(self.batch_size, self.seq_length),
                              name='tf_x')
        tf_y = tf.placeholder(dtype=tf.float32,
                              shape=(self.batch_size,),
                              name='tf_y')
        tf_keepprob = tf.placeholder(dtype=tf.float32, shape=(),
                                     name='tf_keepprob')
        # embedding matrix, initialised uniformly in [-1, 1)
        W_embedding = tf.Variable(
            tf.random_uniform(shape=(self.word_size, self.embed_size), minval=-1, maxval=1),
            name='W_embedding')

        embed_x = tf.nn.embedding_lookup(W_embedding, tf_x, name='embed_x')

        # stacked LSTM cells, each wrapped with dropout on its outputs
        cells = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                                          output_keep_prob=tf_keepprob)
            for _ in range(self.num_layer)
        ])

        # define the initial state / rnn steps
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print("  << initial state > ", self.initial_state)

        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cell=cells, inputs=embed_x,
                                                           initial_state=self.initial_state)
        print("\n << lstm_output >> ", lstm_outputs)
        print("\n << final state >> ", self.final_state)

        # dense layer on the output of the last time step -> logits
        logits = tf.layers.dense(inputs=lstm_outputs[:, -1], units=1,
                                 activation=None, name='logits')

        # Drop only the trailing unit axis; an unrestricted squeeze would also
        # remove the batch axis when batch_size == 1.
        logits = tf.squeeze(logits, axis=1, name='logits_squeezed')
        print('\n  << logits     >> ', logits)

        # predictions -> prob. | labels
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        # Class labels come from thresholding the probability at 0.5; rounding
        # the raw logits would yield values outside {0, 1}.
        y_labels = tf.cast(tf.round(y_proba), dtype=tf.int32, name='labels')
        predictions = {
            'probabilities': y_proba,
            'labels': y_labels
        }
        print("\n << predictions  >> ", predictions)

        # cost function: mean sigmoid cross-entropy over the batch
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y,
                                                                      logits=logits),
                              name='cost')
        # optimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X, y, num_epochs):
        """Train from freshly initialised variables and write checkpoints.

        Parameters
        ----------
        X : array of word ids, sliced into batches of shape (batch_size, seq_length)
        y : array of labels aligned with ``X``
        num_epochs : int
            Number of passes over the data.

        Batches come from ``batch_generator``, which yields full batches only.
        Dropout keep-probability is 0.5 during training. A checkpoint is
        written to ``model/sentiment-rnn-<epoch>.ckpt`` every 10 epochs and
        after the final epoch.
        """
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)

            batch_total = 0
            for epoch in range(num_epochs):
                # reset cell & hidden states at the start of every epoch
                # NOTE(review): within an epoch the state left by one batch is
                # fed as the initial state of the next, although batches hold
                # different reviews — confirm this carry-over is intended.
                state = sess.run(self.initial_state)
                for x_batch, y_batch in batch_generator(X, y, batch_size=self.batch_size):
                    feed = {'tf_x:0': x_batch,
                            'tf_y:0': y_batch,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(['cost:0', 'train_op', self.final_state],
                                              feed_dict=feed)

                    # report the training loss every 20 batches; batch_total
                    # is the number of batches processed so far
                    batch_total += 1
                    if batch_total % 20 == 0:
                        print("Epoch {:3d}, Iterations {:4d}, Training loss: {:.4f}".format(epoch+1,
                                                                                            batch_total,
                                                                                            loss))
                # save every 10 epochs, and always after the last epoch so a
                # run whose length is not a multiple of 10 is not lost
                if (epoch + 1) % 10 == 0 or epoch + 1 == num_epochs:
                    # the saver does not create the target directory itself
                    tf.gfile.MakeDirs('model')
                    self.saver.save(sess,
                                    'model/sentiment-rnn-{}.ckpt'.format(epoch+1))

    def predict(self, X, predict_proba=False):
        """Predict with the weights of the latest checkpoint in ``./model/``.

        Parameters
        ----------
        X : array of word ids, sliced into batches of shape (batch_size, seq_length)
        predict_proba : bool
            If True return sigmoid probabilities, otherwise integer class labels.

        Returns
        -------
        numpy.ndarray
            Concatenated per-batch predictions. Because only full batches are
            processed, samples beyond the last full batch get no prediction
            and the result can be shorter than ``X``.

        Raises
        ------
        ValueError
            If no checkpoint is found under ``./model/``.
        """
        pred = []
        # tensor to fetch depends only on the requested output kind
        fetch_name = 'probabilities:0' if predict_proba else 'labels:0'
        with tf.Session(graph=self.g) as sess:
            # restore latest model
            checkpoint = tf.train.latest_checkpoint('./model/')
            if checkpoint is None:
                raise ValueError("No checkpoint found in './model/'; call train() first.")
            self.saver.restore(sess, checkpoint)

            # init model states
            state = sess.run(self.initial_state)

            for x_batch in batch_generator(X, y=None, batch_size=self.batch_size):
                # keep-probability 1.0 disables dropout at inference time
                feed = {'tf_x:0': x_batch,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: state}

                y_pred, state = sess.run([fetch_name, self.final_state],
                                         feed_dict=feed)
                pred.append(y_pred)

        return np.concatenate(pred)

In [70]:
model = SentimentRNN(word_size=max(word2int.values())+1)

  << initial state >  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(32, 256) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(32, 256) dtype=float32>),)

 << lstm_output >>  Tensor("rnn/transpose_1:0", shape=(32, 200, 256), dtype=float32)

 << final state >>  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(32, 256) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(32, 256) dtype=float32>),)

  << logits     >>  Tensor("logits_squeezed:0", shape=(32,), dtype=float32)

 << predictions  >>  {'labels': <tf.Tensor 'labels:0' shape=(32,) dtype=int32>, 'probabilities': <tf.Tensor 'probabilities:0' shape=(32,) dtype=float32>}
