In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from string import punctuation
from collections import Counter
import re
import os

  from ._conv import register_converters as _register_converters


In [2]:
reviews_path = 'C:/Users/p0ng5/OneDrive/movie_data.csv'

In [3]:
#reviews_path = 'C:/Users/LENOVO/Desktop/movie_data.csv'

In [4]:
movie_data = pd.read_csv(reviews_path)

In [5]:
movie_data.head()

Unnamed: 0,review,sentiment
0,Nothing is fantastic! Simple as that! It's a f...,1
1,This really was a waste of time...the movie ha...,0
2,"This might sound weird, but I only got to see ...",0
3,Considering this film was released 8 years bef...,1
4,Very rarely does one come across an indie come...,1


In [6]:
movie_data.tail()

Unnamed: 0,review,sentiment
49995,"I Sell the Dead is a big, sloppy horror comedy...",1
49996,I know this sounds odd coming from someone bor...,1
49997,OK I had higher hopes for this Carnosaur movie...,0
49998,"Laurence Fishburne is a fine actor, and deserv...",1
49999,I lived in Tokyo for 7 months. Knowing the rea...,1


In [7]:
# count unique words
counter = Counter()

# clean text
for i, review in enumerate(movie_data['review']):
    
    text = "".join(c if c not in punctuation else " {} ".format(c) for c in review).lower()
    movie_data.iloc[i, 0] = text
    
    counter.update(text.split())

In [8]:
# word2int dictionary
word2int = {w:i for i, w in enumerate(sorted(counter, key=counter.get, reverse=True), start=1)}

In [9]:
# int-list reviews
mapped_reviews = []
for review in movie_data['review']:
    mapped_reviews.append([word2int[word] for word in review.split()])

In [10]:
sequence_length = 200

# if length < sequence_length : left padd with zeros
# if length > sequence_length : take last 'sequence_length' elements

# padded sequence
sequences = np.zeros(shape=(len(movie_data), sequence_length), dtype=int)

In [11]:
for i, mapped_review in enumerate(mapped_reviews):
    n = len(mapped_review)
    if n < sequence_length:
        sequences[i, -n:] = mapped_review
    else:
        sequences[i, :] = mapped_review[-sequence_length:]

In [12]:
X_train = sequences[:25000, :]
y_train = movie_data.iloc[:25000, 1].values

X_test = sequences[25000:, :]
y_test = movie_data.iloc[25000:, 1].values

X_train.shape, y_train.shape, X_test.shape, y_train.shape

((25000, 200), (25000,), (25000, 200), (25000,))

In [13]:
X_val = X_test[:12500]
y_val = y_test[:12500]
X_test = X_test[12500:]
y_test = y_test[12500:]

X_val.shape, y_val.shape, X_test.shape, y_test.shape

((12500, 200), (12500,), (12500, 200), (12500,))

In [14]:
def batch_generator(X, y=None, batch_size=64):
    if y is not None:
        assert len(X) == len(y)
    n_batch = len(X) // batch_size
    for i in range(n_batch):
        a = i*batch_size
        b = (i+1)*batch_size
        
        if y is not None:
            yield X[a:b], y[a:b]
        else:
            yield X[a:b]

In [2]:
class SentimentRNN:
    
    def __init__(self, word_size, embed_size=200, lstm_size=256, num_layer=1,
                seq_length=200, learning_rate=1e-4, batch_size=32):
        # model hyper parameters
        self.word_size = word_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.num_layer = num_layer
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        
        # build model graph
        self.g = tf.Graph()
        with self.g.as_default():
            #tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
    
    def build(self):
        # placeholder for inputs
        tf_x = tf.placeholder(dtype=tf.int32, 
                              shape=(self.batch_size, self.seq_length), 
                              name='tf_x')
        tf_y = tf.placeholder(dtype=tf.float32, 
                              shape=(self.batch_size), 
                              name='tf_y')
        tf_keepprob = tf.placeholder(dtype=tf.float32, shape=(), 
                                     name='tf_keepprob')
        # embedding vector
        W_embedding = tf.Variable(
            tf.random_uniform(shape=(self.word_size, self.embed_size), minval=-1, maxval=1),
            name='W_embedding')
        
        embed_x = tf.nn.embedding_lookup(W_embedding, tf_x, name='embed_x')
        
        # create rnn cell
        cells = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                                         output_keep_prob=tf_keepprob)
                                          for i in range(self.num_layer)
        ])
        
        # define the initial state/ rnn steps
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print("  << initial state > ", self.initial_state)
        
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cell=cells, inputs=embed_x, 
                                                           initial_state=self.initial_state)
        print("\n << lstm_output >> ", lstm_outputs)
        print("\n << final state >> ", self.final_state)
        
        # dense layer -> logits
        logits = tf.layers.dense(inputs=lstm_outputs[:, -1], units=1, 
                                 activation=None, name='logits')
        
        logits = tf.squeeze(logits, name='logits_squeezed')
        print('\n  << logits     >> ', logits)
        
        # predictions -> prob. | labels
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        y_labels = tf.cast(tf.round(y_proba), dtype=tf.int32, name='labels')
        predictions = {
            'probabilities': y_proba,
            'labels': y_labels
        }
        print("\n << predictions  >> ", predictions)
        
        # cost function
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y, 
                                                                      logits=logits), 
                              name='cost')
        # optimizer
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')
    
    def train(self, X, y, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            
            batch_total = 0
            for epoch in range(num_epochs):
                # reset cell&hidden states
                state = sess.run(self.initial_state)
                for x_batch, y_batch in batch_generator(X, y, batch_size=self.batch_size):
                    feed = {'tf_x:0': x_batch,
                            'tf_y:0': y_batch,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(['cost:0', 'train_op', self.final_state], 
                                              feed_dict=feed)
                    
                    # update training every 20 batches
                    batch_total += 1
                    if (batch_total+1) % 20 == 0:
                        print("Epoch {:3d}, Iterations {:4d} | Train loss: {:.4f}".format(epoch+1, 
                                                                                            batch_total+1,
                                                                                            loss))
                # save every 10 epochs                                                                          
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess, 
                                    'model/sentiment-rnn-{}.ckpt'.format(epoch+1))
    
    def predict(self, X, predict_proba=False):
        pred = []
        with tf.Session(graph=self.g) as sess:
            # restore latest model
            self.saver.restore(sess, 
                               tf.train.latest_checkpoint('./model/'))
            
            # init model states
            state = sess.run(self.initial_state)
            
            for x_batch in batch_generator(X, y=None, batch_size=self.batch_size):
                feed = {'tf_x:0': x_batch,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: state}
                
                if predict_proba:
                    y_pred, state = sess.run(['probabilities:0', self.final_state], 
                                           feed_dict=feed)
                else:
                    y_pred, state = sess.run(['labels:0', self.final_state], 
                                            feed_dict=feed)
                pred.append(y_pred)
                
        return np.concatenate(pred)

In [3]:
sequence_length = 200
word_size = max(word2int.values()) + 1
embed_size = 256
hidden_size = 128
n_layer = 1
batch_size = 100
learning_rate = 0.001

model = SentimentRNN(word_size=word_size, embed_size=embed_size, lstm_size=hidden_size, 
                     num_layer=n_layer, seq_length=sequence_length, batch_size=batch_size, 
                     learning_rate=learning_rate)


NameError: name 'word2int' is not defined

In [24]:
model.train(X_train, y_train, num_epochs=40)

Epoch   1, Iterations   20 | Train loss: 0.6689
Epoch   1, Iterations   40 | Train loss: 0.5714
Epoch   1, Iterations   60 | Train loss: 0.6164
Epoch   1, Iterations   80 | Train loss: 0.6584
Epoch   1, Iterations  100 | Train loss: 0.5446
Epoch   1, Iterations  120 | Train loss: 0.4987
Epoch   1, Iterations  140 | Train loss: 0.4725
Epoch   1, Iterations  160 | Train loss: 0.4996
Epoch   1, Iterations  180 | Train loss: 0.5622
Epoch   1, Iterations  200 | Train loss: 0.5063
Epoch   1, Iterations  220 | Train loss: 0.4509
Epoch   1, Iterations  240 | Train loss: 0.4027
Epoch   2, Iterations  260 | Train loss: 0.5039
Epoch   2, Iterations  280 | Train loss: 0.3833
Epoch   2, Iterations  300 | Train loss: 0.4192
Epoch   2, Iterations  320 | Train loss: 0.4525
Epoch   2, Iterations  340 | Train loss: 0.3851
Epoch   2, Iterations  360 | Train loss: 0.2664
Epoch   2, Iterations  380 | Train loss: 0.2420
Epoch   2, Iterations  400 | Train loss: 0.3306
Epoch   2, Iterations  420 | Train loss:

Epoch  14, Iterations 3440 | Train loss: 0.0023
Epoch  14, Iterations 3460 | Train loss: 0.0097
Epoch  14, Iterations 3480 | Train loss: 0.0064
Epoch  14, Iterations 3500 | Train loss: 0.0044
Epoch  15, Iterations 3520 | Train loss: 0.0028
Epoch  15, Iterations 3540 | Train loss: 0.0016
Epoch  15, Iterations 3560 | Train loss: 0.0060
Epoch  15, Iterations 3580 | Train loss: 0.0148
Epoch  15, Iterations 3600 | Train loss: 0.0145
Epoch  15, Iterations 3620 | Train loss: 0.0009
Epoch  15, Iterations 3640 | Train loss: 0.0011
Epoch  15, Iterations 3660 | Train loss: 0.0178
Epoch  15, Iterations 3680 | Train loss: 0.0008
Epoch  15, Iterations 3700 | Train loss: 0.0006
Epoch  15, Iterations 3720 | Train loss: 0.0062
Epoch  15, Iterations 3740 | Train loss: 0.0729
Epoch  16, Iterations 3760 | Train loss: 0.0598
Epoch  16, Iterations 3780 | Train loss: 0.0007
Epoch  16, Iterations 3800 | Train loss: 0.0042
Epoch  16, Iterations 3820 | Train loss: 0.0054
Epoch  16, Iterations 3840 | Train loss:

Epoch  28, Iterations 6860 | Train loss: 0.0004
Epoch  28, Iterations 6880 | Train loss: 0.1193
Epoch  28, Iterations 6900 | Train loss: 0.0111
Epoch  28, Iterations 6920 | Train loss: 0.0125
Epoch  28, Iterations 6940 | Train loss: 0.0076
Epoch  28, Iterations 6960 | Train loss: 0.1508
Epoch  28, Iterations 6980 | Train loss: 0.0128
Epoch  28, Iterations 7000 | Train loss: 0.0163
Epoch  29, Iterations 7020 | Train loss: 0.0052
Epoch  29, Iterations 7040 | Train loss: 0.0028
Epoch  29, Iterations 7060 | Train loss: 0.0023
Epoch  29, Iterations 7080 | Train loss: 0.0222
Epoch  29, Iterations 7100 | Train loss: 0.0033
Epoch  29, Iterations 7120 | Train loss: 0.0136
Epoch  29, Iterations 7140 | Train loss: 0.0175
Epoch  29, Iterations 7160 | Train loss: 0.0210
Epoch  29, Iterations 7180 | Train loss: 0.0033
Epoch  29, Iterations 7200 | Train loss: 0.0223
Epoch  29, Iterations 7220 | Train loss: 0.0043
Epoch  29, Iterations 7240 | Train loss: 0.0011
Epoch  30, Iterations 7260 | Train loss:

In [25]:
# val accuracy
val_preds = model.predict(X_val)

INFO:tensorflow:Restoring parameters from ./model/sentiment-rnn-40.ckpt


In [21]:
y_val

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [None]:
# tensorboard

In [None]:
model = SentimentRNN(word_size=word_size, embed_size=embed_size, lstm_size=hidden_size, 
                     num_layer=n_layer, seq_length=sequence_length, batch_size=batch_size, 
                     learning_rate=learning_rate)


### Project 2 - implementing an RNN for character-level language modeling 

In [16]:
with open('pg2265.txt', 'r') as f:
    text = f.read()

In [17]:
len(text)

178707

In [18]:
text = text[15858:]

In [19]:
char_set = set(text)

In [20]:
char2int = {c:i for i,c in enumerate(char_set)}
int2char = {i:c for i,c in enumerate(char_set)}

assert len(char2int) == len(int2char)

In [21]:
text_ints = np.array([char2int[c] for c in text], dtype=np.int32)

In [22]:
text_ints.shape

(162849,)

In [34]:
def reshape_date(sequence, batch_size, num_steps):
    '''generate x, y  sequences from the given sequence
    according to batch_size and num_steps
    
    parameters:
    -----------
    sequence : 1-d array
        a sequence of integers (converted from char2int)
    batch_size : int
        size of batch in each iteration
    num_steps : int
        length of sequence in each batch
    '''
    total_batch_length = batch_size * num_steps
    n_batch = len(sequence) // total_batch_length
    
    # trim original sequence fit to batches
    x = sequence[0:n_batch*total_batch_length]
    y = sequence[1:n_batch*total_batch_length+1]
    assert len(x) == len(y)
    
    # reshape to [batch_size, n_batch*num_steps] matrix
    x = np.asarray(x).reshape(batch_size, n_batch*num_steps)
    y = np.asarray(y).reshape(batch_size, n_batch*num_steps)
    
    return x, y

def batch_generator(data_x, data_y, num_steps):
    batch_size, total_length = data_x.shape
    n_batch = total_length // num_steps
    for i in range(n_batch):
        a, b = i*num_steps, (i+1)*num_steps
        yield data_x[a:b], data_y[a:b]

In [30]:
x, y = reshape_date(text_ints, batch_size=64, num_steps=100)

In [31]:
x.shape, y.shape

((64, 2500), (64, 2500))

In [32]:
x[:5, :10]

array([[54, 12, 28, 63, 54,  4, 41, 56, 28, 51],
       [56, 28, 63, 58, 12, 28, 28, 42, 63, 31],
       [28, 42, 63, 18, 12, 28,  4, 28, 63, 20],
       [63, 18, 20, 31, 28, 31, 58, 63, 31, 57],
       [12, 20, 49, 28, 42, 13, 37, 49, 51, 63]])

In [33]:
y[:5, :10]

array([[12, 28, 63, 54,  4, 41, 56, 28, 51, 20],
       [28, 63, 58, 12, 28, 28, 42, 63, 31,  3],
       [42, 63, 18, 12, 28,  4, 28, 63, 20, 58],
       [18, 20, 31, 28, 31, 58, 63, 31, 57,  4],
       [20, 49, 28, 42, 13, 37, 49, 51, 63, 58]])

In [57]:
class CharRnn:
    
    def __init__(self, n_classes, lstm_size=128, n_layers=1, 
                 batch_size=64, n_steps=100, learning_rate=1e-3,
                keep_prob=.5, grad_clip=5, sampling=False, random_state=None):
        # model parameters
        self.n_classes = n_classes
        self.lstm_size = lstm_size
        self.batch_size = batch_size
        self.n_steps = n_steps
        self.n_layers = n_layers
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.grad_clip = grad_clip
        self.random_state = random_state
        
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(self.random_state)
            self.build(sampling)
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            
    
    def build(self, sampling):
        if sampling == True:
            # sampling mode
            batch_size, n_step = 1, 1
        else:
            # traing mode
            batch_size, n_step = self.batch_size, self.n_steps
        
        # init placeholders
        tf_x = tf.placeholder(dtype=tf.int32, shape=(batch_size, n_step), name='tf_x')
        tf_y = tf.placeholder(dtype=tf.int32, shape=(batch_size, n_step), name='tf_y')
        tf_keepprob = tf.placeholder(dtype=tf.float32, shape=(), name='tf_keepprob')
        
        # one-hot encoding
        x_onehot = tf.one_hot(tf_x, depth=self.n_classes)
        y_onehot = tf.one_hot(tf_y, depth=self.n_classes)
        print(" << y_onehot >> ", y_onehot)
        # build multi-layers RNN cells
        cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(num_units=self.lstm_size), 
            input_keep_prob=tf_keepprob) for i in range(self.n_layers)])
        
        # define initial state
        self.initial_state = cells.zero_state(batch_size=batch_size, dtype=tf.float32)
        
        # run sequence step through the RNN
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, 
                                                      inputs=x_onehot, 
                                                      initial_state=self.initial_state)
        print("  << lstm_outputs  >> ", lstm_outputs)
        # reshape output 3-d [batch_size, steps, lstm_size] -> 2-d [batch_size*steps, lstm_size]
        seq_outputs = tf.reshape(lstm_outputs, shape=[-1, self.lstm_size], name='seq_outputs_reshape')
        
        # dense layer -> out shape [seq_length, n_classes]
        logits = tf.layers.dense(seq_outputs, units=self.n_classes, 
                                 activation=None, name='logits')
        print(" << logits >> ", logits)
        proba = tf.nn.softmax(logits, axis=-1, name='probabilities')
        
        y_reshaped = tf.reshape(y_onehot, shape=(-1, self.n_classes), name='y_reshaped')
        
        # cost function
        cost = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                                                    labels=y_reshaped, 
                                                    logits=logits), 
                    name='cost')
        
        # gradient clipping to avoiding exploding gradients
        train_vars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(t_list=tf.gradients(cost, train_vars), 
                                          clip_norm=self.grad_clip)
        # optimizer 
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        train_op = optimizer.apply_gradients(grads_and_vars=zip(grads, train_vars), 
                                             name='train_op')
        
    def train(self, x_train, y_train, n_epochs, ckpt_dir='C:\\Users\\LENOVO\\OneDrive\\tmp\\model'):
        
        # check valid ckpt_dir
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)
        self.ckpt_dir = ckpt_dir
        self.best_loss_ = float('inf')
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            
            n_batch = x_train.shape[1]//self.n_steps
            total_iterations = 0
            for i in range(n_epochs):
                # init zero_state
                state = sess.run(self.initial_state)
                avg_loss = 0.0
                for j, (x_batch, y_batch) in enumerate(batch_generator(x_train, y_train, 
                                                                       num_steps=self.n_steps)):
                    feed = {'tf_x:0': x_batch,
                            'tf_y:0': y_batch,
                            'tf_keepprob:0': self.keep_prob, 
                             self.initial_state: state}
                    loss, _, state = sess.run(['cost:0', 'train_op', self.final_state], 
                                              feed_dict=feed)
                    
                    avg_loss += loss            
                    # display every 20 iterations
                    total_iterations += 1
                    if (total_iterations % 20 == 0):
                        print("Epoch {:2d}/{}, Iterations {:4d} | Train loss {:.5f}".format(i, n_epochs, 
                                                                                            total_iterations,
                                                                                            loss))
                # one pass finish
                avg_loss /= (j+1)
                # save model if better than best_loss
                if avg_loss < self.best_loss_:
                    self.saver.save(sess, os.path.join(self.ckpt_dir, 
                                                       'char_model-{}.ckpt'.format(total_iterations)))
    
                        
    def sample(self, output_length, ckpt_dir='C:\\Users\\LENOVO\\OneDrive\\tmp\\model',
               starter_seq='The '):
        pass

In [58]:
model = CharRnn(n_classes=len(char_set))

 << y_onehot >>  Tensor("one_hot_1:0", shape=(64, 100, 65), dtype=float32)
  << lstm_outputs  >>  Tensor("rnn/transpose_1:0", shape=(64, 100, 128), dtype=float32)
 << logits >>  Tensor("logits/BiasAdd:0", shape=(6400, 65), dtype=float32)


In [56]:
2 < float('inf')

True