## Sources and Credits
This RNN model is based on Mat Leonard's Sentiment Prediction course on Udacity and the course materials he provided. 

In [1]:
import os

import numpy as np
import tensorflow as tf

## Load Data

In [2]:
# Read both files completely into memory, each as a single string.
with open('rnntext.txt', 'r') as text_file:  # inputs
    texts = text_file.read()
with open('rnntype.txt', 'r') as label_file:  # labels
    types = label_file.read()

In [28]:
# Preview the first 1000 characters of the raw text read from rnntext.txt.
texts[:1000]

'Replace every "What if..." with "Why the fuck not\nThere are two ways to reach me: by way of kisses or by way of the imagination. But there is a hierarchy: the kisses alone don\'t work.\nScience is not separate from politics. As much as I would like it to be a pure thing, existing only in some intellectual realm unsullied by human struggle, it will always be entangled with the world we live in.\nThink those fake spider webs on your bushes are spooky? London woke up to an extra creepy layer of fog Monday morning, just in time for Halloween.\nIn difficult times, the only strategy which works is -\'Patience.\nThere is no secret ingredient\nThere is always something Right in something that is Wrong and something Wrong in something that is Right.\nDrone footage shows Rohingya refugees fleeing in a mass exodus from Myanmar into Bangladesh\nThere is no religion without love. Accept who you are and revel in it\nWar is an ugly thing, but not the ugliest of things: the decayed and degraded stat

In [29]:
from string import punctuation
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

# Optional word-form normalisation, applied after tokenizing. Both steps are
# off by default; flip a flag instead of editing the cell body.
# NOTE(review): the recorded output of the inspection cells reads like
# lemmatized text ("there be two way ...") — confirm which setting the saved
# run actually used.
USE_STEMMING = False
USE_LEMMATIZING = False

# `texts` is rebound below from one raw string to a list of quotes. Remember
# the raw string under its own name so that re-running this cell starts from
# the same input instead of from its own previous output.
if isinstance(texts, str):
    raw_texts = texts

# Lowercase and drop punctuation character by character; newlines survive
# because they separate the individual quotes.
all_text = ''.join(c.lower() for c in raw_texts if c not in punctuation)
texts = all_text.split('\n')

# Tokenizing: re-join the tokenizer's tokens with single spaces.
quotes = [' '.join(tknzr.tokenize(text)) for text in texts]
texts = quotes

# Optional: Stemming
if USE_STEMMING:
    from nltk.stem import SnowballStemmer
    snowball = SnowballStemmer(language='english')

    texts_stem = []
    for text in texts:
        quote_stem = [snowball.stem(token) for token in text.split(' ')]
        texts_stem.append(' '.join(quote_stem))
    texts = texts_stem

# Optional: Lemmatizing
if USE_LEMMATIZING:
    from nltk.stem import WordNetLemmatizer
    from nltk import pos_tag, word_tokenize
    wnl = WordNetLemmatizer()

    texts_lemma = []
    for text in texts:
        quote_lemma = []
        for word, tag in pos_tag(word_tokenize(text)):
            # Reduce the POS tag to its first letter ('j' is rewritten to 'a');
            # only 'a', 'r', 'n' and 'v' are handed to the lemmatizer, every
            # other word is kept unchanged.
            wntag = tag[0].lower()
            if wntag == 'j':
                wntag = 'a'
            if wntag in ('a', 'r', 'n', 'v'):
                lemma = wnl.lemmatize(word, wntag)
            else:
                lemma = word
            quote_lemma.append(lemma)
        texts_lemma.append(' '.join(quote_lemma))
    texts = texts_lemma

# Flat views of the cleaned corpus: one long string and its list of words.
all_text = ' '.join(texts)
words = all_text.split()

## Inspect Data

In [30]:
# Preview the first 150 characters of the cleaned, space-joined corpus.
all_text[:150]

'replace every what if with why the fuck not there be two way to reach me by way of kiss or by way of the imagination but there be a hierarchy the kiss'

In [31]:
# First 20 whitespace-separated words of the cleaned corpus.
words[:20]

['replace',
 'every',
 'what',
 'if',
 'with',
 'why',
 'the',
 'fuck',
 'not',
 'there',
 'be',
 'two',
 'way',
 'to',
 'reach',
 'me',
 'by',
 'way',
 'of',
 'kiss']

## Encode the words

In [32]:
from collections import Counter
# Rank the words by frequency (most frequent first) and number them from 1.
counts = Counter(words)
vocab = [word for word, _ in counts.most_common()]
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Replace every word of every quote by its integer id.
texts_ints = [[vocab_to_int[word] for word in text.split()] for text in texts]

## Encode the labels

In [33]:
# One label per line of the label file: 'mundane' becomes 1, anything else 0.
strtypes = str(types)
labels = [int(label == 'mundane') for label in strtypes.split('\n')]
types = np.array(labels)

In [35]:
# Count how many encoded texts there are of each length; the keys are the lengths.
text_lens = Counter(len(x) for x in texts_ints)
print("Zero-length texts: {}".format(text_lens[0]))
print("Maximum text length: {}".format(max(text_lens.keys())))

Zero-length texts: 0
Maximum text length: 628


In [36]:
# Positions of every encoded text that contains at least one word id.
non_zero_idx = [ii for ii, text in enumerate(texts_ints) if text]
len(non_zero_idx)  # check if there is any empty input

18500

In [38]:
# Keep only the non-empty texts together with the labels at the same positions.
kept_texts, kept_types = [], []
for ii in non_zero_idx:
    kept_texts.append(texts_ints[ii])
    kept_types.append(types[ii])
texts_ints = kept_texts
types = np.array(kept_types)

In [39]:
seq_len = 200
# Left padding with 0's: a text with fewer than seq_len words sits at the right
# edge of its row, a longer one is cut down to its first seq_len word ids.
features = np.zeros((len(texts_ints), seq_len), dtype=int)
for i, row in enumerate(texts_ints):
    clipped = np.asarray(row[:seq_len], dtype=int)
    # An explicit start column also works for an empty text, where a
    # `-len(row):` slice would be `-0:` and select the whole row.
    features[i, seq_len - len(clipped):] = clipped

In [40]:
# Last 50 columns of the encoded row at index 92.
features[92,150:200]

array([   1, 2138,    3,   50,  258,    9,  286,    1, 1216,   99,  124,
          2, 1880, 2275,    4,   26,  135, 1053,    4,  383,    9,  286,
          9,   78,    1,  130,    3,  177,  330,    5,   46, 6622,    5,
          1,  421,    3,  710, 2468,   29,  396,   20,    6, 4429, 1412,
          4,    2, 1321,   20,   77, 1840])

## Training, Validation, Test



In [41]:
split_frac = 0.8
split_idx = int(len(features) * split_frac)

# The leading rows are held out and the remaining split_idx rows are used for
# training. For the 18500 texts of this notebook the hold-out size is 3700.
holdout_size = len(features) - split_idx
train_x, val_x = features[holdout_size:], features[:holdout_size]
train_y, val_y = types[holdout_size:], types[:holdout_size]

# Split the held-out rows in half: first half validation, second half test.
test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(14800, 200) 
Validation set: 	(1850, 200) 
Test set: 		(1850, 200)


## Build the graph

In [42]:
#defining the hyperparameters.
# Number of units handed to the LSTM cell.
lstm_size = 256
# How many cells are stacked in the MultiRNNCell.
lstm_layers = 1
# Rows per batch; also fixes the batch dimension of the zero initial state.
batch_size = 500
# Step size handed to the Adam optimizer.
learning_rate = 0.01

In [43]:
n_words = len(vocab_to_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    # Integer inputs with both dimensions left unspecified. The training loop
    # feeds batches of `features` rows as texts_ and the matching labels,
    # reshaped to a column with y[:, None], as types_.
    texts_ = tf.placeholder(tf.int32, [None, None], name='texts')
    types_ = tf.placeholder(tf.int32, [None, None], name='types')
    # Dropout keep probability: fed as 0.5 while training, 1 for validation and testing.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

## Embedding

In [44]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with graph.as_default():
    # Trainable lookup table with one embed_size-wide row per word id
    # (n_words rows, so the padding id 0 has a row too), drawn uniformly
    # between -1 and 1.
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Replace every word id in texts_ by its row of the table.
    embed = tf.nn.embedding_lookup(embedding, texts_)

## LSTM cells

In [45]:
with graph.as_default():
    # Build a separate LSTM cell (with its own dropout wrapper) for every
    # layer. Repeating one cell object, as in `[drop] * lstm_layers`, hands the
    # very same instance to each layer and only works while lstm_layers == 1.
    stacked_cells = []
    for _ in range(lstm_layers):
        # Your basic LSTM cell
        lstm = tf.nn.rnn_cell.LSTMCell(lstm_size)

        # Add dropout to the cell
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        stacked_cells.append(drop)

    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell(stacked_cells)

    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

## Forward Pass

In [46]:
with graph.as_default():
    # Run the stacked cell over the embedded sequences, starting from
    # initial_state. The training loop fetches final_state after each batch and
    # feeds it back in as the initial state of the next batch.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

## Output

In [47]:
with graph.as_default():
    # A single sigmoid unit on the output of the last time step.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    # Mean squared error between the fed labels and the sigmoid outputs.
    cost = tf.losses.mean_squared_error(types_, predictions)
    
    # Adam update step that minimises the cost.
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

## Validation Accuracy

In [48]:
with graph.as_default():
    # A prediction counts as correct when its rounded value equals the fed label.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), types_)
    # Fraction of correct predictions in the fed batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

## Batching

In [49]:
def get_batches(x, y, batch_size=100):
    """Yield consecutive (x, y) slices of exactly ``batch_size`` items.

    Only full batches are produced: the trailing items that do not fill a
    complete batch are dropped, and nothing is yielded when ``x`` holds fewer
    than ``batch_size`` items.
    """
    full = len(x) // batch_size * batch_size
    x, y = x[:full], y[:full]
    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

## Training

In [50]:
### Make sure the checkpoints directory exists
# Created up front so that the save after the last epoch cannot fail on a
# missing directory once all the training work is done.
os.makedirs("checkpoints", exist_ok=True)

epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Every epoch starts from the all-zero LSTM state.
        state = sess.run(initial_state)

        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {texts_: x,
                    types_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # The final LSTM state of this batch is fed in as the initial
            # state of the next batch.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            # Every 5 iterations: report the training loss, then the accuracy
            # over the full validation batches (dropout disabled).
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

                val_acc = []
                # Evaluate the existing zero-state tensor; building a fresh
                # cell.zero_state(...) here would add new ops to the graph on
                # every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    feed = {texts_: val_batch_x,
                            types_: val_batch_y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.202
Val acc: 0.737
Epoch: 0/10 Iteration: 10 Train loss: 0.135
Val acc: 0.791
Epoch: 0/10 Iteration: 15 Train loss: 0.105
Val acc: 0.845
Epoch: 0/10 Iteration: 20 Train loss: 0.124
Val acc: 0.826
Epoch: 0/10 Iteration: 25 Train loss: 0.124
Val acc: 0.865
Epoch: 1/10 Iteration: 30 Train loss: 0.086
Val acc: 0.868
Epoch: 1/10 Iteration: 35 Train loss: 0.060
Val acc: 0.872
Epoch: 1/10 Iteration: 40 Train loss: 0.069
Val acc: 0.881
Epoch: 1/10 Iteration: 45 Train loss: 0.068
Val acc: 0.882
Epoch: 1/10 Iteration: 50 Train loss: 0.044
Val acc: 0.885
Epoch: 1/10 Iteration: 55 Train loss: 0.043
Val acc: 0.887
Epoch: 2/10 Iteration: 60 Train loss: 0.040
Val acc: 0.893
Epoch: 2/10 Iteration: 65 Train loss: 0.038
Val acc: 0.889
Epoch: 2/10 Iteration: 70 Train loss: 0.040
Val acc: 0.881
Epoch: 2/10 Iteration: 75 Train loss: 0.039
Val acc: 0.889
Epoch: 2/10 Iteration: 80 Train loss: 0.027
Val acc: 0.892
Epoch: 2/10 Iteration: 85 Train loss: 0.040
Val acc: 0.88

## Testing

In [51]:
# Accuracy of the saved model on the full test batches, with dropout disabled.
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Evaluate the zero-state tensor that is already part of the graph instead
    # of creating another zero_state op after training.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {texts_: x,
                types_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        # As in training, the final state of one batch seeds the next.
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.892
