# Sentiment Analysis by RRN with LSTM Units

This notebook is based on [LSTM-Sentiment-Analysis](https://github.com/adeshpande3/LSTM-Sentiment-Analysis).

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import glob
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding

from random import randint
import re
import datetime

%matplotlib inline

Using TensorFlow backend.


In [2]:
print(sys.version)

3.6.5 (default, Jul 18 2018, 11:31:17) 
[GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.2)]


# Loading Data 

Loading words list and word vectors

- We downloaded wordsList_glove_dim200.npy and wordVectors_glove_dim200.npy from [GloVe](http://nlp.stanford.edu/projects/glove/).

- We made wordsList_TR.npy and wordVectors_TR.npy from Thomson Reuters News Archive from 2003 to 2016 (8856M words).

In [3]:
wordsList = np.load('wordsList_glove_dim200.npy').tolist()
#wordsList = np.load('wordsList_TR.npy').tolist()
print('Length of word list: ',len(wordsList))

wordVectors = np.load('wordVectors_glove_dim200.npy')
#wordVectors = np.load('wordVectors_TR.npy')
print(wordVectors.dtype)

print('Shape of word vectors: ',wordVectors.shape)
print('Dimensions for each word vector: ',wordVectors.shape[1])
numDimensions = wordVectors.shape[1] #Dimensions for each word vector
print('numDimensions = ',numDimensions)

Length of word list:  400000
float32
Shape of word vectors:  (400000, 200)
Dimensions for each word vector:  200
numDimensions =  200


Loading positive documents and negative documents

- Users must prepare positive documents ond negative documents by yourself.

- In this git, positiveDocs.zip and negativeDocs.zip are movie reviews .

In [4]:
positiveFiles = glob.glob('positiveDocs/*')
negativeFiles = glob.glob('negativeDocs/*')
def read_files(file_paths):
    texts=[]
    for path in tqdm(file_paths):
        with open(path, "r", encoding='utf-8') as f:
            texts.append(f.readline())
    return texts

positive_texts = read_files(positiveFiles)
negative_texts = read_files(negativeFiles)

100%|██████████| 12500/12500 [00:21<00:00, 580.51it/s] 
100%|██████████| 12500/12500 [00:32<00:00, 389.01it/s] 


In [5]:
df_positive = pd.DataFrame(positive_texts, columns=['text'])
df_positive['sentiment'] = 1
df_negative = pd.DataFrame(negative_texts, columns=['text'])
df_negative['sentiment'] = -1
df = pd.concat([df_positive, df_negative]).reset_index(drop=True)

In [6]:
# Removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [7]:
df['cleanedText'] = df['text'].apply(cleanSentences)

In [8]:
df.tail(10)

Unnamed: 0,text,sentiment,cleanedText
24990,Stocks on the move [HOT-RTRS] Real-time Equity...,-1,stocks on the move hotrtrs realtime equity new...
24991,"(Adds Apple, Forest, updates prices) NEW YORK,...",-1,adds apple forest updates prices new york nov ...
24992,"(Adds byline, analyst quote, details) By Bill ...",-1,adds byline analyst quote details by bill rigb...
24993,"SAN FRANCISCO, July 13 (Reuters) - Apple Compu...",-1,san francisco july 13 reuters apple computer ...
24994,Stocks on the move [HOT-RTRS] Real-time Equity...,-1,stocks on the move hotrtrs realtime equity new...
24995,* Q2 EPS $0.25 vs est $0.27 * Q2 was $114 mln ...,-1,q2 eps 025 vs est 027 q2 was 114 mln vs est ...
24996,"By Eric Onstad AMSTERDAM, May 5 (Reuters) - A ...",-1,by eric onstad amsterdam may 5 reuters a new ...
24997,"NEW YORK, July 31 (Reuters) - Wachovia Corp <W...",-1,new york july 31 reuters wachovia corp wbn sa...
24998,"FARNBOROUGH, England, July 14 (Reuters) - An o...",-1,farnborough england july 14 reuters an order ...
24999,"By Bernhard Warner, European Internet Correspo...",-1,by bernhard warner european internet correspon...


# Text to id

In [9]:
vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'), 
                             lowercase=True, min_df=3, max_df=0.98)
text_onehot = vectorizer.fit_transform(df['cleanedText'].values)
print(text_onehot.shape)

(25000, 74091)


In [10]:
word2idx = {word: idx for idx, word in enumerate(vectorizer.get_feature_names())}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()
 
def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes
 
print(to_sequence(tokenize, preprocess, word2idx, "This is an important test!"))  # [2269, 4453]
df['text_toid'] = [to_sequence(tokenize, preprocess, word2idx, x) for x in df['cleanedText'].values]
df.head()

[44786, 68462]


Unnamed: 0,text,sentiment,cleanedText,text_toid
0,Stocks on the move [HOT-RTRS] Real-time Equity...,1,stocks on the move hotrtrs realtime equity new...,"[66507, 52349, 39752, 59985, 33644, 53468, 609..."
1,Real-time equity news [U E] U.S. stock market ...,1,realtime equity news u e us stock market repor...,"[59985, 33644, 53468, 71063, 66481, 50213, 608..."
2,"March 16 (Reuters) - Nearly two years ago, act...",1,march 16 reuters nearly two years ago activis...,"[50125, 5806, 61225, 53202, 70141, 73725, 1944..."
3,06 Nov Q3 AutoNation <AN> 0.29 11 0.39 06 Nov ...,1,06 nov q3 autonation an 029 11 039 06 nov q3 b...,"[1078, 54399, 59167, 21875, 450, 2926, 617, 10..."
4,"(Recasts first paragraph, adds details SAN FRA...",1,recasts first paragraph adds details san franc...,"[60075, 35513, 56257, 18985, 30768, 62803, 362..."


In [11]:
df['length'] = df['text_toid'].apply(lambda x: len(x))

In [12]:
df['length'].describe()

count    25000.000000
mean       325.504040
std        208.537934
min         32.000000
25%        159.000000
50%        304.000000
75%        433.000000
max       1215.000000
Name: length, dtype: float64

In [13]:
maxSeqLength = 550
N_FEATURES = len(vectorizer.get_feature_names())
texts_padding = pad_sequences(df['text_toid'].values, maxlen=maxSeqLength, value=N_FEATURES)
#print(texts_padding[0])
ids = texts_padding
print(ids.shape)

(25000, 550)


In [14]:
def getTrainBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0): 
            num = randint(1,11499)
            labels.append([1,0])
        else:
            num = randint(13499,24999)
            labels.append([0,1])
        arr[i] = ids[num-1:num]
    return arr, labels

def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(11499,13499)
        if (num <= 12499):
            labels.append([1,0])
        else:
            labels.append([0,1])
        arr[i] = ids[num-1:num]
    return arr, labels

# RNN with LSTM Units Model

In [15]:
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 100050

In [16]:
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [17]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.


# Training

In [18]:
sess = tf.Session()
#with tf.Session() as sess:
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

Users enter 'tensorboard --logdir=tensorboard' on your terminal and visit http://localhost:6006/ with a browser to keep an eye on your training progress.

In [None]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
   #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
   #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)

   #Save the network every 1,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)
writer.close()

saved to models_kai/pretrained_lstm.ckpt-10000
saved to models_kai/pretrained_lstm.ckpt-20000
saved to models_kai/pretrained_lstm.ckpt-30000
