# SENTIMENT ANALYSIS USING ANY NETWORK AND ANY DATASET:

DATASET - IMDB REVIEW DATASET

Importing basic libraries

Defining functions to preprocess the data: one strips punctuation from a review and splits it into words, and the other removes words that occur 5 or fewer times across the whole dataset.

In [2]:
import numpy as np
import tensorflow as tf

# Preprocessing dataset

Importing the dataset and preprocessing it. During preprocessing, we extract the words from each review and remove punctuation.

In [3]:
import re
from collections import Counter
from nltk.corpus import stopwords

def preprocess(text):
    """Lower-case a review, strip punctuation/markup and split it into words.

    Each listed punctuation mark (and the HTML ``<br />`` tag) is first
    swapped for a space-padded placeholder token; the placeholder is then
    collapsed to a single space, so the marks act purely as word separators.

    Args:
        text: Raw review text.

    Returns:
        List of whitespace-separated, lower-cased words.
    """
    placeholder = ' <PERIOD> '
    # Order matters: single characters are handled before the multi-character
    # '<br />' tag, exactly as in the sequential replacement chain.
    separators = ('.', ',', '"', ';', '!', '?', '(', ')', '--',
                  '<br />', '\\', ':')

    cleaned = text.lower()
    for separator in separators:
        cleaned = cleaned.replace(separator, placeholder)

    # The placeholder is upper-case, so it cannot collide with the
    # lower-cased review text itself.
    return cleaned.replace(placeholder, ' ').split()

def removing_noise(words, min_count=5):
    """Drop rare words from a word list.

    Args:
        words: Iterable of word tokens (re-iterable, e.g. a list).
        min_count: A word is kept only if it occurs strictly more than
            ``min_count`` times in ``words``. Defaults to 5, i.e. words with
            5 or fewer occurrences are removed.

    Returns:
        New list containing the surviving words in their original order
        (duplicates preserved).
    """
    word_count = Counter(words)
    return [word for word in words if word_count[word] > min_count]
    
    

In [4]:
import csv

filename = 'data/labeledTrainData.tsv'
review_ids = []
reviews = []
labels = []

# Import the dataset into parallel lists.
# newline='' is what the csv module requires of file objects so that line
# endings embedded inside quoted fields are parsed correctly.
with open(filename, 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader, None)  # skip the header row; tolerate an empty file

    for row in reader:
        # Columns are read positionally: id, label, review text.
        review_ids.append(row[0])
        # Each label is wrapped in a one-element list so a slice of `labels`
        # can be fed directly to the [None, 1] target placeholder.
        labels.append([int(row[1])])
        reviews.append(row[2])

In [5]:
# Sanity check: number of reviews loaded from the TSV file.
len(reviews)

25000

In [5]:
# Peek at the first raw review (still contains punctuation and <br /> tags).
reviews[0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [6]:
# First ten labels; each is a one-element list holding the integer label.
labels[:10]

[[1], [1], [0], [0], [1], [1], [0], [0], [0], [1]]

In [6]:
# Tokenise every review; reviews_pp keeps one word list per review.
reviews_pp = [preprocess(raw_review) for raw_review in reviews]

# Flatten all reviews into a single corpus-wide word list, then drop the
# infrequent words so that only "vocabulary-worthy" tokens remain.
words = removing_noise(
    [token for tokens in reviews_pp for token in tokens]
)

## Encoding the words to generate a map for word to integer mapping

Generating a map that converts each word to an integer. The integer assigned to a word is determined by how often it occurs: the more frequent the word, the smaller its integer (the most frequent word is assigned 0).
After that, each review is converted into a list of these integers so it can be fed to the network; words that were removed as noise are skipped.

In [7]:
# Build the vocabulary and the word -> integer lookup.
words_count = Counter(words)
vocab = set(words)
vocab_size = len(vocab)

# Most frequent word first, so frequent words receive the smallest ids.
sorted_vocab = sorted(words_count, key = words_count.get, reverse = True)
word_to_int = dict(zip(sorted_vocab, range(len(sorted_vocab))))

# Encode every review as a list of word ids; words that did not make it into
# the vocabulary (removed as noise) are dropped from the review.
reviews_pp_ints = [
    [word_to_int[token] for token in tokens if token in vocab]
    for tokens in reviews_pp
]

In [8]:
# Inspect the integer encoding of one review (index 20).
reviews_pp_ints[20]

[6278,
 1437,
 5,
 26,
 3,
 0,
 113,
 1,
 87,
 1160,
 1114,
 1301,
 99,
 3,
 0,
 1709,
 1,
 124,
 50,
 16199,
 51,
 29,
 1857,
 1494,
 254,
 2988,
 1,
 2,
 111,
 1918,
 0,
 11273,
 1208,
 1,
 0,
 10489,
 838,
 773,
 13,
 10,
 56,
 124,
 235,
 50,
 4227,
 1,
 5460,
 9,
 576,
 9376,
 0,
 2089,
 3,
 2368,
 6607,
 1526,
 82,
 3,
 89,
 555,
 172,
 1,
 44,
 1540,
 825,
 10,
 8,
 1229,
 1097,
 349,
 0,
 157,
 762,
 3,
 16924,
 5,
 2,
 2201,
 266,
 4,
 25,
 1202,
 14,
 16925,
 30,
 14379,
 8842,
 1,
 0,
 931,
 3,
 168,
 5584,
 1659,
 2171,
 0,
 59,
 789,
 3,
 1659,
 1389,
 5,
 18674,
 1,
 7709,
 29,
 0,
 6278,
 1138,
 5764,
 3557,
 6,
 2,
 83,
 3093,
 291,
 2,
 1036,
 5641,
 0,
 581,
 3,
 26,
 3,
 87,
 19740,
 7104,
 1,
 24,
 5765,
 717,
 18675,
 1,
 462,
 3841,
 0,
 225,
 5,
 2,
 111,
 28,
 204,
 1,
 0,
 1326,
 148,
 60,
 209,
 12,
 2,
 199,
 846,
 124,
 0,
 818,
 5,
 50,
 3028,
 1,
 7105,
 0,
 5,
 357,
 16926,
 1,
 688,
 26,
 3,
 0,
 87,
 6279,
 373,
 6,
 1709,
 434,
 2421,
 1583,
 4621,
 5,

In [9]:
# Shortest and longest encoded review lengths.
# Using min()/max() avoids a hard-coded starting sentinel, which would report
# a wrong minimum if every review were longer than the sentinel value.
review_lengths = [len(review) for review in reviews_pp_ints]
min_l = min(review_lengths, default=0)
max_l = max(review_lengths, default=0)

min_l, max_l

(10, 2354)

## Function to make the input layer for the review passed as the argument

This function builds a bag-of-words vector whose length equals the vocabulary size. Index i corresponds to the word that was encoded as integer i, and the value at that index is the number of times that word occurs in the review.

In [20]:
def generate_input_layer(review_int):
    """Turn an integer-encoded review into a word-count vector.

    Args:
        review_int: Sequence of word ids, used as indices into the vector.

    Returns:
        1-D numpy array of length ``vocab_size`` (module-level) where entry
        ``i`` holds how many times id ``i`` appears in ``review_int``.
    """
    counts = np.zeros(vocab_size)
    # Tally each distinct id once, then write its total into the vector.
    for word_id, occurrences in Counter(review_int).items():
        counts[word_id] += occurrences
    return counts

## Splitting dataset into training, validation and testing dataset

The dataset is split into three parts: 80% of the reviews are used for training, and the remaining 20% is divided equally between the validation set and the test set.

In [21]:
# Fraction of the data used for training; the rest is held out.
split_frac = 0.8
split_index = int(len(reviews_pp_ints)*split_frac)

# Training portion: everything before the split point.
train_x, train_y = reviews_pp_ints[:split_index], labels[:split_index]

# Held-out portion, cut in half: first half validation, second half test.
holdout_x, holdout_y = reviews_pp_ints[split_index:], labels[split_index:]
test_index = int(len(holdout_x)*0.5)

val_x, val_y = holdout_x[:test_index], holdout_y[:test_index]
test_x, test_y = holdout_x[test_index:], holdout_y[test_index:]



In [18]:
# Vectorise only the first training review for inspection. Building a dense
# vocab-sized float vector for every training review at once would need
# several gigabytes of memory; the training loop vectorises per batch instead.
inp = [generate_input_layer(train_x[0])]
inp[0]

array([ 19.,  10.,  10., ...,   0.,   0.,   0.])

In [19]:
# The input vector has one entry per vocabulary word (length == vocab_size).
inp[0].shape

(28260,)

In [32]:
#Hyperparameters
# The input is a bag-of-words vector, so its width equals the vocabulary size.
input_dim = vocab_size
hidden_dim_1 = 300
# hidden_dim_2 is only referenced by the commented-out third layer below.
hidden_dim_2 = 200
# Single output unit: the sigmoid activation is thresholded at 0.5 at test time.
output_dim = 1 


#Network Architecture
# X: batch of word-count vectors; Y: batch of labels with shape [batch, 1].
X = tf.placeholder(tf.float32, [None, input_dim])
Y = tf.placeholder(tf.float32, [None, 1])

# Graph-level seed, set before the random initialisers below are created.
# NOTE(review): op creation order may affect the derived per-op seeds, so
# avoid reordering the variable definitions if reproducibility matters.
tf.set_random_seed(5)
# Input -> hidden layer parameters.
weights_0_1 = tf.Variable( tf.truncated_normal([input_dim, hidden_dim_1], stddev = 0.1) )
bias_0_1 = tf.Variable( tf.zeros(hidden_dim_1))
# Hidden -> output layer parameters.
weights_1_2 = tf.Variable( tf.truncated_normal([hidden_dim_1, output_dim], stddev = 0.1) ) 
bias_1_2 = tf.Variable(tf.zeros(output_dim))
# Disabled experiment: parameters for an additional (third) layer.
#weights_2_3 = tf.Variable( tf.truncated_normal([hidden_dim_2, output_dim], stddev = 0.1) )
#bias_2_3 = tf.Variable(tf.zeros(output_dim))

# Forward pass: ReLU hidden layer followed by a sigmoid output.
layer_1 = tf.nn.relu(tf.add(tf.matmul(X, weights_0_1), bias_0_1))
layer_2 = tf.sigmoid( tf.add(tf.matmul(layer_1, weights_1_2), bias_1_2 ))
#layer_3 = tf.sigmoid(tf.add(tf.matmul(layer_2, weights_2_3), bias_2_3))

# Mean squared error between the labels and the sigmoid output, minimised
# with Adam (0.01 is passed as the optimizer's first argument, the learning rate).
loss = tf.reduce_mean(tf.square(Y - layer_2))
minimise = tf.train.AdamOptimizer(0.01).minimize(loss)


In [33]:
#Training
sess = tf.Session()
sess.run(tf.global_variables_initializer())

n_epoch = 10
batch_size = 500
# Ceiling integer division: a trailing partial batch is trained on as well,
# instead of being silently dropped when len(train_x) is not a multiple of
# batch_size.
n_batch = (len(train_x) + batch_size - 1) // batch_size

for epoch in range(n_epoch):
    for batch in range(n_batch):
        start = batch * batch_size
        # `inp` / `out` stay module-level names, as in the original loop,
        # because later cells may read them.
        inp, out = train_x[start:start+batch_size], train_y[start:start+batch_size]
        # Vectorise lazily, one batch at a time, to keep memory bounded.
        inp = [generate_input_layer(i) for i in inp]

        _, loss_ = sess.run([minimise, loss], feed_dict={X: inp, Y: out})

        # Report the loss of every 20th batch.
        if batch % 20 == 0:
            print ('Epoch={}  Loss={} '.format(epoch, loss_))


Epoch=0  Loss=0.29273682832717896 
Epoch=0  Loss=0.10419298708438873 
Epoch=1  Loss=0.01050470769405365 
Epoch=1  Loss=0.02367044985294342 
Epoch=2  Loss=0.00276096910238266 
Epoch=2  Loss=0.006804957985877991 
Epoch=3  Loss=0.0011871934402734041 
Epoch=3  Loss=0.0041468702256679535 
Epoch=4  Loss=4.2644016502890736e-05 
Epoch=4  Loss=0.0021637319587171078 
Epoch=5  Loss=1.8886674297391437e-05 
Epoch=5  Loss=0.0020790284033864737 
Epoch=6  Loss=1.762185820552986e-05 
Epoch=6  Loss=0.002045560162514448 
Epoch=7  Loss=1.2842588148487266e-05 
Epoch=7  Loss=0.002033216878771782 
Epoch=8  Loss=9.094972483580932e-06 
Epoch=8  Loss=0.0020375631283968687 
Epoch=9  Loss=8.683432497491594e-06 
Epoch=9  Loss=0.0020246501080691814 


In [34]:
#Testing
def prediction(inp):
    """Classify a single integer-encoded review.

    Args:
        inp: One review as a sequence of word ids.

    Returns:
        1 if the network's sigmoid output is >= 0.5, otherwise 0.
    """
    features = [generate_input_layer(inp)]
    # Only X is fed: layer_2 does not depend on the label placeholder, so the
    # prediction no longer relies on a leftover `out` batch from training.
    # The fetch has shape (1, 1); [0, 0] extracts the scalar activation.
    sigmoid_output = sess.run(layer_2, feed_dict={X: features})[0, 0]
    return int(sigmoid_output >= 0.5)

# 1 for every correctly classified test review, 0 otherwise.
accuracy_batch = [
    int(prediction(review) == label[0])
    for review, label in zip(test_x, test_y)
]

print ('accuracy: {}'.format(sum(accuracy_batch)*100.0/len(accuracy_batch)  ))

accuracy: 89.12
