# SENTIMENT ANALYSIS USING ANY NETWORK AND ANY DATASET:

DATASET - IMDB REVIEW DATASET

Importing basic libraries

Defining function to preprocess data to remove all the punctuations and remove words with 5 or fewer occurences from each review

In [1]:
import numpy as np
import tensorflow as tf

# Preprocessing dataset

Importing dataset and preprocessing it. In preprocesing, we extract the words from each review and remove punctuations.

In [2]:
import re
from collections import Counter
from nltk.corpus import stopwords

def preprocess(text):
    
    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <PERIOD> ')
    text = text.replace('"', ' <PERIOD> ')
    text = text.replace(';', ' <PERIOD> ')
    text = text.replace('!', ' <PERIOD> ')
    text = text.replace('?', ' <PERIOD> ')
    text = text.replace('(', ' <PERIOD> ')
    text = text.replace(')', ' <PERIOD> ')
    text = text.replace('--', ' <PERIOD> ')
    text = text.replace('?', ' <PERIOD> ')
    text = text.replace('<br />', ' <PERIOD> ')
    text = text.replace('\\', ' <PERIOD> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <PERIOD> ')
    text = text.replace(' <PERIOD> ', ' ')
    words = text.split()
    
    return words

def removing_noise(words):
    word_count = Counter(words)
    stops = set(stopwords.words("english"))
    words_new = [word for word in words if (word_count[word]>5) #and (not word in stops)
                ]
    return words_new
    
    

In [37]:
import csv

filename = 'data/labeledTrainData.tsv'
review_ids = []
reviews = []
labels = []
#importing dataset into lists
with open(filename, 'r') as f:
    next(f)
    reader = csv.reader(f, delimiter='\t')
    
    for row in reader:
        review_ids.append(row[0])
        labels.append([int(row[1])] )
        reviews.append(row[2])

In [38]:
len(reviews)

25000

In [39]:
reviews[0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [40]:
labels[:10]

[[1], [1], [0], [0], [1], [1], [0], [0], [0], [1]]

In [41]:
reviews_pp = []
words = []

for review in reviews:
    review_pp = preprocess(review)
    reviews_pp.append(review_pp)
    words.extend(review_pp)
    
words = removing_noise(words)

## Encoding the words to generate a map for word to integer mapping

Generating a map to convert each word to an integer. The integer assigned to a word is decided by the no. of times it occurs. Higher the occurences, lower is the integer assigned.
After that, each review is converted in the form of integers to feed them well in the network.

In [42]:
#Converting word to integers and making the vocabulary
vocab = set(words)
vocab_size = len(vocab)
words_count = Counter(words)
sorted_vocab = sorted(words_count, key = words_count.get, reverse = True)
word_to_int = {word:i for i,word in enumerate(sorted_vocab)}

#Converting each review in the form of integers
reviews_pp_ints = []
for review in reviews_pp:
    this_review_int = []
    for word in review:
        if word in vocab:
            this_review_int.append(word_to_int[word])
    reviews_pp_ints.append(this_review_int)

In [43]:
max_l = 0
min_l = 10000
for review in reviews_pp_ints:
    if len(review) > max_l:
        max_l = len(review)
    if len(review) < min_l:
        min_l = len(review)

min_l, max_l

(10, 2354)

## Function to make the input layer for the review passed as the argument

This function generates a vector of length same as the vocabulary. Each index corresponds to the equivalent word. The value at that index corresponds to the frequency of that word in the review.

In [44]:
def generate_input_layer(review_int):
    input_layer = np.zeros(( vocab_size))
    for word in review_int:
        input_layer[word] += 1
        
    return input_layer

## Splitting dataset into training, validation and testing dataset

The dataset is split into the three categories. The split fraction of training is 0.8 and validation, testing set equally from the remaining dataset.

In [45]:
split_frac = 0.8
split_index = int(len(reviews_pp_ints)*0.8)

train_x, val_x = reviews_pp_ints[:split_index], reviews_pp_ints[split_index:]
train_y, val_y = labels[:split_index], labels[split_index:]

test_index = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_index], val_x[test_index:]
val_y, test_y = val_y[:test_index], val_y[test_index:]



In [46]:
len(train_x)

20000

In [None]:
#Hyperparameters
input_dim = vocab_size
hidden_dim = 100
output_dim = 1 

#Network Architecture
X = tf.placeholder(tf.float32, [None, input_dim])
Y = tf.placeholder(tf.float32, [None, 1])

weights_0_1 = tf.Variable( tf.zeros([input_dim, hidden_dim]) )
weights_1_2 = tf.Variable( tf.zeros([hidden_dim, output_dim]) ) 

layer_1 = tf.matmul(X, weights_0_1)
layer_2 = tf.sigmoid( tf.matmul(layer_1, weights_1_2) )

loss = Y-layer_2
minimise = tf.train.GradientDescentOptimizer(0.01).minimize(loss)



In [None]:
#Training 
sess = tf.Session()
sess.run(tf.global_variables_initializer())

n_epoch = 10
batch_size = 100
n_batch = len(train_x)/batch_size
for epoch in range(n_epoch):
    start=0
    for batch in range(n_batch):
        inp,out = train_x[start:start+batch_size], train_y[start:start+batch_size] 
        inp = [generate_input_layer(i) for i in inp] 
        
        _, loss_ = sess.run([minimise, loss],feed_dict={X:inp , Y: out})
        start += batch_size 
    print 'Epoch={}  Loss={} '.format(epoch, loss_)


Epoch=0  Loss=[[-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [-0.5]] 
Epoch=1  Loss=[[-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [-0.5]
 [ 0.5]
 [ 0.5]
 [

In [36]:
train_y[start:start+batch_size]

AttributeError: 'list' object has no attribute 'reshape'