# Sentiment Analysis on Movie reviews dataset


In [1]:
import numpy as np
import pandas as pd
import sys
import time
from collections import Counter

In [2]:
def read_data(data):
    data=list(map(lambda x:x[:-1],data.readlines()))
    return data
def upper_data(data):
    data=list(map(lambda x:x.upper(),iter(data)))
    return data

In [25]:
data=open('reviews.txt','r')
reviews=read_data(data)

data=open('labels.txt','r')
labels=read_data(data)
labels=upper_data(labels)

In [4]:
words=Counter()
for i in range(len(reviews)):
    for word in reviews[i].split(' '):
        words[word]+=1

In [5]:
vocab=set(words.keys())
vocab_size=len(vocab)

In [6]:
word2index={}
for i,word in enumerate(vocab):
    word2index[word]=i

In [7]:
def map_input(review):
    layer=np.zeros((1,vocab_size))
    for word in review.split(' '):
        layer[0][word2index[word]]=1
    return layer

In [8]:
def get_target(label):
    if(label=="POSITIVE"):
        return 1
    elif(label=="NEGATIVE"):
        return 0

In [9]:
def sigmoid(x):
    return (1/(1+np.exp(-x)))

In [43]:
np.random.seed(40)
def train(reviews,labels,learning_rate,hidden_nodes):
    correct=0
    start_time=time.time()
    weights_0_1=np.zeros((vocab_size,hidden_nodes))
    weights_1_2=np.random.normal(0.0,1,(hidden_nodes,1))
    del_0_1=np.zeros(weights_0_1.shape)
    del_1_2=np.zeros(weights_1_2.shape)
    for i in range(len(reviews)):
        review=reviews[i]
        label=labels[i]
        target=get_target(label)
        
        x=map_input(review)
        one_input=np.dot(x,weights_0_1)
        one_output=one_input
        
        two_input=np.dot(one_output,weights_1_2)
        two_output=sigmoid(two_input)
        
        error=two_output-target
        two_error_term=error*two_output*(1-two_output)
        one_error_term=np.dot(two_error_term,weights_1_2.T)
        
        delta_1_2=two_error_term*one_output.T
        delta_0_1=one_error_term*x.T
        
        weights_1_2 -=learning_rate*delta_1_2
        weights_0_1 -=learning_rate*delta_0_1
        
        if(two_output >= 0.5 and label == 'POSITIVE'):
            correct += 1
        elif(two_output < 0.5 and label == 'NEGATIVE'):
            correct += 1
        
        elapsed_time = float(time.time() - start_time)
        if(i%1000==0):
            progress=(100 * i/(len(reviews)))
            accuracy=(correct * 100 / float(i+1))
            speed= i / elapsed_time if elapsed_time > 0 else 0
            print("Progress:{}  Correct:{} Accuracy:{} Speed(reviews/per sec):{}".format(progress,correct,accuracy,speed))
            
    return weights_0_1,weights_1_2

In [44]:
weights_0_1,weights_1_2=train(reviews[:-1000],labels[:-1000],learning_rate=0.001,hidden_nodes=10)

Progress:0.0  Correct:1 Accuracy:100.0 Speed(reviews/per sec):0.0
Progress:4.166666666666667  Correct:734 Accuracy:73.32667332667333 Speed(reviews/per sec):64.03791698197278
Progress:8.333333333333334  Correct:1521 Accuracy:76.0119940029985 Speed(reviews/per sec):63.616480549241324
Progress:12.5  Correct:2362 Accuracy:78.70709763412196 Speed(reviews/per sec):63.503809626894615
Progress:16.666666666666668  Correct:3163 Accuracy:79.05523619095226 Speed(reviews/per sec):63.4686022612289
Progress:20.833333333333332  Correct:3971 Accuracy:79.40411917616477 Speed(reviews/per sec):63.55679088502266
Progress:25.0  Correct:4787 Accuracy:79.77003832694551 Speed(reviews/per sec):63.556849954209
Progress:29.166666666666668  Correct:5642 Accuracy:80.58848735894873 Speed(reviews/per sec):63.51509513375379
Progress:33.333333333333336  Correct:6486 Accuracy:81.06486689163854 Speed(reviews/per sec):63.52832754114502
Progress:37.5  Correct:7325 Accuracy:81.37984668370181 Speed(reviews/per sec):63.539825

In [45]:
np.random.seed(40)
def train(reviews,labels,learning_rate,hidden_nodes):
    correct=0
    start_time=time.time()
    weights_0_1=np.zeros((vocab_size,hidden_nodes))
    weights_1_2=np.random.normal(0.0,1,(hidden_nodes,1))
    del_0_1=np.zeros(weights_0_1.shape)
    del_1_2=np.zeros(weights_1_2.shape)
    for i in range(len(reviews)):
        review=reviews[i]
        label=labels[i]
        target=get_target(label)
        
        x=map_input(review)
        indices=set()
        for word in review.split(' '):
            indices.add(word2index[word])

        one_input=np.zeros((1,10))
        for index in indices:
            one_input += weights_0_1[index]
        one_output=one_input
        
        two_input=np.dot(one_output,weights_1_2)
        two_output=sigmoid(two_input)
        
        error=two_output-target
        two_error_term=error*two_output*(1-two_output)
        one_error_term=np.dot(two_error_term,weights_1_2.T)
        
        delta_1_2=two_error_term*one_output.T
        delta_0_1=one_error_term*x.T
        
        weights_1_2 -=learning_rate*delta_1_2
        weights_0_1 -=learning_rate*delta_0_1
        
        if(two_output >= 0.5 and label == 'POSITIVE'):
            correct += 1
        elif(two_output < 0.5 and label == 'NEGATIVE'):
            correct += 1
        
        elapsed_time = float(time.time() - start_time)
        if(i%1000==0):
            progress=(100 * i/(len(reviews)))
            accuracy=(correct * 100 / float(i+1))
            speed= i / elapsed_time if elapsed_time > 0 else 0
            print("Progess:{}  Correct:{} Accuracy:{} Speed:{}".format(progress,correct,accuracy,speed))
            
    return weights_0_1,weights_1_2

In [46]:
weights_0_1,weights_1_2=train(reviews[:-1000],labels[:-1000],learning_rate=0.001,hidden_nodes=10)

Progess:0.0  Correct:1 Accuracy:100.0 Speed:0.0
Progess:4.166666666666667  Correct:734 Accuracy:73.32667332667333 Speed:108.85926099106864
Progess:8.333333333333334  Correct:1521 Accuracy:76.0119940029985 Speed:108.05906733878452
Progess:12.5  Correct:2362 Accuracy:78.70709763412196 Speed:107.40643343001203
Progess:16.666666666666668  Correct:3163 Accuracy:79.05523619095226 Speed:106.86994639601942
Progess:20.833333333333332  Correct:3971 Accuracy:79.40411917616477 Speed:106.49105511281468
Progess:25.0  Correct:4787 Accuracy:79.77003832694551 Speed:106.61171090938163
Progess:29.166666666666668  Correct:5642 Accuracy:80.58848735894873 Speed:106.54992368558295
Progess:33.333333333333336  Correct:6486 Accuracy:81.06486689163854 Speed:106.49551297022731
Progess:37.5  Correct:7325 Accuracy:81.37984668370181 Speed:106.56404437995948
Progess:41.666666666666664  Correct:8183 Accuracy:81.82181781821818 Speed:106.59305403746787
Progess:45.833333333333336  Correct:9031 Accuracy:82.09253704208709 

In [47]:
def test(reviews,labels,weights_0_1,weights_1_2):
    correct=0
    for i in range(len(reviews)):
        review=reviews[i]
        label=labels[i]
        target=get_target(label)
        
        x=map_input(review)
        one_input=np.dot(x,weights_0_1)
        one_output=one_input

        two_input=np.dot(one_output,weights_1_2)
        two_output=sigmoid(two_input)
        if(two_output >= 0.5 and label == 'POSITIVE'):
            correct+=1
        elif(two_output < 0.5 and label == 'NEGATIVE'):
            correct+=1
        if(i%100==0):
            accuracy=correct*100/len(reviews)
            print("Accuracy:{}".format(accuracy))

In [48]:
test(reviews[-1000:],labels[-1000:],weights_0_1,weights_1_2)

Accuracy:0.1
Accuracy:8.8
Accuracy:17.6
Accuracy:26.4
Accuracy:34.9
Accuracy:43.8
Accuracy:52.5
Accuracy:59.8
Accuracy:67.4
Accuracy:76.3


In [49]:
def prediction(review,weights_0_1,weights_1_2):
    x=map_input(review)
    one_input=np.dot(x,weights_0_1)
    one_output=one_input
    two_input=np.dot(one_output,weights_1_2)
    two_output=sigmoid(two_input)
    
    if(two_output>0.5):
        output="POSITIVE"
    elif(two_output>0.5):
        output="NEGATIVE"
    print(output)

In [50]:
prediction(reviews[0],weights_0_1,weights_1_2)

POSITIVE
