In [1]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

In [2]:
vocab = set()
for sent in tokens:
    for word in sent:
        if len(word)>0:
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i


In [3]:
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            pass
    input_dataset.append(list(set(sent_indices)))


In [4]:
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [5]:
import numpy as np
np.random.seed(1)

In [6]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

In [40]:
alpha,iteration = (0.01,2)
hidden_size = 100

weight_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weight_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

In [41]:
correct, total = (0,0)
for j in range(iteration):
    for i in range(len(input_dataset)-1000):
        x,y = (input_dataset[i],target_dataset[i])
        layer_1 = sigmoid(np.sum(weight_0_1[x],axis=0))
        layer_2 = sigmoid(np.dot(layer_1,weight_1_2))

        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weight_1_2.T)

        weight_0_1[x] -= layer_1_delta*alpha
        weight_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        if np.abs(layer_2_delta) < 0.5:
            correct+=1
        total+=1
        if(i%10==9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(j)\
            +' Progress:'+progress[2:4]\
            +'.'+progress[4:6]\
            +'% Training Accuracy:'\
            + str(correct/float(total)) + '%')
    print()

Iter:0 Progress:95.99% Training Accuracy:0.8331666666666667%
Iter:1 Progress:95.99% Training Accuracy:0.8669166666666667%


In [42]:
from collections import Counter
import math

In [43]:
def similar(target):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weight_0_1[index] -weight_0_1[target_index]
        squared_difference = raw_difference**2
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)

In [44]:
similar('beautiful')

[('beautiful', -0.0),
 ('outstanding', -0.708857796462495),
 ('realistic', -0.7122709826169423),
 ('freedom', -0.7491271062297961),
 ('best', -0.7668683241791592),
 ('awesome', -0.7684767131032523),
 ('shows', -0.7693739292245926),
 ('sent', -0.7700535217091992),
 ('heart', -0.771640929220134),
 ('remember', -0.7751921801370062)]

In [45]:
similar('terrible')

[('terrible', -0.0),
 ('disappointing', -0.734925428445247),
 ('annoying', -0.7548979778499003),
 ('worse', -0.7596787198105618),
 ('boring', -0.774359705311699),
 ('dull', -0.7968526865052934),
 ('fails', -0.7981457526577221),
 ('disappointment', -0.8010134597765881),
 ('avoid', -0.8032134607648821),
 ('horrible', -0.8039548044397373)]