## Data preprocessing

In [1]:
import sys

In [2]:
f = open('data/reviews.txt')

In [3]:
raw_reviews = f.readlines()
f.close()

In [4]:
f = open('data/labels.txt')
raw_labels = f.readlines()
f.close()

In [5]:
tokens = list(map(lambda x: set(x.split(' ')), raw_reviews))

In [8]:
vocab = set()
for sent in tokens:
    for word in sent:
        if len(word) > 0:
            vocab.add(word)
vocab = list(vocab)

In [9]:
len(vocab)

74074

In [10]:
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

In [16]:
inputs = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            pass
    inputs.append(list(set(sent_indices)))

In [17]:
len(inputs)

25000

In [13]:
targets = list(map(lambda label: 1 if label == 'positive\n' else 0, raw_labels))

In [14]:
len(targets)

25000

## Define neural network

In [19]:
import numpy as np
np.random.seed(1)

In [20]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [21]:
lr = 0.01
n_epochs = 2
hidden_dim = 100

In [24]:
w_0_1 = 0.2 * np.random.rand(len(vocab), hidden_dim) - 0.1
w_1_2 = 0.2 * np.random.rand(hidden_dim, 1) - 0.1

In [25]:
correct, total = 0, 0

## Train

In [58]:
for e in range(n_epochs):
    for i in range(len(inputs) - 1000):
        x, y = inputs[i], targets[i]
        
        # Feed forward
        layer_1 = sigmoid(np.sum(w_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, w_1_2))
        
        # Compare loss
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(w_1_2.T)
        
        # Backprops (Learning)
        w_0_1[x] -= layer_1_delta * lr
        w_1_2 -= np.outer(layer_1, layer_2_delta) * lr
        
        if(np.abs(layer_2_delta) < 0.5): 
            correct += 1
        total += 1
        
        if(i % 10 == 9):
            progress = str(i/float(len(inputs))) 
            sys.stdout.write('\rIter:'+ str(e)\
                +' Progress:'+ progress[2:4]\
                + '.'+ progress[4:6]\
                + '% Training Accuracy:'\
                + str(correct/float(total)) + '%')

Iter:1 Progress:95.99% Training Accuracy:0.9505510204081633%

## Validate

In [59]:
correct,total = (0,0)
for i in range(len(inputs)-1000, len(inputs)):
    x = inputs[i] 
    y = targets[i]
    layer_1 = sigmoid(np.sum(w_0_1[x],axis=0)) 
    layer_2 = sigmoid(np.dot(layer_1,w_1_2))
    if(np.abs(layer_2 - y) < 0.5): 
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct/float(total)))

Test Accuracy:0.847


## Similarity

In [60]:
from collections import Counter 
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, i in word2index.items():
        raw_diff = w_0_1[i] - w_0_1[target_index]
        squared_diff = raw_diff * raw_diff
        scores[word] = -math.sqrt(sum(squared_diff))
    return scores.most_common(10)

In [62]:
similar('beautiful')

[('beautiful', -0.0),
 ('overwhelmed', -0.6708802890215515),
 ('coaster', -0.704180637943038),
 ('marvel', -0.719600915406048),
 ('success', -0.725215928067329),
 ('blew', -0.7293311751014006),
 ('abrupt', -0.7348427545210663),
 ('spain', -0.7377739615522476),
 ('wrapped', -0.7397429336486291),
 ('carrie', -0.7404587129960085)]

In [63]:
similar('terrible')

[('terrible', -0.0),
 ('redeeming', -0.7664680910662351),
 ('unwatchable', -0.7817053622502813),
 ('ludicrous', -0.7862764855535681),
 ('boring', -0.79269943619506),
 ('mildly', -0.806670140217177),
 ('forgettable', -0.8081272794659068),
 ('obnoxious', -0.8111341204820561),
 ('skip', -0.815202790395448),
 ('stupidity', -0.818122070925573)]