In [3]:
import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)

In [4]:
f = open('data/reviews.txt')
raw_reviews = f.readlines()
f.close()

In [6]:
tokens = list(map(lambda x: x.split(' '), raw_reviews))

In [8]:
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1

In [15]:
vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))

In [16]:
len(vocab)

74075

In [17]:
word2index = {}
for i, w in enumerate(vocab):
    word2index[w] = i

In [19]:
concatenated = list()
inputs = list()

for sent in tokens:
    indices = list()
    for word in sent:
        try:
            indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            pass
    inputs.append(indices)

concatenated = np.array(concatenated)
np.random.shuffle(inputs)

In [23]:
lr = 0.05
n_epochs = 2
hidden_dim = 50
window_size = 2
negative_size = 5

In [24]:
w_0_1 = (np.random.rand(len(vocab), hidden_dim) - 0.5) * 0.2
w_1_2 = np.random.rand(len(vocab), hidden_dim) * 0.1

layer_2_target = np.zeros(negative_size + 1)
layer_2_target[0] = 1

In [32]:
def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    
    for word, index in word2index.items():
        diff = w_0_1[index] - w_0_1[target_index]
        scores[word] = -math.sqrt(sum(diff*diff))
    return scores.most_common(10)

In [26]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [53]:
for epoch in range(n_epochs):
    for rev_i, review in enumerate(inputs):
        for target_i in range(len(review)):
            target_samples = (np.random.rand(negative_size) * len(concatenated)).astype('int').tolist()
            target_samples = [review[target_i]] + list(concatenated[target_samples])
            
            left_context = review[max(0, target_i-window_size):target_i]
            right_context = review[target_i+1: min(len(review), target_i+window_size)]
            
            layer_1 = np.mean(w_0_1[left_context+right_context], axis=0)
            layer_2 = layer_1.dot(w_1_2[target_samples].T)
            
            layer_2_delta = layer_2 - layer_2_target
            layer_1_delta = layer_2_delta.dot(w_1_2[target_samples])
            
            w_0_1[left_context+right_context] -= layer_1_delta * lr
            w_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * lr
            
        if(rev_i % 250 == 0): 
            sys.stdout.write('\rProgress:'+str(rev_i/float(len(inputs))) + " " + str(similar('terrible'))) 
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(inputs))))
print(similar('terrible'))

Progress:0.99996'terrible', -0.0), ('horrible', -1.0048027372288646), ('dreadful', -1.1053679657346602), ('horrendous', -1.2064280852346534), ('fantastic', -1.2975359203184298), ('ridiculous', -1.317513293499622), ('phenomenal', -1.327543326728076), ('spectacular', -1.3367534547492388), ('hideous', -1.3382692303188353), ('lame', -1.3765826439977096)])]2)]]64)][('terrible', -0.0), ('horrible', -0.8950533592188322), ('dreadful', -1.1003424568447575), ('horrendous', -1.1652813716457397), ('brilliant', -1.2749581810654615), ('fantastic', -1.2761867764563264), ('ridiculous', -1.2928733485944284), ('phenomenal', -1.3016815312550982), ('spectacular', -1.304072365070491), ('lame', -1.3148772722836826)]


In [54]:
similarlar('terrible')

[('terrible', -0.0),
 ('horrible', -0.8950533592188322),
 ('dreadful', -1.1003424568447575),
 ('horrendous', -1.1652813716457397),
 ('brilliant', -1.2749581810654615),
 ('fantastic', -1.2761867764563264),
 ('ridiculous', -1.2928733485944284),
 ('phenomenal', -1.3016815312550982),
 ('spectacular', -1.304072365070491),
 ('lame', -1.3148772722836826)]

In [55]:
similar('beautiful')

[('beautiful', -0.0),
 ('gorgeous', -1.054680577883133),
 ('creepy', -1.0912263123244257),
 ('cynical', -1.2933691313458417),
 ('charming', -1.2953615704372208),
 ('lovable', -1.3033254314074523),
 ('courageous', -1.3076863434388148),
 ('lovely', -1.3152572095219595),
 ('shallow', -1.3199527940451148),
 ('bitchy', -1.3430094296132342)]

In [58]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    query_vect = np.zeros(len(w_0_1[0]))
    for w in positive:
        query_vect += w_0_1[word2index[w]]
    
    for w in negative:
        query_vect == w_0_1[word2index[w]]
    
    scores = Counter()
    for word,index in word2index.items():
        diff = w_0_1[index] - query_vect
        scores[word] = -math.sqrt(sum(diff * diff))
    return scores.most_common(10)[1:]

In [71]:
analogy(['terrible', 'great'], ['bad'])

[('decent', -2.148530313786263),
 ('superb', -2.1563083052860814),
 ('fine', -2.163027532633638),
 ('wonderful', -2.1853447802730166),
 ('terrific', -2.2113512820416856),
 ('terrible', -2.2380980837053626),
 ('fantastic', -2.249622530799886),
 ('brilliant', -2.2718469010978755),
 ('solid', -2.3744229974322484)]

In [66]:
analogy(['beautiful','bad'],['great'])

[('cute', -2.209411857075996),
 ('poor', -2.248104156055456),
 ('beautiful', -2.2495242403897),
 ('smart', -2.2628269720791256),
 ('dumb', -2.275955553378969),
 ('creepy', -2.299717469130728),
 ('lame', -2.335163953787769),
 ('sweet', -2.349839890013137),
 ('cool', -2.360097176403257)]