In [1]:
import sys, random, math, numpy as np
from collections import Counter

In [2]:
np.random.seed(1)
random.seed(1)


In [3]:
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x:(x.split(" ")), raw_reviews))

In [4]:
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda  x:x[0], wordcnt.most_common())))

In [5]:
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

concatenated = list()
input_dataset = list()

for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)

In [9]:
random.shuffle(input_dataset)
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0

layer_2_target = np.zeros(negative + 1)
layer_2_target[0] = 1

In [10]:
def similar(target):
    target_index = word2index[target]
    
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_diff = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_diff))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))


In [11]:
for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        
        #прогнозирование случайного подмножество, потому что прогнозирование всего словаря слишком много требует вычислений
        target_samples = [review[target_i]]+list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        
        left_context = review[max(0, target_i - window) : target_i]
        right_context = review[target_i + 1: min(len(review), target_i + window)]
        
        layer_1 = np.mean(weights_0_1[left_context + right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context + right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
                                                       *iterations)) + "   " + str(similar('terrible')))
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
                                               *iterations)))
print(similar('terrible'))
        

Progress:0.995   [('terrible', -0.0), ('brilliant', -3.231460136015008), ('horrible', -3.2346887695740794), ('horrid', -3.5408636273187546), ('pathetic', -3.5541401536169843), ('horrendous', -3.6524130884730908), ('magnificent', -3.811352213496053), ('phenomenal', -3.8452778421704), ('ridiculous', -3.8874019027174644), ('fantastic', -3.9429357974454335)]])])][('terrible', -0.0), ('horrible', -3.092257358215906), ('brilliant', -3.1764110366375466), ('pathetic', -3.581886101235946), ('horrendous', -3.647495793922012), ('horrid', -3.6997439837621275), ('magnificent', -3.81403503987757), ('phenomenal', -3.826304134902037), ('dreadful', -3.9217193545849662), ('ridiculous', -3.9664913066611787)]


In [10]:
print(similar('beautiful'))

[('beautiful', -0.0), ('creepy', -3.133661637069066), ('lovely', -3.1829254616359295), ('glamorous', -3.3605481638865915), ('nightmarish', -3.4773418364229407), ('spooky', -3.499259048116791), ('drab', -3.5156567880709453), ('fantastic', -3.581630524320107), ('heartwarming', -3.6028708757731818), ('gorgeous', -3.6061419438561275)]


In [14]:
print(similar('love'))

[('love', -0.0), ('adore', -5.076843711565643), ('dislike', -5.192422162802242), ('commend', -5.266630042401484), ('empathise', -5.450702780254202), ('stupidest', -5.460194766095104), ('prefer', -5.467468671024043), ('debate', -5.532838216355729), ('revisit', -5.539255461253388), ('hate', -5.543813951137705)]


In [24]:
def analogy(positive=['terrible','good'],negative=['bad']):

    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    
    normed_weights = weights_0_1 * norms

    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)[1:]


In [26]:
analogy(['king', 'woman'], ['man'])

[('\n', -339.1801983652029),
 ('woman', -339.34415988120145),
 ('rest', -339.4230021929077),
 ('king', -339.99795859992366),
 ('none', -340.1038813514828),
 ('majority', -340.14557482974095),
 ('father', -340.266150647704),
 ('daughter', -340.32762805755857),
 ('depiction', -340.346613499002)]

In [13]:
norms = np.sum(weights_0_1 * weights_0_1, axis=1)
norms.resize(norms.shape[0], 1)
normed_weights = weights_0_1 * norms

<b>make_sent_vect<b> - преобразование каждого отдельного обзора (списка слов) в векторное представление методом усреднения

функция <b>most_similar_reviews<b> запрашивает обзоры, наиболее похожие на заданный, выполняя скалярное произведение между вектором обзора на входе и векторами всех обзоров

In [25]:
def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x],filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i,val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()

    for idx,score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar

Результат векторного представления сохраняется в матрицу <b>reviews2vectors<b>

In [15]:
reviews2vectors = list()
for review in tokens:
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

In [27]:
most_similar_reviews(['awful', 'boring'])

['i am   and i hated this film its the wor',
 'this was a very good movie i wished i co',
 'i read the reviews before i watched this']