In [5]:
import numpy as np
import sys

<b>One-hot encoding<b> - прямое кодирование, описывающее присутсвие/отсутствие точки данных из чилса всех возможных в словаре

In [6]:
onehots = {}
onehots['cat'] = np.array([1, 0, 0, 0])
onehots['the'] = np.array([0, 1, 0, 0])
onehots['dog'] = np.array([0, 0, 1, 0])
onehots['sat'] = np.array([0, 0, 0, 1])

sentence = ['the', 'cat', 'sat']
x = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]

print("Sent encoding: " + str(x))

Sent encoding: [1 1 0 1]


In [7]:
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

In [8]:
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews)) #список из набора слов по каждому ревью

In [9]:
vocab = set()
for sent in tokens:
    for word in sent:
        if len(word) > 0:
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

In [10]:
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)            

In [11]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [12]:
np.random.seed(1)

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

<b>Embeding<b> - векторное предстваление 

In [13]:
correct, total = (0,0)
for iter in range(iterations):
    
    for i in range(len(input_dataset) - 1000): #обучение на первых 24000 обзорах
        x,y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0)) #векторное представление + sigmoid
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2)) #линейный слой + sigmoid
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if np.abs(layer_2_delta) < 0.5:
            correct += 1
        total += 1
        
        if i % 10 == 9:
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\nIter: ' + str(iter)\
                             + ' Progress: ' + progress[2:4] + '.' + progress[4:6] + '%'\
                             + ' Train Accuracy: ' + str(correct/float(total)) + '%')

        


Iter: 0 Progress: 00.03% Train Accuracy: 0.0%
Iter: 0 Progress: 00.07% Train Accuracy: 0.05%
Iter: 0 Progress: 00.11% Train Accuracy: 0.13333333333333333%
Iter: 0 Progress: 00.15% Train Accuracy: 0.225%
Iter: 0 Progress: 00.19% Train Accuracy: 0.22%
Iter: 0 Progress: 00.23% Train Accuracy: 0.23333333333333334%
Iter: 0 Progress: 00.27% Train Accuracy: 0.22857142857142856%
Iter: 0 Progress: 00.31% Train Accuracy: 0.2625%
Iter: 0 Progress: 00.35% Train Accuracy: 0.2777777777777778%
Iter: 0 Progress: 00.39% Train Accuracy: 0.3%
Iter: 0 Progress: 00.43% Train Accuracy: 0.3090909090909091%
Iter: 0 Progress: 00.47% Train Accuracy: 0.2916666666666667%
Iter: 0 Progress: 00.51% Train Accuracy: 0.2846153846153846%
Iter: 0 Progress: 00.55% Train Accuracy: 0.2785714285714286%
Iter: 0 Progress: 00.59% Train Accuracy: 0.28%
Iter: 0 Progress: 00.63% Train Accuracy: 0.28125%
Iter: 0 Progress: 00.67% Train Accuracy: 0.2823529411764706%
Iter: 0 Progress: 00.71% Train Accuracy: 0.2833333333333333%
Iter: 

In [14]:
correct, total = (0, 0)
for i in range(len(input_dataset) - 1000, len(input_dataset)):
    x, y = input_dataset[i], target_dataset[i]
    
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
    
    if np.abs(layer_2 - y) < 0.5:
        correct += 1
    total += 1
print("Test Accuracy: " + str(correct/float(total)))

Test Accuracy: 0.852


<b>Сравнение векторных представлений слов<b> - визуализация сходства весов

In [15]:
from collections import Counter
import math

In [21]:
def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(15)

<h2>Вывод:<h2>
Так проявляется эфффект <k>обобщения корреляции<k>

In [22]:
print(similar())

[('beautiful', -0.0), ('touching', -0.7036722950278169), ('terrific', -0.7066902996056571), ('beautifully', -0.7141365718631594), ('best', -0.7385982910473501), ('fascinating', -0.7533733119843948), ('humour', -0.7547535124730974), ('powerful', -0.7599225491237053), ('especially', -0.7603267242221073), ('episodes', -0.7682438243774412), ('unlike', -0.7698642063889606), ('guys', -0.7731408845554972), ('impact', -0.7737558494512462), ('unique', -0.7742359709497129), ('different', -0.775384894335842)]


In [23]:
print(similar('terrible'))

[('terrible', -0.0), ('annoying', -0.7906749017821814), ('horrible', -0.7958995271621507), ('fails', -0.7974421857803003), ('disappointing', -0.8043794497438661), ('worse', -0.812062184261906), ('laughable', -0.8194684810754737), ('poor', -0.8206892213907724), ('disappointment', -0.8266041119128222), ('boring', -0.8293150095881613), ('dull', -0.8403993199372115), ('pointless', -0.8431263451166848), ('mess', -0.8463057166320204), ('bad', -0.858054378910163), ('badly', -0.8654785217691314)]
