In [3]:
import numpy as np
import random
from gensim.models import Word2Vec
import os
import scipy.spatial

In [51]:
model_w2v_fname = os.path.join('models_ok', 'ok-20161206.w2v.300.bin')
model_w2v = Word2Vec.load(model_w2v_fname)

In [70]:
LEN_TRESHOLD = 20
valid_id_word = [(i, model_w2v.index2word[i]) for i in range(len(model_w2v.index2word)) 
                  if len(model_w2v.index2word[i]) <= LEN_TRESHOLD]
indexes, words = map(list, zip(*valid_id_word))
vectors = model_w2v.syn0[np.array(indexes, dtype='int32')]
vectors = vectors / np.linalg.norm(vectors, axis=1)[:, None]

chars = list(set(''.join(words)))
MAX_ID  = len(chars)
MAX_LEN = max(len(s) for s in words)
MAX_LEN = min(MAX_LEN, LEN_TRESHOLD)
W2V_DIM = model_w2v.syn0.shape[1]


char_to_id = { ch:id for id,ch in enumerate(chars) }
id_to_char = { id:ch for id,ch in enumerate(chars) }
print(char_to_id)
print('MAX_LEN = ', MAX_LEN)
print([s for s in model_w2v.index2word if len(s) == MAX_LEN])

{'9': 0, 'g': 1, '4': 20, 'ч': 3, 'и': 4, 'd': 6, 'M': 31, 'x': 9, 'z': 2, 'h': 13, 'c': 12, 'i': 46, 'm': 18, 'ж': 15, 'с': 17, 'ь': 19, 'j': 28, 'a': 21, 'r': 22, 's': 26, 'N': 25, 'b': 23, 'э': 66, '7': 7, 'u': 29, 'y': 30, 'f': 33, 'n': 32, 'ю': 41, 'п': 5, 'е': 34, 't': 54, 'а': 36, 'v': 37, 'o': 38, 'q': 40, 'к': 8, 'ы': 42, 'w': 35, 'ш': 24, 'в': 44, 'з': 45, 'д': 49, 'ъ': 27, 'м': 10, 'я': 47, 'о': 48, 'х': 50, '8': 51, 'й': 52, '#': 57, 'н': 53, 'б': 11, 'k': 55, 'ц': 56, 'U': 58, 'щ': 39, 'у': 59, '6': 60, 'e': 61, 'г': 62, 'ф': 63, 'т': 64, 'л': 14, 'p': 65, '3': 68, '1': 67, '0': 69, 'l': 16, '2': 43, '5': 70, 'р': 71}
MAX_LEN =  20
['проконсультироваться', 'сельскохозяйственных', 'благотворительностью', 'поэкспериментировать', 'среднестатистический', 'неприятностьнепогода', 'сельскохозяйственной']
<built-in function len>


In [66]:
def word_to_ohe_vector(word):
    result = np.zeros([MAX_LEN, MAX_ID], dtype='int32')
    for i in range(len(word)):
        result[i, char_to_id[word[i]]] = 1
    return result

In [67]:
word_to_ohe_vector('9dx').shape

(20, 72)

In [32]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Activation, Embedding, \
                         Convolution1D, GlobalMaxPooling1D, Lambda, Permute, merge
from keras.layers.recurrent import LSTM, GRU
from keras.regularizers import l2
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.optimizers import Adam, RMSprop

In [33]:
LEN_B = 3
LEN_E = 3

In [151]:
input_b = Input(shape=(LEN_B, MAX_ID, ))
input_m = Input(shape=(1, MAX_ID,  ))
input_e = Input(shape=(LEN_E, MAX_ID, ))

merged = merge([input_b, input_m, input_e], mode='concat', concat_axis=1)

lstm_1 = LSTM(output_dim=128, return_sequences=True, input_dim=(LEN_E + LEN_B + 1, MAX_ID))(merged)
lstm_2 = LSTM(100,return_sequences=True)(lstm_1)
lstm_3 = LSTM(100)(lstm_2)

dense_out = Dense(W2V_DIM)(lstm_3)

mal_model = Model(input=[input_b, input_m, input_e], output=dense_out)

In [152]:
mal_model.summary()
mal_model.compile(loss='cosine_proximity', optimizer='sgd', metrics=['cosine_proximity', 'mse'])

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_46 (InputLayer)            (None, 3, 72)         0                                            
____________________________________________________________________________________________________
input_47 (InputLayer)            (None, 1, 72)         0                                            
____________________________________________________________________________________________________
input_48 (InputLayer)            (None, 3, 72)         0                                            
____________________________________________________________________________________________________
merge_15 (Merge)                 (None, 7, 72)         0           input_46[0][0]                   
                                                                   input_47[0][0]          

In [153]:
def batchGenerator(words, vectors, batch_size=20):
    while 1:
        X_b_batch = np.empty((0, LEN_B, MAX_ID))
        X_e_batch = np.empty((0, LEN_E, MAX_ID))
        X_m_batch = np.empty((0, 1, MAX_ID))
        Y_batch   = np.empty((0, W2V_DIM))
        
        while Y_batch.shape[0] < batch_size:
            i = np.random.choice(len(words))
            len_w = len(words[i])
            
            word_ohe = word_to_ohe_vector(words[i])
            X_b = word_ohe[:LEN_B]
            
            if len_w < LEN_E:
                X_e = word_ohe[:LEN_E]
            else:
                X_e = word_ohe[len_w - LEN_E : len_w]
            X_m = np.sum(word_ohe, axis=0)

            Y = vectors[i]
            yt = model_w2v[words[i]] 
            yt = yt / np.linalg.norm(yt)
            assert np.allclose(Y, yt)
            X_b_batch = np.concatenate((X_b_batch, X_b[None, ...]))
            X_e_batch = np.concatenate((X_e_batch, X_e[None, ...]))
            X_m_batch = np.concatenate((X_m_batch, X_m[None, None, ...]))
            Y_batch   = np.concatenate((Y_batch, Y[None, ...]))
            
        yield [X_b_batch, X_m_batch, X_e_batch], Y_batch

In [154]:

for x, y in batchGenerator(words, vectors):
    print(x[0].shape)
    break

(20, 3, 72)


In [156]:
mal_model.fit_generator(batchGenerator(words, vectors), samples_per_epoch=6000, nb_epoch=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f76970775f8>

In [170]:
def test(model_w2v, model, word='апельсин'):
    len_w = len(word)
    word_ohe = word_to_ohe_vector(word)
    X_b = word_ohe[:LEN_B]

    if len_w < LEN_E:
                    X_e = word_ohe[:LEN_E]
    else:
                    X_e = word_ohe[len_w - LEN_E : len_w]
    X_m = np.sum(word_ohe, axis=0)
    
    res = model.predict([X_b.reshape(1, *X_b.shape), 
                         X_m.reshape(1, 1, *X_m.shape), 
                         X_e.reshape(1, *X_e.shape)])
    
    
    print('cosine similarity:', 
          1 - scipy.spatial.distance.cosine(model_w2v[word], res[0]))
    
    print('sim-by-vec:')
    print(model_w2v.similar_by_vector(res[0]))
    print('sim original:')
    print(model_w2v.most_similar(word))
    
    return res

In [173]:
v = test(model_w2v, mal_model, 'рельсам')

cosine similarity: 0.216107856305
sim-by-vec:
[('небывалое', 0.27472907304763794), ('плазма', 0.27139419317245483), ('дальнее', 0.2707824110984802), ('швейцарском', 0.26794761419296265), ('кале', 0.26429155468940735), ('мане', 0.26239538192749023), ('владивосток', 0.2604215741157532), ('выстраиваются', 0.2596675753593445), ('оро', 0.25935834646224976), ('титан', 0.25295913219451904)]
sim original:
[('электрички', 0.5722105503082275), ('рельс', 0.5523016452789307), ('тротуару', 0.532254695892334), ('машинист', 0.5239666700363159), ('трамваи', 0.5168241262435913), ('вагону', 0.5130778551101685), ('вагоны', 0.5127475261688232), ('галопом', 0.5120807886123657), ('резво', 0.5074643492698669), ('ползут', 0.5065412521362305)]
