In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time

# Pickle file that the load cell unpacks into (full_dict, full_sentences).
w2v_src_file = '../DataSets/Quora/w2v_src_180115.pickle'
# TensorFlow checkpoint that is restored through tf.train.Saver further down.
w2v_model = '../Models-23Quora03-W2V/model-02.ckpt'
# Dictionary size handed to recode_max_dict (entries past it are merged into '<UNK>').
w2v_size = 9000

In [2]:
def recode_max_dict(sentences, full_dict, dict_size):
    """Truncate a dictionary to ``dict_size`` entries and re-encode sentences.

    The first ``dict_size - 1`` entries of ``full_dict`` are kept as they are.
    All remaining entries are merged into one trailing ``'<UNK>'`` entry whose
    second and third fields are the sums of those fields over the merged
    entries. Every id in ``sentences`` larger than ``dict_size - 1`` is
    clamped to ``dict_size - 1``.

    Args:
        sentences: Iterable of iterables of integer ids indexing ``full_dict``.
        full_dict: Sequence of indexable entries; fields 1 and 2 of every
            entry must be summable (presumably a count and a frequency --
            TODO confirm against the code that produced the dictionary).
        dict_size: Size of the new dictionary including ``'<UNK>'``; must be
            at least 1.

    Returns:
        Tuple ``(new_sentences, new_dict)``. ``full_dict`` and ``sentences``
        are not modified.

    Raises:
        ValueError: If ``dict_size`` is smaller than 1. A negative
            ``last_ind`` would otherwise slice from the end of ``full_dict``
            and clamp every id to a negative value.
    """
    if dict_size < 1:
        raise ValueError('dict_size must be >= 1, got {!r}'.format(dict_size))

    last_ind = dict_size - 1
    # list(...) keeps the result appendable even when full_dict is a tuple.
    new_dict = list(full_dict[:last_ind])
    merged = full_dict[last_ind:]
    new_num = sum(x[1] for x in merged)
    new_freq = sum(x[2] for x in merged)
    new_dict.append(('<UNK>', new_num, new_freq, 1))

    new_sentences = [[min(last_ind, z) for z in x] for x in sentences]
    return (new_sentences, new_dict)

In [3]:
%%time
# Read the pickled (dictionary, sentences) pair from w2v_src_file.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at a trusted file.
with open(w2v_src_file, 'rb') as f:
    (full_dict, full_sentences) = pickle.load(f)
    
# Cut the dictionary down to w2v_size entries (the tail becomes '<UNK>') and re-encode the sentences.
(w2v_src, w2v_dict) = recode_max_dict(full_sentences, full_dict, dict_size=w2v_size)

Wall time: 15.2 s


In [19]:
# Reverse lookup: first field of every w2v_dict entry -> its position in w2v_dict.
mapper = dict((entry[0], position) for position, entry in enumerate(w2v_dict))

def word2idx(w):
    """Return the index stored for ``w`` in ``mapper``, or the ``'<UNK>'`` index if ``w`` is absent."""
    key = w if w in mapper else '<UNK>'
    return mapper[key]
    
def idx2word(i):
    """Translate an index, or a (nested) list / ndarray of indices, into words.

    Lists come back as lists and ndarrays as ndarrays, converted element by
    element. A scalar index that is ``>= len(w2v_dict)`` yields ``'<ERR>'``;
    otherwise the first field of ``w2v_dict[i]`` is returned.
    """
    kind = type(i)
    if kind is list:
        return list(map(idx2word, i))
    if kind is np.ndarray:
        return np.array(list(map(idx2word, i)))
    return '<ERR>' if i >= len(w2v_dict) else w2v_dict[i][0]

In [5]:
# Graph hyper-parameters.
DICT_SIZE = len(w2v_dict)  # number of rows in the embedding / NCE weight matrices
EMBED_SIZE = 200  # number of columns (embedding dimension)
NCE_NUM_SAMPLED = 100  # passed to tf.nn.nce_loss as num_sampled

# Initial values: (DICT_SIZE, EMBED_SIZE) draws from a zero-mean, identity-covariance
# normal, scaled by 1/sqrt(EMBED_SIZE). No random seed is set here.
init_embeding = np.random.multivariate_normal(np.zeros(EMBED_SIZE), np.identity(EMBED_SIZE), size=DICT_SIZE)/np.sqrt(EMBED_SIZE)
init_beta = np.random.multivariate_normal(np.zeros(EMBED_SIZE), np.identity(EMBED_SIZE), size=DICT_SIZE)/np.sqrt(EMBED_SIZE)
init_intercept = np.zeros((DICT_SIZE,))

# Start from a fresh default graph before adding the nodes below.
tf.reset_default_graph()

# NOTE(review): the tf.Variable objects below are created without explicit names, so the
# keys used in the checkpoint presumably depend on scope and creation order -- confirm
# before reordering anything in this cell.
with tf.name_scope('Input'):
    # Word ids (int32, shape (None,)) and their context ids (int32, shape (None, 1)).
    tf_in_word = tf.placeholder(tf.int32, shape=(None, ), name='in_word')
    tf_in_context = tf.placeholder(tf.int32, shape=(None, 1), name='in_context')
    # Scalar weight of the regularisation term; defaults to 0.1 when not fed.
    tf_in_regularization = tf.placeholder_with_default(0.1, shape=(), name='in_regularization')
    
with tf.name_scope('Embedding'):
    tf_embedding = tf.Variable(init_embeding, dtype=tf.float32)
    # Rows of the embedding matrix selected by the input word ids.
    tf_embedded_word = tf.nn.embedding_lookup(tf_embedding, tf_in_word, name='out_embedding')
    
with tf.name_scope('Training'):
    tf_nce_beta = tf.Variable(init_beta, dtype=tf.float32)
    tf_nce_intercept = tf.Variable(init_intercept, dtype=tf.float32)
    # Mean NCE loss with the context ids as labels and the embedded words as inputs.
    tf_nce_loss = tf.reduce_mean(
                    tf.nn.nce_loss(weights=tf_nce_beta, biases=tf_nce_intercept,
                                   labels=tf_in_context, inputs=tf_embedded_word,
                                   num_sampled=NCE_NUM_SAMPLED, num_classes=DICT_SIZE))
    #tf_reg_loss = tf.sqrt(tf.reduce_mean(tf.square(tf_embedding))) #bad loss
    # Root-mean-square of the column means of the embedding matrix: it is zero when the
    # embedding rows average to the origin.
    tf_reg_loss = tf.sqrt(tf.reduce_mean(tf.square(tf.reduce_mean(tf_embedding, axis=0)))) #center of embedding is 0
    tf_full_loss = tf_nce_loss + tf_in_regularization * tf_reg_loss
    tf_train = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(tf_full_loss)
    
with tf.name_scope('Validation'):
    # Every id 0..DICT_SIZE-1, used to look up the whole embedding matrix.
    tf_valid_dictionary = tf.constant(np.array(range(DICT_SIZE)))
    tf_valid_embedding = tf.nn.embedding_lookup(tf_embedding, tf_valid_dictionary)
    # Divide each row by its L2 norm, for the input words and for the whole dictionary.
    # NOTE(review): `keep_dims` was renamed `keepdims` in later TF 1.x releases -- confirm
    # against the installed TensorFlow version.
    tf_valid_in_norm = tf_embedded_word / tf.sqrt(tf.reduce_sum(tf.square(tf_embedded_word), 1, keep_dims=True))
    tf_valid_dic_norm = tf_valid_embedding / tf.sqrt(tf.reduce_sum(tf.square(tf_valid_embedding), 1, keep_dims=True))
    # Dot products of unit rows, i.e. cosine similarity of each input word to every dictionary word.
    tf_valid_similarity = tf.matmul(tf_valid_in_norm, tf_valid_dic_norm, transpose_b=True)
    
# Write the graph definition for TensorBoard. NOTE(review): the log directory is a
# hard-coded absolute path.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_W2V', tf.get_default_graph())
tffw.close()
print('Graph creation complete.')

Graph creation complete.


In [8]:
# Saver for the variables of the default graph; it must be created after the graph cell has run.
tfsSaver = tf.train.Saver()

with tf.Session() as tfs:
    # Load the variable values from the checkpoint at w2v_model; no initializer is run.
    tfsSaver.restore(tfs, save_path=w2v_model)
    # Evaluate the row-normalised embedding matrix in this session and keep the result.
    dic_embed = tf_valid_dic_norm.eval()
    
print('Complete')

INFO:tensorflow:Restoring parameters from ../Models-23Quora03-W2V/model-02.ckpt
Complete


In [112]:
def word2vec(wrd, embed=dic_embed):
    """Look up the embedding row(s) for a word or a collection of words.

    Args:
        wrd: A string, or a (possibly nested) list / ndarray of strings.
        embed: Matrix indexed by the ids returned by ``word2idx``. The default
            is the ``dic_embed`` object that existed when this function was
            defined.

    Returns:
        ``embed[word2idx(wrd)]`` for a string, a list of recursively converted
        elements for a list or ndarray, and ``None`` for any other type.
    """
    # isinstance (not `type(...) is str`) so that the np.str_ elements produced by
    # iterating over a string ndarray are recognised as words.
    if isinstance(wrd, str):
        return embed[word2idx(wrd)]
    if isinstance(wrd, (list, np.ndarray)):
        # Pass `embed` on; otherwise nested lookups silently fall back to the default matrix.
        return [word2vec(x, embed) for x in wrd]
    return None

def topNids(vec, embed=dic_embed):
    """Rank every row of ``embed`` by Euclidean distance to ``vec``.

    Returns a tuple ``(order, sorted_distances, mean_distance)``: the row
    indices from nearest to farthest, the distances in that same order, and
    the mean distance over all rows.
    """
    offsets = embed - vec
    distances = np.sqrt(np.square(offsets).sum(axis=1))
    order = distances.argsort()
    return (order, distances[order], distances.mean())

In [144]:
# Centre the embedding: subtract the column-wise mean (taken over all rows) from every row.
n_embed = dic_embed - np.mean(dic_embed, axis=0, keepdims=True)

In [246]:
# Nearest neighbours of 'solve' in n2_embed.
# NOTE: n2_embed is assigned in a cell further down (In [184]); on a fresh kernel that
# cell has to run before this one.
v = word2vec('solve', n2_embed)
print(np.sqrt(np.sum(np.square(v))))  # L2 norm of the query vector
res = topNids(v, n2_embed)
# Ten closest words, their distances to the query, and the mean distance over all rows.
idx2word(res[0][:10]), res[1][:10], res[2]

1.0


(array(['solve', 'translate', 'tackle', 'handle', 'fix', 'simplify',
        'treat', 'identify', 'relate', 'resolve'],
       dtype='<U9'),
 array([ 0.        ,  0.80219698,  0.8729493 ,  0.94723475,  0.94835615,
         0.95306551,  0.95552576,  0.9576475 ,  0.97037911,  0.97669458], dtype=float32),
 1.4155799)

In [240]:
# Nearest neighbours of 'she' in pca_embed.
# NOTE: pca_embed is assigned in a cell further down (In [214]); on a fresh kernel that
# cell has to run before this one.
v = word2vec('she', pca_embed)
print(np.sqrt(np.sum(np.square(v))))  # L2 norm of the query vector
res = topNids(v, pca_embed)
# Ten closest words, their distances to the query, and the mean distance over all rows.
idx2word(res[0][:10]), res[1][:10], res[2]

1.01188699138


(array(['she', 'he', 'he/she', "he's", "she's", "i've", 'everyone',
        'dumbledore', 'anybody', 'boyfriend'],
       dtype='<U10'),
 array([ 0.        ,  0.39814789,  0.75271419,  0.88412588,  0.95190096,
         0.96305999,  0.97022316,  0.97119915,  0.98680196,  0.99613919]),
 1.4185787155359437)

In [184]:
# Rescale every row of the centred embedding to unit Euclidean length.
n2_embed = n_embed / np.sqrt(np.square(n_embed).sum(axis=1, keepdims=True))

In [157]:
# Sanity check: L2 norms of the first ten rows of n2_embed.
np.sqrt(np.square(n2_embed).sum(axis=1))[:10]

array([ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  0.99999994,  1.        ,  1.        ,  1.        ], dtype=float32)

In [203]:
import sklearn, sklearn.decomposition

In [214]:
# Fit a PCA with default arguments on n2_embed and keep the transformed rows.
pca_embed = sklearn.decomposition.PCA().fit_transform(n2_embed)

In [224]:
# Column-wise minimum and maximum of the PCA-transformed embedding.
pca_embed.min(axis=0), pca_embed.max(axis=0)

(array([-0.6333091 , -0.54214307, -0.52956626, -0.5003707 , -0.55046846,
        -0.54390562, -0.54141343, -0.47641616, -0.50917155, -0.48353662,
        -0.45214015, -0.42458806, -0.43480314, -0.4115022 , -0.39524293,
        -0.43762931, -0.3771256 , -0.35240305, -0.38641964, -0.43003585,
        -0.41754688, -0.43939526, -0.31356668, -0.33419397, -0.413508  ,
        -0.33128372, -0.35395492, -0.30495326, -0.33358539, -0.36043662,
        -0.33728471, -0.27333219, -0.27416362, -0.32134146, -0.29503932,
        -0.31137491, -0.27792932, -0.29527525, -0.28394695, -0.30838947,
        -0.29131041, -0.27624538, -0.28829076, -0.28900215, -0.26731326,
        -0.27431599, -0.29866832, -0.27918682, -0.24630253, -0.29479382,
        -0.26452575, -0.24393355, -0.26333232, -0.27787186, -0.22743721,
        -0.24611424, -0.24179171, -0.26989162, -0.27518141, -0.25935061,
        -0.2580018 , -0.26187867, -0.28574267, -0.23549507, -0.26484246,
        -0.24259327, -0.26928762, -0.28356649, -0.2