In [128]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time

# Input pickle; it is unpickled below into the (full_dict, full_sentences) pair.
w2v_src_file = '../DataSets/Quora/w2v_src_180115.pickle'
# Output pickle; the last cell writes the (full_dict, full_sentences, full_w2v) tuple to it.
w2v_res_file = '../DataSets/Quora/w2v_res_180119.pickle'
# Value passed to recode_max_dict as dict_size (the '<UNK>' entry counts towards it).
w2v_size = 9000

In [2]:
# Read the pickled (dictionary, sentences) pair.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open(w2v_src_file, mode='rb') as f:
    loaded_pair = pickle.load(f)
    full_dict, full_sentences = loaded_pair
    del loaded_pair

In [3]:
def recode_max_dict(sentences, full_dict, dict_size):
    """Truncate the vocabulary to ``dict_size`` entries and re-encode the sentences.

    The first ``dict_size - 1`` entries of ``full_dict`` are kept unchanged.
    All remaining entries are folded into one ``'<UNK>'`` entry appended after
    them; its fields 1 and 2 are the sums of the same fields of the folded
    entries.  Every word id in ``sentences`` is clamped to ``dict_size - 1``.
    Neither input is modified.

    When ``dict_size`` exceeds ``len(full_dict)`` nothing is folded: the
    ``'<UNK>'`` entry is appended with zero sums and the ids stay unchanged.

    Args:
        sentences: iterable of sequences of integer word ids (indices into
            ``full_dict``).
        full_dict: sliceable sequence of tuples; field 0 is the word, fields 1
            and 2 are numeric (presumably count and frequency -- verify
            against the producer of the source pickle).
        dict_size: size of the resulting vocabulary including ``'<UNK>'``;
            must be at least 1.

    Returns:
        Tuple ``(new_sentences, new_dict)``: the re-encoded sentences as a list
        of lists and the truncated dictionary as a list.

    Raises:
        ValueError: if ``dict_size`` is smaller than 1.
    """
    # A non-positive size would give a negative slice bound (dropping entries
    # from the END of the dictionary) and clamp every id to a negative value.
    if dict_size < 1:
        raise ValueError('dict_size must be at least 1, got {}'.format(dict_size))

    unk_id = dict_size - 1
    new_dict = list(full_dict[:unk_id])
    folded = full_dict[unk_id:]
    unk_num = sum(entry[1] for entry in folded)
    unk_freq = sum(entry[2] for entry in folded)
    # The trailing 1 is copied from the original code; its meaning is not
    # visible here (presumably a marker for the synthetic entry -- TODO confirm).
    new_dict.append(('<UNK>', unk_num, unk_freq, 1))

    new_sentences = [[min(unk_id, word_id) for word_id in sentence] for sentence in sentences]
    return (new_sentences, new_dict)

In [4]:
%%time
# Cap the vocabulary at w2v_size entries and re-encode all sentences accordingly;
# full_dict and full_sentences themselves are left as loaded.
(w2v_src, w2v_dict) = recode_max_dict(full_sentences, full_dict, dict_size=w2v_size)

Wall time: 3.27 s


In [5]:
# Reverse lookup table: word -> index of its entry in w2v_dict
# (if a word occurs more than once, the last index wins).
mapper = {entry[0]: position for (position, entry) in enumerate(w2v_dict)}

def word2idx(w):
    """Return the vocabulary index of word ``w``; unknown words get the '<UNK>' index."""
    return mapper[w] if w in mapper else mapper['<UNK>']
    
def idx2word(i):
    """Return the word stored at vocabulary index ``i``.

    Returns the placeholder ``'<ERR>'`` when ``i`` is outside
    ``0 .. len(w2v_dict) - 1``.  Negative values are rejected as well;
    otherwise Python's negative indexing would silently return a word
    counted from the end of the dictionary.
    """
    if not 0 <= i < len(w2v_dict):
        return '<ERR>'
    return w2v_dict[i][0]

In [6]:
def form_batch(data, ids):
    """Turn sampled index triples into arrays of word ids and context ids.

    Args:
        data: sequence of sentences, each a sequence of integer word ids.
        ids: sequence of ``(sentence_index, word_pos, context_pos)`` triples.

    Returns:
        Tuple ``(words, contexts)`` of 1-D arrays with one element per triple;
        both are empty when ``ids`` is empty.
    """
    pairs = [[data[r[0]][r[1]], data[r[0]][r[2]]] for r in ids]
    # reshape keeps the array two-dimensional even for an empty batch, so the
    # column slices below cannot fail with an IndexError.
    tmp = np.array(pairs).reshape(-1, 2)
    return tmp[:, 0], tmp[:, 1]


def yield_batch(data, batch_size, p_word=1, p_context=((-1, 0.8), (1, 0.8)), num_batches=-1, verbose=True):
    """Generate random (word, context) training batches.

    Sampling repeats until ``batch_size`` pairs are collected: a sentence is
    drawn uniformly from ``data``, then a position inside it; for every
    ``(offset, probability)`` in ``p_context`` whose target position lies
    inside the sentence, the pair is accepted with that probability.  A single
    word may contribute several pairs, so the list is cut to ``batch_size``
    before being converted by ``form_batch``.  Randomness comes from the
    global ``np.random`` state.

    Args:
        data: sequence of sentences, each a sequence of integer word ids.
        batch_size: number of pairs per yielded batch.
        p_word: if a list or ``np.ndarray``, it is indexed by word id and
            gives the probability of keeping a sampled centre word; any other
            value keeps every sampled word.
        p_context: iterable of ``(relative_offset, acceptance_probability)``
            pairs describing the context window.
        num_batches: if positive, stop after that many batches and print a
            completion message; otherwise yield batches forever.
        verbose: if true (and ``num_batches`` is positive), print a
            carriage-return progress line for each batch.

    Yields:
        ``(words, contexts)`` tuples of 1-D arrays as returned by ``form_batch``.

    Raises:
        ValueError: if ``p_context`` is empty or no sentence in ``data`` is
            long enough to produce a pair (the sampling loop could never
            finish in either case).
    """
    p_context = list(p_context)
    if not p_context:
        raise ValueError('p_context must contain at least one (offset, probability) pair')
    # A sentence of length n can host an offset r only when n > |r|.
    min_reach = min(abs(rj) for (rj, _) in p_context)
    if not any(len(sentence) > min_reach for sentence in data):
        raise ValueError('no sentence in data is long enough for the offsets in p_context')

    use_word_prob = isinstance(p_word, (list, np.ndarray))
    data_len = len(data)
    batch_id = 0
    while True:
        batch_id += 1
        if num_batches > 0:
            if batch_id > num_batches:
                # Trailing tabs overwrite what is left of the progress line.
                print('Completed yielding batches {}\t\t'.format(num_batches))
                break
            # Progress is shown when verbose is ON (the original test was inverted).
            if verbose:
                print('Yielding batch {} out of {}'.format(batch_id, num_batches), end='\r')
        ids = []
        while len(ids) < batch_size:
            id0 = np.random.randint(data_len)
            if len(data[id0]) == 0:
                continue
            idi = np.random.randint(len(data[id0]))
            idx = data[id0][idi]
            # Optional sub-sampling of the centre word by its id.
            if use_word_prob and np.random.uniform() > p_word[idx]:
                continue
            for (rj, prob) in p_context:
                j = idi + rj
                if j < 0 or j >= len(data[id0]):
                    continue
                if np.random.uniform() > prob:
                    continue
                ids.append((id0, idi, j))

        yield form_batch(data, ids[:batch_size])

In [31]:
# Build the TensorFlow graph: an embedding lookup trained with an NCE loss,
# plus a cosine-similarity head used to inspect nearest neighbours.

# Model dimensions: vocabulary size, embedding width, and the num_sampled
# value handed to tf.nn.nce_loss.
DICT_SIZE = len(w2v_dict)
EMBED_SIZE = 200
NCE_NUM_SAMPLED = 100

# Initial values: every row is a zero-mean, identity-covariance normal draw
# scaled by 1/sqrt(EMBED_SIZE); the NCE intercepts start at zero.
init_embeding = np.random.multivariate_normal(np.zeros(EMBED_SIZE), np.identity(EMBED_SIZE), size=DICT_SIZE)/np.sqrt(EMBED_SIZE)
init_beta = np.random.multivariate_normal(np.zeros(EMBED_SIZE), np.identity(EMBED_SIZE), size=DICT_SIZE)/np.sqrt(EMBED_SIZE)
init_intercept = np.zeros((DICT_SIZE,))

# Start from an empty default graph so re-running this cell does not pile up ops.
tf.reset_default_graph()

with tf.name_scope('Input'):
    # Word ids, shape (batch,).
    tf_in_word = tf.placeholder(tf.int32, shape=(None, ), name='in_word')
    # Context-word ids, shape (batch, 1); fed as `labels` to tf.nn.nce_loss below.
    tf_in_context = tf.placeholder(tf.int32, shape=(None, 1), name='in_context')
    # Weight of the regularization term; 0.1 unless another value is fed.
    tf_in_regularization = tf.placeholder_with_default(0.1, shape=(), name='in_regularization')
    
with tf.name_scope('Embedding'):
    # Trainable embedding matrix, shape (DICT_SIZE, EMBED_SIZE), and the rows selected by tf_in_word.
    tf_embedding = tf.Variable(init_embeding, dtype=tf.float32)
    tf_embedded_word = tf.nn.embedding_lookup(tf_embedding, tf_in_word, name='out_embedding')
    
with tf.name_scope('Training'):
    # Output-side weights and biases of the NCE classifier.
    tf_nce_beta = tf.Variable(init_beta, dtype=tf.float32)
    tf_nce_intercept = tf.Variable(init_intercept, dtype=tf.float32)
    # Mean over the batch of the NCE loss values.
    tf_nce_loss = tf.reduce_mean(
                    tf.nn.nce_loss(weights=tf_nce_beta, biases=tf_nce_intercept,
                                   labels=tf_in_context, inputs=tf_embedded_word,
                                   num_sampled=NCE_NUM_SAMPLED, num_classes=DICT_SIZE))
    #tf_reg_loss = tf.sqrt(tf.reduce_mean(tf.square(tf_embedding))) #bad loss
    # Regularizer: root-mean-square of the column-wise mean of the embedding
    # matrix, i.e. a penalty on the centroid of all embeddings moving away from 0.
    tf_reg_loss = tf.sqrt(tf.reduce_mean(tf.square(tf.reduce_mean(tf_embedding, axis=0)))) #center of embedding is 0
    tf_full_loss = tf_nce_loss + tf_in_regularization * tf_reg_loss
    tf_train = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(tf_full_loss)
    
with tf.name_scope('Validation'):
    # All vocabulary ids 0 .. DICT_SIZE-1 and their embedding rows.
    tf_valid_dictionary = tf.constant(np.array(range(DICT_SIZE)))
    tf_valid_embedding = tf.nn.embedding_lookup(tf_embedding, tf_valid_dictionary)
    # Rows divided by their L2 norm, for the fed words and for the whole vocabulary.
    tf_valid_in_norm = tf_embedded_word / tf.sqrt(tf.reduce_sum(tf.square(tf_embedded_word), 1, keep_dims=True))
    tf_valid_dic_norm = tf_valid_embedding / tf.sqrt(tf.reduce_sum(tf.square(tf_valid_embedding), 1, keep_dims=True))
    # Cosine similarity of every fed word against every vocabulary word, shape (batch, DICT_SIZE).
    tf_valid_similarity = tf.matmul(tf_valid_in_norm, tf_valid_dic_norm, transpose_b=True)
    
# Write the graph definition to the log directory, then close the writer.
# NOTE(review): the log path is a hard-coded absolute Windows path.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_W2V', tf.get_default_graph())
tffw.close()
print('Graph creation complete.')

Graph creation complete.


In [8]:
%%time
# Draw a fixed set of 32 batches of 32768 pairs each to evaluate the loss on.
valid_set = list(yield_batch(w2v_src, batch_size=32768, num_batches=32))
# Regroup the per-batch (words, contexts) tuples into two flat arrays.
valid_x, valid_y = (np.hstack(part) for part in zip(*valid_set))

Completed yielding batches 32		
Wall time: 11.5 s


In [34]:
# Saver limited to the five most recent checkpoints.
tfsSaver = tf.train.Saver(max_to_keep=5)

# Probe words whose nearest neighbours are printed during training.
simvalid_x = np.array([word2idx(probe) for probe in (
    'two', 'this', 'are', 'bad',
    'price', 'number', 'xbox', 'math',
    'book', 'trump', 'college',
)])
simvalid_dict = {tf_in_word: simvalid_x}
valid_dict = {
    tf_in_word: valid_x,
    tf_in_context: valid_y.reshape(-1, 1),
}

# Sub-sampling hyper-parameters (only referenced by the disabled formula below).
hp_w2v_num0, hp_w2v_alpha = 50, -0.5

# Field 1 of every dictionary entry.
p_w2v_wordnum = np.array([entry[1] for entry in w2v_dict])
# Scalar: no word sub-sampling in this run.
p_w2v_word = 1 #np.power(np.maximum(1, p_w2v_wordnum / hp_w2v_num0), hp_w2v_alpha) 
# Context window: only the immediate neighbours, accepted with probability 0.99.
p_w2v_context = [(-1, 0.99), (1, 0.99)]
#p_w2v_context = [(-2, 0.3), (-1, 0.8), (1, 0.8), (2, 0.5), (3, 0.2)]

In [35]:
# Training from scratch: initialise all variables, report the starting point,
# then run num_epochs rounds of 10000 batches with a checkpoint after each round.
num_epochs = 100

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())

    # Baseline loss and nearest neighbours of the probe words before training.
    sim = tfs.run(tf_valid_similarity, feed_dict=simvalid_dict)
    nce_loss, reg_loss = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
    print('Starting loss={:.3f} ({:.3f} reg-loss)'.format(nce_loss, reg_loss))
    for q in range(len(sim)):
        # Ten vocabulary entries with the highest similarity, best first.
        print([idx2word(z) for z in sim[q].argsort()[::-1][:10]])

    for i in range(num_epochs):
        t0 = time.perf_counter()
        for (train_x, train_y) in yield_batch(w2v_src, batch_size=512, num_batches=10000,
                                              p_word=p_w2v_word, p_context=p_w2v_context,
                                              verbose=False):
            train_dict = {
                tf_in_word: train_x,
                tf_in_context: train_y.reshape(-1, 1),
            }
            tfs.run(tf_train, feed_dict=train_dict)

        # Evaluate on the fixed validation set and keep the normalised embeddings.
        sim = tfs.run(tf_valid_similarity, feed_dict=simvalid_dict)
        nce_loss, reg_loss = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
        dic_embed = tfs.run(tf_valid_dic_norm)
        t1 = time.perf_counter()
        print('Step complete in {0:.2f} sec, loss={1:.3f} ({2:.3f} reg-loss)'.format(t1-t0, nce_loss, reg_loss))
        for q in range(len(sim)):
            print([idx2word(z) for z in sim[q].argsort()[::-1][:10]])
        p = tfsSaver.save(tfs, 'D:/Jupyter/Models/Models-23Quora03-W2VS1/model-{0:02d}.ckpt'.format(i))
        print('Model saved at checkpoint: {0}'.format(p))

print('Complete')

Starting loss=315.089 (0.001 reg-loss)
['two', 'marking', 'teenagers', 'torrent', 'version', 'fc', 'maintain', 'luther', '"black', 'srcc']
['this', 'cool', 'crisis', 'documents', 'must', 'volt', 'origin', 'chris', 'humans', 'akhilesh']
['are', 'gaining', 'brisbane', 'discovered', 'immediate', 'historically', 'avoid', 'end', 'crying', 'microscope']
['bad', 'fourth', 'back', 'walls', 'bc', 'dies', 'playing', 'worn', 'intel', 'custom']
['price', 'atheist', 'manipal', 'drugs', 'writing', 'k3', 'atrocities', 'beach', 'platform', 'us']
['number', 'meanings', 'esteem', 'virus', 'vegetables', 'protest', 'stones', 'said', 'bullets', 'replaced']
['xbox', 'wet', 'hour', 'someones', 'sem', 'lovers', 'celebrated', 'fort', 'requested', 'republicans']
['math', 'smith', 'caught', "(i'm", 'adam', 'ontario', 'skip', 'raghuram', 'gases', 'converting']
['book', 'satan', 'skills', 'survive', 'blank', 'annoying', 'polo', 'thriller', 'incorrect', 'itinerary']
['trump', 'story', 'vertical', 'hbo', 'cognizant'

KeyboardInterrupt: 

In [45]:
# Saver limited to the five most recent checkpoints.
tfsSaver = tf.train.Saver(max_to_keep=5)

# Probe words whose nearest neighbours are printed during training.
simvalid_x = np.array([word2idx(probe) for probe in (
    'two', 'this', 'are', 'bad',
    'price', 'number', 'xbox', 'math',
    'book', 'trump', 'college',
)])
simvalid_dict = {tf_in_word: simvalid_x}
valid_dict = {
    tf_in_word: valid_x,
    tf_in_context: valid_y.reshape(-1, 1),
}

# Sub-sampling hyper-parameters: threshold and exponent of the keep probability.
hp_w2v_num0, hp_w2v_alpha = 500, -0.75

# Field 1 of every dictionary entry.
p_w2v_wordnum = np.array([entry[1] for entry in w2v_dict])
# Keep probability per word id: 1 while the value is at most hp_w2v_num0,
# then (value / hp_w2v_num0) ** hp_w2v_alpha.
p_w2v_word = np.maximum(1, p_w2v_wordnum / hp_w2v_num0) ** hp_w2v_alpha
#p_w2v_context = [(-1, 0.99), (1, 0.99)]
# Context window of +-3 positions with per-offset acceptance probabilities.
p_w2v_context = [
    (-3, 0.2), (-2, 0.4), (-1, 0.6),
    (1, 0.7), (2, 0.5), (3, 0.3),
]

In [52]:
# Second stage: continue from checkpoint model-03 of stage 1, training with a
# regularization weight of 0.01 and saving into the stage-2 directory.
num_epochs = 100

with tf.Session() as tfs:
    tfsSaver.restore(tfs, 'D:/Jupyter/Models/Models-23Quora03-W2VS1/model-03.ckpt')

    # Loss and nearest neighbours of the probe words for the restored model.
    sim = tfs.run(tf_valid_similarity, feed_dict=simvalid_dict)
    nce_loss, reg_loss = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
    print('Starting loss={:.3f} ({:.3f} reg-loss)'.format(nce_loss, reg_loss))
    for q in range(len(sim)):
        # Ten vocabulary entries with the highest similarity, best first.
        print([idx2word(z) for z in sim[q].argsort()[::-1][:10]])

    for i in range(num_epochs):
        t0 = time.perf_counter()
        for (train_x, train_y) in yield_batch(w2v_src, batch_size=512, num_batches=10000,
                                              p_word=p_w2v_word, p_context=p_w2v_context,
                                              verbose=False):
            train_dict = {
                tf_in_word: train_x,
                tf_in_context: train_y.reshape(-1, 1),
                tf_in_regularization: 0.01,
            }
            tfs.run(tf_train, feed_dict=train_dict)

        # Evaluate on the fixed validation set and keep the normalised embeddings.
        sim = tfs.run(tf_valid_similarity, feed_dict=simvalid_dict)
        nce_loss, reg_loss = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
        dic_embed = tfs.run(tf_valid_dic_norm)
        t1 = time.perf_counter()
        print('Step complete in {0:.2f} sec, loss={1:.3f} ({2:.3f} reg-loss)'.format(t1-t0, nce_loss, reg_loss))
        for q in range(len(sim)):
            print([idx2word(z) for z in sim[q].argsort()[::-1][:10]])
        p = tfsSaver.save(tfs, 'D:/Jupyter/Models/Models-23Quora03-W2VS2/model-{0:02d}.ckpt'.format(i))
        print('Model saved at checkpoint: {0}'.format(p))

print('Complete')

INFO:tensorflow:Restoring parameters from D:/Jupyter/Models/Models-23Quora03-W2VS1/model-03.ckpt
Starting loss=4.113 (0.717 reg-loss)
['two', 'three', 'four', '3-4', 'eight', 'several', '2-3', '45', '2.5', 'smaller']
['this', '2012', 'darkness', 'digestion', 'joy', 'denmark', 'me"', 'boom', 'louis', 'zomato']
['are', 'were', "aren't", "weren't", 'do', 'is/was', 'characteristics', 'instances', 'is/are', 'ngos']
['bad', 'weird', 'frequent', 'normal', 'reasonable', 'terrible', 'sociopath', 'special', 'superior', 'immoral']
['price', 'understanding', 'mileage', 'salaries', 'lifespan', 'weights', 'directions', 'fate', 'implementation', 'depth']
['number', 'numbers', 'scams', 'stolen', 'hotspot', '3g', 'otg', 'salaries', 'icon', 'tower']
['xbox', 'nikon', 'ps4', 'oneplus', 'tick', 'cpa', 'hyundai', 'toyota', 'tor', 'alexa']
['math', 'maths', 'quant', 'mathematics', 'coding', 'geometry', 'flute', 'time"', 'javascript', 'niche']
['book', 'books', 'novel', 'podcasts', 'topic', 'poem', 'advertis

KeyboardInterrupt: 

In [58]:
# Saver limited to the five most recent checkpoints.
tfsSaver = tf.train.Saver(max_to_keep=5)

# Probe words whose nearest neighbours are printed during training.
simvalid_x = np.array([word2idx(probe) for probe in (
    'two', 'this', 'are', 'bad',
    'price', 'phone', 'xbox', 'math',
    'book', 'trump', 'college', 'how',
    'km', 'step', 'guide',
)])
simvalid_dict = {tf_in_word: simvalid_x}
valid_dict = {
    tf_in_word: valid_x,
    tf_in_context: valid_y.reshape(-1, 1),
}

# Sub-sampling hyper-parameters: threshold and exponent of the keep probability.
hp_w2v_num0, hp_w2v_alpha = 500, -0.85

# Field 1 of every dictionary entry.
p_w2v_wordnum = np.array([entry[1] for entry in w2v_dict])
# Keep probability per word id: 1 while the value is at most hp_w2v_num0,
# then (value / hp_w2v_num0) ** hp_w2v_alpha.
p_w2v_word = np.maximum(1, p_w2v_wordnum / hp_w2v_num0) ** hp_w2v_alpha
#p_w2v_context = [(-1, 0.99), (1, 0.99)]
# Context window of +-4 positions with per-offset acceptance probabilities.
p_w2v_context = [
    (-4, 0.1), (-3, 0.3), (-2, 0.5), (-1, 0.7),
    (1, 0.8), (2, 0.6), (3, 0.4), (4, 0.2),
]

In [59]:
# Third stage: resume from checkpoint number i in the stage-3 directory and keep
# training with a regularization weight of 0.001. The counter is advanced
# before saving, so the checkpoint written after a round is numbered i + 1.
num_epochs = 100
i = 0

with tf.Session() as tfs:
    tfsSaver.restore(tfs, 'D:/Jupyter/Models/Models-23Quora03-W2VS3/model-{0:02d}.ckpt'.format(i))

    # Loss and nearest neighbours of the probe words for the restored model.
    sim = tfs.run(tf_valid_similarity, feed_dict=simvalid_dict)
    nce_loss, reg_loss = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
    print('Starting loss={:.3f} ({:.3f} reg-loss)'.format(nce_loss, reg_loss))
    for q in range(len(sim)):
        # Ten vocabulary entries with the highest similarity, best first.
        print([idx2word(z) for z in sim[q].argsort()[::-1][:10]])

    while i < num_epochs:
        t0 = time.perf_counter()
        for (train_x, train_y) in yield_batch(w2v_src, batch_size=512, num_batches=10000,
                                              p_word=p_w2v_word, p_context=p_w2v_context,
                                              verbose=False):
            train_dict = {
                tf_in_word: train_x,
                tf_in_context: train_y.reshape(-1, 1),
                tf_in_regularization: 0.001,
            }
            tfs.run(tf_train, feed_dict=train_dict)

        # Evaluate on the fixed validation set and keep the normalised embeddings.
        sim = tfs.run(tf_valid_similarity, feed_dict=simvalid_dict)
        nce_loss, reg_loss = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
        dic_embed = tfs.run(tf_valid_dic_norm)
        t1 = time.perf_counter()
        print('Step complete in {0:.2f} sec, loss={1:.3f} ({2:.3f} reg-loss)'.format(t1-t0, nce_loss, reg_loss))
        for q in range(len(sim)):
            print([idx2word(z) for z in sim[q].argsort()[::-1][:10]])

        i += 1
        p = tfsSaver.save(tfs, 'D:/Jupyter/Models/Models-23Quora03-W2VS3/model-{0:02d}.ckpt'.format(i))
        print('Model saved at checkpoint: {0}'.format(p))

print('Complete')

INFO:tensorflow:Restoring parameters from D:/Jupyter/Models/Models-23Quora03-W2VS3/model-00.ckpt
Starting loss=4.247 (0.802 reg-loss)
['two', '2', 'three', 'only', '3', '5', '4', 'four', '6', 'both']
['this', 'it', '<UNK>', 'me', 'reflection', 'name', 'one', 'every', 'it"', 'that']
['are', 'were', 'some', 'is', 'the', '<UNK>', 'be', 'that', 'have', 'all']
['bad', 'good', 'weird', 'normal', 'worst', 'terrible', 'like', 'work', 'harmful', '(on']
['price', 'prices', 'shares', 'buying', 'purchasing', 'market', 'buy', 'target', 'tablet', 'purchase']
['phone', 'phones', 'smartphone', 'device', 'cellphone', 'mobile', 'laptop', "phone's", 'tablet', 'iphone']
['xbox', 'ps4', 'playstation', 'console', 'oneplus', 'ps3', 'pirated', 'canon', 'otherwise', 'htc']
['math', 'physics', 'maths', 'mathematics', 'biology', 'cs', 'economics', 'linguistics', 'calculus', 'coding']
['book', 'books', 'textbook', 'novel', 'read', 'movie', 'course', 'poem', 'introductory', 'series']
['trump', 'donald', 'hillary',

KeyboardInterrupt: 

In [126]:
# Keep the L2-normalised embedding matrix from the most recent evaluation step
# of training; it joins (full_dict, full_sentences) in the result tuple.
# NOTE(review): dic_embed only exists after a training cell has completed at
# least one round in this kernel session.
full_w2v = dic_embed

In [127]:
# Bundle the original (un-truncated) dictionary and sentences with the embeddings.
# NOTE(review): full_w2v has one row per entry of the truncated w2v_dict, so ids
# in full_sentences must be clamped to the '<UNK>' index before indexing into it.
full_data = (full_dict, full_sentences, full_w2v)

In [131]:
# Persist the (dictionary, sentences, embeddings) tuple to the result pickle.
with open(w2v_res_file, mode='wb') as f:
    pickle.dump(obj=full_data, file=f)