In [5]:
import numpy as np
import tensorflow as tf
import keras as k
from keras import Sequential
import random
from keras.layers import Embedding
from tensorflow.contrib.tensorboard.plugins import projector
%matplotlib inline
import matplotlib.pyplot as plt
import os
import math
import h5py
import pickle
import pdb
from progressbar import log_progress

log_dir = '/home/steich/NF_Prize_Data/log'

if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
x_train = []
y_train = []
with open('../data/user_embedding_5_train.tsv', 'r') as iFile:
    for line in iFile:
        line = line.replace('\n', '')
        line = line.split('\t')
        x_train.append(eval(line[1]))
        y_train.append(int(line[0]))
        
epochs = 100
batch_size = 32
embedding_size = 64
num_sampled = 64

with open('../data/name_one_hot_dict.pickle', 'rb') as handle:
    name_one_hot = pickle.load(handle)
    
name_count = len(name_one_hot)

def generate_batch(train_features, train_labels, batch_size, steps):
    
    batch_samples = np.zeros((batch_size))
    batch_labels = np.zeros((batch_size, 1))
    index_list = list(range(len(train_labels)))
    random.shuffle(index_list)
    batch_count = 0
    step = 0
    
    while step < steps:
        
        curr_index = index_list[-1]
        for i in train_features[curr_index]:
            
            if batch_count == 32:
                batch_count = 0
                step += 1
                yield (batch_samples, batch_labels)
            
            batch_labels[batch_count][0] = name_one_hot[train_labels[curr_index]]
            batch_samples[batch_count] = name_one_hot[i]
            batch_count += 1
            
        index_list.pop()
        
graph = tf.Graph()
with graph.as_default():
    
    #input data here
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        #valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        
    with tf.device('/gpu:0'):
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                tf.random_uniform([name_count, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal(
                    [name_count, embedding_size],
                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([name_count]))
            
    
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                inputs=embed,
                labels=train_labels,
                num_sampled=num_sampled,
                num_classes=name_count))
        
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    #similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()
    
with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    init.run()
    print('Initialized')
    
    steps = 1000
    average_loss = 0
    step = 0
    run_metadata = tf.RunMetadata()
    
    for epoch in range(epochs):
        if epoch > 0:
            print('Epoch: ', epoch, ' - Average Loss: ', average_loss / steps)
            average_loss = 0
        else:
            print('Epoch: ', epoch)
            
        for batch_inputs, batch_labels in log_progress(generate_batch(x_train, y_train, batch_size, steps),
                                                      every=1,
                                                      size=steps,
                                                      name="Batch"):
            
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            run_metadata = tf.RunMetadata()

            _, summary, loss_val = session.run(
                [optimizer, merged, loss],
                feed_dict=feed_dict,
                run_metadata=run_metadata)
            average_loss += loss_val

            writer.add_summary(summary, step)
            step += 1
        
    writer.add_run_metadata(run_metadata, 'step%d' % step)
    final_embeddings = normalized_embeddings.eval()
    saver.save(session, os.path.join(log_dir, 'metadata.ckpt'))
    
writer.close()

from sklearn.manifold import TSNE
    
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
n = len(final_embeddings)
sample = random.sample(range(n), 500)
low_dim_embs = tsne.fit_transform([final_embeddings[i] for i in sample])
labels = sample
plt.figure(figsize=(18,18))
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x,y)
    plt.annotate(label,
                    xy=(x, y),
                    xytext=(5,2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')
plt.savefig("/home/steich/NF_Prize_Data/data/tsne_user_embeddings_bow_all.png")
plt.show()
    
with open('/home/steich/NF_Prize_Data/data/user_emb_bow_all_unweighted.tsv', 'w') as f:
    for i, embs in enumerate(final_embeddings):
        emb_str = [str(emb) for emb in embs]
        f.write(str(i) + '\t' + '\t'.join(emb_str) + '\n')

In [None]:
import numpy as np
import tensorflow as tf
import keras as k
from keras import Sequential
import random
from keras.layers import Embedding
from tensorflow.contrib.tensorboard.plugins import projector
%matplotlib inline
import matplotlib.pyplot as plt
import os
import math
import h5py
import pickle
import pdb
from progressbar import log_progress

log_dir = '/home/steich/NF_Prize_Data/log'

if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
x_train = []
y_train = []
with open('../data/user_embedding_5_thresh_train.tsv', 'r') as iFile:
    for line in iFile:
        line = line.replace('\n', '')
        line = line.split('\t')
        x_train.append(eval(line[1]))
        y_train.append(int(line[0]))
        
epochs = 100
batch_size = 32
embedding_size = 64
num_sampled = 64

with open('../data/name_one_hot_dict.pickle', 'rb') as handle:
    name_one_hot = pickle.load(handle)
    
name_count = len(name_one_hot)

def generate_batch(train_features, train_labels, batch_size, steps):
    
    batch_samples = np.zeros((batch_size))
    batch_labels = np.zeros((batch_size, 1))
    index_list = list(range(len(train_labels)))
    random.shuffle(index_list)
    batch_count = 0
    step = 0
    
    while step < steps:
        
        curr_index = index_list[-1]
        for i in train_features[curr_index]:
            
            if batch_count == 32:
                batch_count = 0
                step += 1
                yield (batch_samples, batch_labels)
            
            batch_labels[batch_count][0] = name_one_hot[train_labels[curr_index]]
            batch_samples[batch_count] = name_one_hot[i]
            batch_count += 1
            
        index_list.pop()
        
graph = tf.Graph()
with graph.as_default():
    
    #input data here
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        #valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        
    with tf.device('/gpu:0'):
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                tf.random_uniform([name_count, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal(
                    [name_count, embedding_size],
                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([name_count]))
            
    
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                inputs=embed,
                labels=train_labels,
                num_sampled=num_sampled,
                num_classes=name_count))
        
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    #similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()
    
with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    init.run()
    print('Initialized')
    
    steps = 1000
    average_loss = 0
    step = 0
    run_metadata = tf.RunMetadata()
    
    for epoch in range(epochs):
        if epoch > 0:
            print('Epoch: ', epoch, ' - Average Loss: ', average_loss / steps)
            average_loss = 0
        else:
            print('Epoch: ', epoch)
            
        for batch_inputs, batch_labels in log_progress(generate_batch(x_train, y_train, batch_size, steps),
                                                      every=1,
                                                      size=steps,
                                                      name="Batch"):
            
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            run_metadata = tf.RunMetadata()

            _, summary, loss_val = session.run(
                [optimizer, merged, loss],
                feed_dict=feed_dict,
                run_metadata=run_metadata)
            average_loss += loss_val

            writer.add_summary(summary, step)
            step += 1
        
    writer.add_run_metadata(run_metadata, 'step%d' % step)
    final_embeddings = normalized_embeddings.eval()
    saver.save(session, os.path.join(log_dir, 'metadata.ckpt'))
    
writer.close()

from sklearn.manifold import TSNE
    
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
n = len(final_embeddings)
sample = random.sample(range(n), 500)
low_dim_embs = tsne.fit_transform([final_embeddings[i] for i in sample])
labels = sample
plt.figure(figsize=(18,18))
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x,y)
    plt.annotate(label,
                    xy=(x, y),
                    xytext=(5,2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')
plt.savefig("/home/steich/NF_Prize_Data/data/tsne_user_embeddings_bow_thresh.png")
plt.show()
    
with open('/home/steich/NF_Prize_Data/data/user_emb_bow_thresh_unweighted.tsv', 'w') as f:
    for i, embs in enumerate(final_embeddings):
        emb_str = [str(emb) for emb in embs]
        f.write(str(i) + '\t' + '\t'.join(emb_str) + '\n')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Initialized
Epoch:  0


Epoch:  1  - Average Loss:  282.0476734619141


Epoch:  2  - Average Loss:  250.52641741943359


Epoch:  3  - Average Loss:  228.5533304901123


Epoch:  4  - Average Loss:  218.7359927520752


Epoch:  5  - Average Loss:  208.52891203308104


Epoch:  6  - Average Loss:  202.4528664932251


Epoch:  7  - Average Loss:  193.5076616973877


Epoch:  8  - Average Loss:  190.57244184112548


Epoch:  9  - Average Loss:  184.38606866455078


Epoch:  10  - Average Loss:  181.4573975982666


Epoch:  11  - Average Loss:  177.36425119018554


Epoch:  12  - Average Loss:  173.90751698303222


Epoch:  13  - Average Loss:  169.5646791000366


Epoch:  14  - Average Loss:  167.85513865661622


Epoch:  15  - Average Loss:  163.06347871398927


Epoch:  16  - Average Loss:  161.7150713043213


Epoch:  17  - Average Loss:  157.47838619995116


Epoch:  18  - Average Loss:  155.68766609954835


Epoch:  19  - Average Loss:  153.2573872566223


Epoch:  20  - Average Loss:  150.769688495636


Epoch:  21  - Average Loss:  148.77537171936035


Epoch:  22  - Average Loss:  144.74094772720338


Epoch:  23  - Average Loss:  145.444472366333


Epoch:  24  - Average Loss:  142.2839389038086


Epoch:  25  - Average Loss:  139.17042591094972


Epoch:  26  - Average Loss:  138.44604962158203


Epoch:  27  - Average Loss:  137.61268333435058


Epoch:  28  - Average Loss:  136.13179683303832


Epoch:  29  - Average Loss:  132.3703830833435


Epoch:  30  - Average Loss:  129.54193143463135


Epoch:  31  - Average Loss:  128.23740784454347


Epoch:  32  - Average Loss:  125.9760052986145


Epoch:  33  - Average Loss:  124.68910023880005


Epoch:  34  - Average Loss:  124.27971188354492


Epoch:  35  - Average Loss:  120.76313982391358


Epoch:  36  - Average Loss:  118.94760676574707


Epoch:  37  - Average Loss:  115.4667952156067


Epoch:  38  - Average Loss:  114.61527981948852


Epoch:  39  - Average Loss:  111.69080026626587


Epoch:  40  - Average Loss:  110.52368306732178


Epoch:  41  - Average Loss:  107.93609763145447


Epoch:  42  - Average Loss:  105.88048427772522


Epoch:  43  - Average Loss:  103.1642483100891


Epoch:  44  - Average Loss:  101.0927216873169


Epoch:  45  - Average Loss:  98.10986826896668


Epoch:  46  - Average Loss:  96.86414110183716


Epoch:  47  - Average Loss:  94.24024624443054


Epoch:  48  - Average Loss:  92.95495582008361


Epoch:  49  - Average Loss:  90.5484365196228


Epoch:  50  - Average Loss:  89.99229949951172


Epoch:  51  - Average Loss:  87.58844115829467


Epoch:  52  - Average Loss:  85.2100892162323


Epoch:  53  - Average Loss:  82.33615483856201


Epoch:  54  - Average Loss:  81.45931358718872


Epoch:  55  - Average Loss:  78.72986657524109


Epoch:  56  - Average Loss:  77.30330863189697


Epoch:  57  - Average Loss:  75.63837035369873


Epoch:  58  - Average Loss:  72.95794278240204


Epoch:  59  - Average Loss:  70.9831981639862


Epoch:  60  - Average Loss:  70.67774885368347


Epoch:  61  - Average Loss:  68.82813820362091


Epoch:  62  - Average Loss:  68.95394424533843


Epoch:  63  - Average Loss:  65.51769890499115


Epoch:  64  - Average Loss:  64.8048328666687


Epoch:  65  - Average Loss:  62.69861666488647


Epoch:  66  - Average Loss:  61.6746944360733


Epoch:  67  - Average Loss:  60.070141920089725


Epoch:  68  - Average Loss:  58.42500888729096


Epoch:  69  - Average Loss:  56.477035948753354


Epoch:  70  - Average Loss:  56.49311699581146


Epoch:  71  - Average Loss:  55.00271317958832


Epoch:  72  - Average Loss:  53.00651082277298


Epoch:  73  - Average Loss:  52.959346507072446


Epoch:  74  - Average Loss:  52.11487867689133


Epoch:  75  - Average Loss:  50.09105627250671


Epoch:  76  - Average Loss:  49.16446797370911


Epoch:  77  - Average Loss:  48.882749133586884


Epoch:  78  - Average Loss:  47.7178695936203


Epoch:  79  - Average Loss:  46.1270489795208


Epoch:  80  - Average Loss:  45.35878061008454


Epoch:  81  - Average Loss:  44.51613084602356


Epoch:  82  - Average Loss:  43.934827809333804


Epoch:  83  - Average Loss:  43.482727190494536


Epoch:  84  - Average Loss:  42.06276924133301


Epoch:  85  - Average Loss:  40.890683179855344


Epoch:  86  - Average Loss:  40.39842712259293


Epoch:  87  - Average Loss:  40.20612968921662
