In [4]:
import pandas as pd
import random
import numpy as np
import networkx as nx
import nltk
import tensorflow as tf
import collections
import math
import os.path
from six.moves import xrange
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import Birch

### Load Document Data

In [5]:
def load():
    """Read and tokenize the two corpus files.

    Returns:
        A pair ``(words_a, words_p)``: the tokens of ``alw.txt`` followed by
        the tokens of ``ppp.txt``. The first 143 and 130 tokens respectively
        are dropped (presumably front matter -- TODO confirm).
    """
    def _tokens(resource, skip):
        # Load the raw text through nltk and drop the first `skip` tokens.
        text = nltk.data.load(resource, encoding='utf8')
        return nltk.tokenize.wordpunct_tokenize(text)[skip:]

    words_p = _tokens('../../Downloads/ppp.txt', 130)
    words_a = _tokens('../../Downloads/alw.txt', 143)
    return words_a, words_p

### Generate Batches Within W2V

In [6]:
# Read position inside the id sequence; shared by successive gen_batch calls.
data_index = 0


def gen_batch(data, batch_size, skip_window, num_skips):
    """Produce one skip-gram training batch and advance ``data_index``.

    A window of ``2 * skip_window + 1`` ids slides over ``data`` (wrapping at
    the end). For every window position the centre id is emitted
    ``num_skips`` times, each time paired with a different randomly chosen
    id from the same window.

    Args:
        data: indexable sequence of integer ids.
        batch_size: number of (input, label) pairs; must be a multiple of
            ``num_skips``.
        skip_window: number of ids on each side of the centre.
        num_skips: context ids drawn per centre; at most ``2 * skip_window``.

    Returns:
        ``(batch, labels)`` -- int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)``.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    window = 2 * skip_window + 1  # [ skip_window | centre | skip_window ]
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Prime the sliding window with the next `window` ids.
    buffer = collections.deque(maxlen=window)
    for _ in range(window):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    row = 0
    for _ in range(batch_size // num_skips):
        taken = {skip_window}  # never pair the centre with itself
        pick = skip_window
        for _ in range(num_skips):
            while pick in taken:
                pick = random.randint(0, window - 1)
            taken.add(pick)
            batch[row] = buffer[skip_window]
            labels[row, 0] = buffer[pick]
            row += 1
        # Slide the window one id to the right (deque drops the oldest).
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels


### Build Vocab

In [7]:
def build_vocab(words, vocabulary_size):
    """Map words to integer ids, reserving id 0 for out-of-vocabulary words.

    Args:
        words: re-iterable sequence of tokens.
        vocabulary_size: total number of ids, including the 'UNK' slot; the
            ``vocabulary_size - 1`` most common tokens get their own id.

    Returns:
        ``(data, dictionary, reverse_dictionary)`` where ``data`` is
        ``words`` translated to ids, ``dictionary`` maps token -> id and
        ``reverse_dictionary`` maps id -> token.
    """
    count = [['UNK', -1]] + collections.Counter(words).most_common(vocabulary_size - 1)

    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    # Unknown tokens fall back to id 0, i.e. dictionary['UNK'].
    data = [dictionary.get(token, 0) for token in words]

    # Tally of unknown tokens; stored on the local count table only, which is
    # not part of the return value.
    count[0][1] = sum(1 for token in words if token not in dictionary)

    reverse_dictionary = {index: token for token, index in dictionary.items()}
    return data, dictionary, reverse_dictionary

# We're Gonna Start By Just Saving Dicts

### Word2Vec TensorFlow

In [8]:
def W2V2(batch_size, embedding_size, skip_window, num_skips, valid_size,
       valid_window, valid_examples, num_sampled, vocabulary_size,
       num_steps, data, revdic):
    """Train skip-gram embeddings with an NCE loss and return them row-normalized.

    Builds a fresh TensorFlow graph, runs ``num_steps`` gradient-descent
    steps on batches drawn by ``gen_batch`` and evaluates the normalized
    embedding matrix at the end.

    Args:
        batch_size: pairs per training step; also fixes the placeholder shapes.
        embedding_size: number of columns of the embedding matrix.
        skip_window: forwarded to ``gen_batch``.
        num_skips: forwarded to ``gen_batch``.
        valid_size: how many entries of ``valid_examples`` are reported in
            the nearest-neighbour printout.
        valid_window: not used inside this function.
        valid_examples: integer ids whose nearest neighbours are printed.
        num_sampled: forwarded to ``tf.nn.nce_loss``.
        vocabulary_size: number of rows of the embedding and NCE matrices.
        num_steps: number of training iterations.
        data: sequence of integer ids handed to ``gen_batch``.
        revdic: mapping id -> word, used only for the printout.

    Returns:
        The value of ``normalized_embeddings`` after training: the embedding
        matrix with every row divided by its L2 norm.

    Side effects:
        Prints progress, writes summaries under ``./summaries/train`` every
        10 steps, and advances the module-level ``data_index`` (through
        ``gen_batch``).
    """
    graph = tf.Graph()

    with graph.as_default():
        def weight_summary(var, name):
          """Attach a lot of summaries to a Tensor."""
          with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.scalar_summary('mean/' + name, mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.scalar_summary('stddev/' + name, stddev)
            tf.scalar_summary('max/' + name, tf.reduce_max(var))
            tf.scalar_summary('min/' + name, tf.reduce_min(var))
            tf.histogram_summary(name, var)
        def scalar_summary(var, name):
            """Attach a single scalar summary (only referenced from commented-out lines below)."""
            with tf.name_scope('summaries'):
                tf.scalar_summary('scalar/'+name, var)

        # Inputs: one centre id per row, one context id per row.
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])


        # The embedding table and its lookup are pinned to the CPU.
        with tf.device('/cpu:0'):

            embeddings = tf.Variable(
                            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), name="emb")
            weight_summary(embeddings, 'embeddings')
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], 
                               stddev=1.0 / math.sqrt(embedding_size)), name="nw")
        weight_summary(nce_weights, 'nce_weights')

        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name="nb")
        weight_summary(nce_biases, 'nce_biases')

        # NOTE(review): the positional argument order looks like the pre-1.0
        # TensorFlow nce_loss signature -- confirm against the installed version.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                          num_sampled, vocabulary_size))
        #scalar_summary(loss, 'loss')
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Per-row L2 norm, kept as a column so the division broadcasts row-wise.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 
                                     1, keep_dims=True))
        #scalar_summary(norm, 'norm')

        normalized_embeddings = embeddings / norm
        weight_summary(normalized_embeddings, 'normalized_embeddings')

        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        weight_summary(valid_embeddings, 'valid_embeddings')

        # Dot products of unit-length rows: one row of scores per validation id.
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
        weight_summary(similarity, 'similarity')

        merged = tf.merge_all_summaries()

        init = tf.initialize_all_variables()


    with tf.Session(graph=graph) as session:


        # NOTE(review): saver is created but never used in this function.
        saver = tf.train.Saver()
        # NOTE(review): train_writer is never closed explicitly in this function.
        train_writer = tf.train.SummaryWriter('./summaries' + '/train',
                                      session.graph)

        init.run()
        print("Initialized")      

        #saver = tf.train.Saver({'In'})

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = gen_batch(
                data, batch_size, skip_window, num_skips)
            feed_dict = {train_inputs : batch_inputs, 
                         train_labels : batch_labels}

            # Every 10th step also evaluates and records the merged summaries.
            if step % 10 == 0:

                _, loss_val, summary = session.run([optimizer, loss, merged], feed_dict=feed_dict)
                average_loss += loss_val

                train_writer.add_summary(summary, step)
            else:
                _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                average_loss += loss_val

            if step % 2000 == 0:
                # At step 0 the accumulator holds a single loss value, so it
                # is printed undivided.
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            # This branch is only reached when num_steps exceeds 10000.
            if step % 10000 == 0 and step > 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = revdic[valid_examples[i]]
                    top_k = 8 # number of nearest neighbors
                    # Highest scores first; index 0 of the ranking is skipped
                    # (presumably the query word itself).
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log_str = "Nearest to %s:" % valid_word
                    for k in xrange(top_k):
                        close_word = revdic[nearest[k]]
                        log_str = "%s %s," % (log_str, close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()
        return final_embeddings

### Word2Vec Parameters

In [9]:
# --- Batch / model shape -------------------------------------------------
# Number of (input, label) pairs per training step.
batch_size = 256
# Dimension of the embedding vector.
embedding_size = 200
# How many words to consider left and right of the centre word.
skip_window = 3
# How many times to reuse an input to generate a label.
num_skips = 4

# --- Validation ----------------------------------------------------------
# A random validation set is used to sample nearest neighbours. It is limited
# to words with a low numeric id, which by construction are also the most
# frequent ones.
# Number of validation words to evaluate similarity on.
valid_size = 10
# Only pick validation ids from the head of the distribution.
valid_window = 200

# --- Training ------------------------------------------------------------
# Number of negative examples to sample.
num_sampled = 100
# Default number of training steps.
num_steps = 20000

### Make a Graph Where Nodes are Words with Number Attributes

In [14]:
def prep_graph(words):
    """Build a graph with one node per distinct word, tagged with an integer.

    Args:
        words: iterable of hashable tokens.

    Returns:
        An ``nx.Graph`` whose node names are the distinct tokens and whose
        ``number`` node attribute is the integer label assigned by
        ``nx.convert_node_labels_to_integers``.
    """
    distinct = nx.Graph()
    for token in words:
        if token not in distinct:
            distinct.add_node(token)

    # Relabel to integers, keeping the original token in the 'word' attribute.
    numbered = nx.convert_node_labels_to_integers(distinct, label_attribute='word')

    # Invert the relabelling: token as node name, integer as attribute.
    word_graph = nx.Graph()
    for number, attrs in numbered.nodes(True):
        word_graph.add_node(attrs['word'], number=number)
    return word_graph

### We call this each time we want to add new layers

In [10]:
def add_level(words, embeddings, KB, n_cluster):
    """Add one layer of cluster nodes to ``KB`` and interleave them into ``words``.

    Steps:
      1. Between every consecutive pair of tokens that already share an edge
         in ``KB``, splice in the name stored in that edge's ``'node'``
         attribute.
      2. Look up the embedding row of every token known to ``KB`` and cluster
         those rows with ``MiniBatchKMeans`` into ``n_cluster`` clusters.
      3. Add ``n_cluster`` new nodes to ``KB`` (named and numbered from the
         current node count upward).
      4. Build the output sequence with a cluster node between tokens and
         record that cluster node on the corresponding ``KB`` edge.

    Args:
        words: sequence (list or 1-D array) of tokens.
        embeddings: indexable by node number, yielding one embedding row.
        KB: NetworkX 1.x-style graph whose nodes carry a ``number`` attribute;
            mutated in place.
        n_cluster: number of clusters / new nodes for this level.

    Returns:
        ``(words_n, KB)`` -- the interleaved token array and the same graph.
    """
    # Splice existing edge nodes into the sequence. The result is built as a
    # new list: inserting into `words` while iterating over it shifts every
    # later index, so pairs after the first insertion were compared against
    # the wrong neighbour.
    expanded = []
    for current, following in zip(words[:-1], words[1:]):
        expanded.append(current)
        if (KB.has_node(current) and KB.has_node(following)
                and KB.has_edge(current, following)):
            expanded.append(str(KB.edge[current][following]['node']))
    expanded.extend(words[-1:])
    words = expanded

    data = [KB.node[word]['number'] for word in words if word in KB.node]
    embed_data = np.array([embeddings[wordnum] for wordnum in data])
    # NOTE(review): earlier revisions also computed the differences between
    # consecutive embedding rows but never used them; clustering runs on the
    # rows themselves. Confirm which input was intended.

    # batch_size must be an integer; the 5% heuristic yields a float.
    mbatch = MiniBatchKMeans(n_clusters=n_cluster,
                             batch_size=int(max(len(words) * .05, n_cluster + 1)),
                             max_iter=100000)
    next_lvl_cent = mbatch.fit(embed_data)

    vocab_size = KB.number_of_nodes()
    for num in range(vocab_size, vocab_size + n_cluster):
        KB.add_node(str(num), number=num)

    # NOTE(review): as before, the output skips words[1] and the edge
    # (words[i], words[i+1]) is tagged with the label of row i-1 -- confirm
    # that this offset is intended.
    words_n = list(words[:1])
    for i in range(1, len(words) - 1):
        node_name = str(next_lvl_cent.labels_[i - 1] + vocab_size)
        words_n.extend([node_name, words[i + 1]])
        KB.add_edge(words[i], words[i + 1], node=node_name)

    return np.array(words_n), KB

In [261]:
def add_level2(words, embeddings, KB, fit=None, n_cluster=None):
    """Birch-based variant of ``add_level`` that can reuse a fitted model.

    Steps:
      1. Between every consecutive pair of tokens that already share an edge
         in ``KB``, splice in the name stored in that edge's ``'node'``
         attribute.
      2. Cluster the embedding rows of all tokens known to ``KB``: with a
         fresh ``Birch`` model when ``fit`` is omitted, otherwise by calling
         ``fit.partial_fit`` on the supplied model.
      3. Add one new ``KB`` node per cluster label (named and numbered from
         the current node count upward).
      4. Build the output sequence with a cluster node between tokens and
         record that cluster node on the corresponding ``KB`` edge.

    Args:
        words: sequence (list or 1-D array) of tokens.
        embeddings: indexable by node number, yielding one embedding row.
        KB: NetworkX 1.x-style graph whose nodes carry a ``number`` attribute;
            mutated in place.
        fit: previously fitted clustering model to update, or ``None`` to
            start a new one. For compatibility with callers that pass the
            cluster count as the fourth positional argument, a plain ``int``
            here is interpreted as ``n_cluster``.
        n_cluster: optional number of clusters to request for this level.
            With a fresh model it becomes Birch's ``n_clusters``; with an
            existing model whose ``n_clusters`` is set it is added to it.

    Returns:
        ``(words_n, KB, model)`` -- the interleaved token array, the same
        graph, and the fitted clustering model.
    """
    # Legacy call shape: add_level2(words, embeddings, KB, 500).
    if isinstance(fit, int):
        n_cluster, fit = fit, None

    # Splice existing edge nodes into the sequence. The result is built as a
    # new list: inserting into `words` while iterating over it shifts every
    # later index, so pairs after the first insertion were compared against
    # the wrong neighbour.
    expanded = []
    for current, following in zip(words[:-1], words[1:]):
        expanded.append(current)
        if (KB.has_node(current) and KB.has_node(following)
                and KB.has_edge(current, following)):
            expanded.append(str(KB.edge[current][following]['node']))
    expanded.extend(words[-1:])
    words = expanded

    data = [KB.node[word]['number'] for word in words if word in KB.node]
    embed_data = np.array([embeddings[wordnum] for wordnum in data])
    # NOTE(review): earlier revisions also computed the differences between
    # consecutive embedding rows but never used them; clustering runs on the
    # rows themselves. Confirm which input was intended.

    if fit is None:
        # Previously the fitted model was bound to an unused name, leaving
        # `next_lvl_cent` undefined on this path.
        next_lvl_cent = Birch(n_clusters=n_cluster,
                              branching_factor=100).fit(embed_data)
    else:
        # Only grow the requested cluster count when both numbers are known;
        # the old code referenced an undefined `n_cluster` here.
        if n_cluster is not None and fit.n_clusters is not None:
            fit.set_params(n_clusters=fit.n_clusters + n_cluster)
        next_lvl_cent = fit.partial_fit(embed_data)

    labels = next_lvl_cent.labels_
    # Create a node for every label that can be referenced below.
    n_new = int(max(labels)) + 1 if len(labels) else 0
    if n_cluster is not None:
        n_new = max(n_new, n_cluster)

    vocab_size = KB.number_of_nodes()
    for num in range(vocab_size, vocab_size + n_new):
        KB.add_node(str(num), number=num)

    # NOTE(review): as before, the output skips words[1] and the edge
    # (words[i], words[i+1]) is tagged with the label of row i-1 -- confirm
    # that this offset is intended.
    words_n = list(words[:1])
    for i in range(1, len(words) - 1):
        node_name = str(labels[i - 1] + vocab_size)
        words_n.extend([node_name, words[i + 1]])
        KB.add_edge(words[i], words[i + 1], node=node_name)

    return np.array(words_n), KB, next_lvl_cent

## Our Method - Maintained

In [11]:
def new2(words_l, words_t, vocab_size):
    """Train embeddings on ``words_l``, then on ``words_t`` over the same graph.

    Both corpora are first restricted to the 500 most frequent tokens of
    ``words_l``. Training hyper-parameters (``batch_size``,
    ``embedding_size``, ``skip_window``, ``num_skips``, ``valid_size``,
    ``valid_window``, ``num_sampled``, ``num_steps``) are read from the
    notebook's module-level settings.

    Args:
        words_l: tokens of the first corpus; also defines the vocabulary.
        words_t: tokens of the second corpus.
        vocab_size: passed to ``W2V2`` as ``vocabulary_size``.

    Returns:
        ``(embeddings_l, embeddings_t, KB)`` -- the normalized embeddings
        trained on each corpus and the word graph mapping tokens to row
        numbers. (Previously both matrices were computed and then dropped.)
    """
    global data_index
    data_index = 0

    # A set keeps the per-token membership tests O(1).
    m_common = {item[0] for item in collections.Counter(words_l).most_common(n=500)}
    words_l = np.array([word for word in words_l if word in m_common])
    KB = prep_graph(words_l)

    valid_examples = np.random.choice(valid_window,
                                      valid_size, replace=False)
    # Single level for now; the i > 0 branch is the hook for stacking levels.
    for i in range(1):
        if i > 0:
            words_l, KB, fit = add_level2(words_l, embeddings_l, KB, 500)
        data_index = 0
        #vocab_size = KB.number_of_nodes()
        revdic = {node[1]['number']: node[0] for node in KB.nodes(True)}
        data_l = [KB.node[word]['number'] for word in words_l]
        embeddings_l = W2V2(batch_size, embedding_size, skip_window,
                            num_skips, valid_size, valid_window, valid_examples,
                            num_sampled, vocab_size, 10000, data_l, revdic)

    # Splice existing edge nodes into the second corpus. The result is built
    # as a new list: inserting into the array while iterating over it shifts
    # every later index, so pairs after the first insertion were compared
    # against the wrong neighbour.
    kept_t = [word for word in words_t if word in m_common]
    expanded = []
    for current, following in zip(kept_t[:-1], kept_t[1:]):
        expanded.append(current)
        if (KB.has_node(current) and KB.has_node(following)
                and KB.has_edge(current, following)):
            expanded.append(str(KB.edge[current][following]['node']))
    expanded.extend(kept_t[-1:])
    words_t = np.array(expanded)

    data_index = 0
    data_t = [KB.node[word]['number'] for word in words_t if word in KB.node]
    embeddings_t = W2V2(batch_size, embedding_size, skip_window,
                        num_skips, valid_size, valid_window, valid_examples,
                        num_sampled, vocab_size, num_steps, data_t, revdic)

    return embeddings_l, embeddings_t, KB
    

## Actual Test Area

In [12]:
# load() returns (tokens of alw.txt, tokens of ppp.txt), so words_l holds the
# alw.txt tokens and words_t the ppp.txt tokens.
words_l, words_t = load()

In [15]:
vocab_size = 500

# Restrict the corpus to its 500 most frequent tokens.
m_common = [token for token, _ in collections.Counter(words_l).most_common(n=500)]
words_l = np.array([token for token in words_l if token in m_common])

# Number every remaining token, then express the corpus as a sequence of
# those numbers (revdic goes the other way, number -> token).
KB = prep_graph(words_l)
revdic = {attrs['number']: name for name, attrs in KB.nodes(True)}
data_l = [KB.node[token]['number'] for token in words_l]

# Validation ids are drawn without replacement from range(valid_window).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Train for 10000 steps on the filtered corpus.
embeddings_l = W2V2(
    batch_size, embedding_size, skip_window, num_skips,
    valid_size, valid_window, valid_examples,
    num_sampled, vocab_size, 10000, data_l, revdic,
)

Initialized
Average loss at step  0 :  161.569137573
Average loss at step  2000 :  5.26270027351
Average loss at step  4000 :  4.2121324532
Average loss at step  6000 :  4.1391200037
Average loss at step  8000 :  4.10277059472


In [16]:
# Node number of every corpus token that KB knows, in corpus order.
data = [KB.node[token]['number'] for token in words_l if token in KB.node]
# One embedding row per corpus position.
embed_data = np.array([embeddings_l[number] for number in data])
# Row k holds embed_data[k + 1] - embed_data[k].
next_lvl_raw = np.diff(embed_data, axis=0)

In [20]:
# Start a Birch model with n_clusters=None; the final cluster count is set
# in a later cell through set_params.
fit = Birch(n_clusters=None)

In [27]:
# Feed rows 20000..29999 of next_lvl_raw to the Birch model in 500-row chunks.
# NOTE(review): the execution counters suggest this cell may have been re-run
# with other ranges; only this range is recorded -- confirm which rows the
# model has actually seen.
for i in range(40, 60):
    fit.partial_fit(next_lvl_raw[i*500:(i+1)*500])

In [31]:
# Request 500 final clusters, then call partial_fit without data: per the
# scikit-learn Birch documentation, X=None runs only the global clustering
# step on the subclusters collected so far.
fit.set_params(n_clusters=500)
fit.partial_fit()

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=500,
   threshold=0.5)

In [36]:
# Cluster label for every difference vector (shown as the cell output).
fit.predict(next_lvl_raw)

array([100,  67, 113, ...,  49, 394, 256])

In [38]:
# Mini-batch k-means alternative with 500 clusters. The batch size is 5% of
# the data but at least 501 rows; it is cast to int because the 5% heuristic
# yields a float and MiniBatchKMeans documents batch_size as an integer.
fit2 = MiniBatchKMeans(n_clusters=500,
                       batch_size=int(max(len(next_lvl_raw) * .05, 500 + 1)),
                       max_iter=100000)