In [1]:
import pandas as pd
import random
import numpy as np
import networkx as nx
import nltk
import tensorflow as tf
import collections
import math
import os.path
from six.moves import xrange
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import Birch
import  glob

### Load Document Data

In [2]:
Doc = collections.namedtuple('Doc', ['investigator', 'amount', 'abstract'])


def load2(fn):
    """Parse one text file into a ``Doc(investigator, amount, abstract)`` record.

    The file is loaded with ``nltk.data.load`` and split with
    ``wordpunct_tokenize``; every field is then located by token position.

    Args:
        fn: Resource name/path passed straight to ``nltk.data.load``.

    Returns:
        Doc: ``investigator`` and ``abstract`` are space-joined tokens,
        ``amount`` is an ``int``.

    Raises:
        ValueError: if an expected marker token ("Investigator", "Amt",
            "Abstract", "(") is missing, or the amount token is not an integer.
        IndexError: if no token lies between "Amt" + 4 and the next "(".
    """
    text = nltk.data.load(fn)
    tokens = nltk.tokenize.wordpunct_tokenize(text)

    # Name starts two tokens after "Investigator" (skipping the separator).
    name_start = tokens.index("Investigator") + 2
    # It stops one token before the first '@' (or before "Abstract" when the
    # file has no '@'), or at the first '(' -- whichever comes first.
    stop_marker = '@' if '@' in tokens else "Abstract"
    before_marker = tokens.index(stop_marker, tokens.index("Investigator")) - 1
    first_paren = tokens.index('(', tokens.index("Investigator"))
    investigator = " ".join(tokens[name_start:min(before_marker, first_paren)])

    # The amount is the first token found four positions after "Amt" and
    # before the following '('.
    amount_start = tokens.index("Amt") + 4
    amount_stop = tokens.index('(', tokens.index("Amt"))
    amount = int(tokens[amount_start:amount_stop][0])

    # Everything from two tokens after "Abstract" to the end of the file.
    abstract_start = tokens.index('Abstract') + 2
    abstract = " ".join(tokens[abstract_start:])

    return Doc(investigator, amount, abstract)

### Generate Batches Within W2V

In [3]:
data_index = 0


def gen_batch(data, batch_size, skip_window, num_skips):
    """Produce one skip-gram training batch from ``data``.

    Reading resumes at the module-level cursor ``data_index`` and wraps around
    the end of ``data``; the cursor is advanced as a side effect, so successive
    calls walk through the sequence.

    Args:
        data: Indexable sequence of integer ids.
        batch_size: Number of (input, label) pairs; must be a multiple of
            ``num_skips``.
        skip_window: Number of positions considered on each side of the center.
        num_skips: Number of distinct context positions sampled per center;
            at most ``2 * skip_window``.

    Returns:
        (batch, labels): int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding center ids and sampled context ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    centers = np.empty(batch_size, dtype=np.int32)
    contexts = np.empty((batch_size, 1), dtype=np.int32)

    # Sliding window over the data; the deque drops the oldest id on append.
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for group in range(batch_size // num_skips):
        row = group * num_skips
        pick = skip_window      # start on the center so the first draw happens
        used = [skip_window]    # positions already taken (center included)
        for offset in range(num_skips):
            while pick in used:
                pick = random.randint(0, span - 1)
            used.append(pick)
            centers[row + offset] = window[skip_window]
            contexts[row + offset, 0] = window[pick]
        # Slide the window one position to the right.
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    return centers, contexts


### Next Level: Expanding a Sequence and Adding Cluster Edges

In [4]:
def complete(KB, data, max_hops=20):
    """Expand ``data`` by inserting the ids stored on edges between neighbours.

    For every consecutive pair ``(a, b)`` in ``data`` the edge ``(a, b)`` is
    looked up in ``KB``.  While an edge exists, its ``'number'`` attribute is
    emitted and the lookup continues from that number towards ``b``.  The walk
    is capped at ``max_hops`` insertions per pair so that cyclic edge data
    cannot loop forever.

    Args:
        KB: Graph-like object exposing ``get_edge_data(u, v)`` that returns a
            mapping with a ``'number'`` key, or a falsy value when there is
            no edge.
        data: Sequence of node ids.
        max_hops: Maximum number of inserted ids per consecutive pair
            (default 20, the previously hard-coded limit).

    Returns:
        (data_c, data_nl): ``data_c`` is the original sequence with the
        inserted ids interleaved; ``data_nl`` holds only the inserted ids.
        Both are empty lists when ``data`` is empty.
    """
    # An empty sequence used to raise IndexError on data[0].
    if len(data) == 0:
        return [], []

    data_c = [data[0]]
    data_nl = []
    for clust, following in zip(data[:-1], data[1:]):
        hops = 0
        ed = KB.get_edge_data(clust, following)
        while ed and hops < max_hops:
            data_c.append(ed['number'])
            data_nl.append(ed['number'])
            # Continue from the inserted id towards the same destination.
            ed = KB.get_edge_data(ed['number'], following)
            hops += 1
        data_c.append(following)
    return data_c, data_nl

In [5]:
def add_level(KB, embeds, data_c, fit):
    """Label an embedded sequence with ``fit`` and record the labels as edges.

    The sequence fed to ``fit.predict`` is the first embedding followed, for
    every later element, by the pair (difference to the previous embedding,
    the embedding itself).  Each triple of consecutive predictions
    ``(p[i], p[i+1], p[i+2])`` at even ``i`` becomes an edge ``p[i] -> p[i+2]``
    whose ``number`` attribute is ``p[i+1]``.

    NOTE: a later cell in this notebook defines another function that is also
    named ``add_level``; after that cell runs, this definition is shadowed.

    Args:
        KB: Graph-like object exposing ``add_edge(u, v, number=...)``;
            mutated in place.
        embeds: Indexable collection mapping an id to its embedding.
        data_c: Sequence of ids to look up in ``embeds``.
        fit: Fitted model exposing ``predict``.

    Returns:
        (KB, labels): the same graph object and the predictions as a list.
    """
    embed_data = np.array([embeds[idx] for idx in data_c])

    # first vector, then (delta, vector) for each subsequent element
    nl_raw = [embed_data[0]]
    for prev_vec, cur_vec in zip(embed_data[:-1], embed_data[1:]):
        nl_raw.append(cur_vec - prev_vec)
        nl_raw.append(cur_vec)

    nl_data = fit.predict(nl_raw)
    print(len(nl_data))

    # even positions are element labels, odd positions the transition labels
    for left in range(0, len(nl_data) - 1, 2):
        KB.add_edge(nl_data[left], nl_data[left + 2], number=nl_data[left + 1])

    return KB, list(nl_data)

### Word2Vec Parameters

In [6]:
# Default Word2Vec hyper-parameters.
# NOTE: batch_size, embedding_size, skip_window, num_skips, valid_size,
# valid_window, num_sampled and num_steps are all re-assigned in the later
# cell that calls W2V2, so the values below only matter if that cell is skipped.
#words = load()
#KB = prep_graph(words)
batch_size = 256      # Number of (input, label) pairs per training step.
embedding_size = 200  # Dimension of the embedding vector.
skip_window = 3      # How many words to consider left and right.
num_skips = 4         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors, restricted to
# words with a low numeric ID.
# NOTE(review): in W2V2 IDs are assigned in order of first appearance, so a low
# ID means "seen early", not necessarily "most frequent" -- confirm intent.
valid_size = 10    # Random set of words to evaluate similarity on.
valid_window = 200  # Only pick dev samples in the head of the distribution.
num_sampled = 100    # Number of negative examples to sample.
num_steps = 20000    # Number of training steps (batches).

In [7]:
def prep_graph(words):
    """Build a graph with one node per distinct word, tagged with an integer id.

    The words are first collected as nodes of a scratch graph (duplicates
    collapse), ``nx.convert_node_labels_to_integers`` assigns each one an
    integer label, and the result is rebuilt so that the word is the node key
    and the integer is stored in the ``number`` node attribute.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        nx.Graph: edge-less graph keyed by word, each node carrying ``number``.
    """
    unique_words = nx.Graph()
    for token in words:
        # add_node is a no-op for a node that already exists
        unique_words.add_node(token)

    numbered = nx.convert_node_labels_to_integers(unique_words, label_attribute='word')

    word_graph = nx.Graph()
    for number, attrs in numbered.nodes(True):
        word_graph.add_node(attrs['word'], number=number)
    return word_graph

### Add a New Level (call this each time we want to add a new layer)

In [8]:
def add_level(words, embeddings, KB, n_cluster):
    """Cluster the embedded word sequence and add the clusters to ``KB``.

    Steps:
      1. For adjacent words already joined by an edge in ``KB``, splice the
         edge's ``'node'`` attribute into the sequence.
      2. Look up the embedding of every word known to ``KB`` and cluster the
         embeddings with ``MiniBatchKMeans`` into ``n_cluster`` groups.
      3. Add ``n_cluster`` new nodes named ``str(vocab_size + k)`` and, for
         each adjacent pair from position 1 onwards, add an edge whose
         ``'node'`` attribute names the cluster node.

    NOTE: this definition re-uses the name of an earlier ``add_level`` cell
    (with a different signature) and replaces it once this cell has run.
    NOTE(review): ``KB.edge`` / ``KB.node`` look like the networkx 1.x
    attribute API -- confirm the installed networkx version supports them.

    Args:
        words: Sequence (list or 1-D array) of word strings.
        embeddings: Indexable by the integer stored in each node's
            ``'number'`` attribute.
        KB: Graph whose nodes carry a ``'number'`` attribute; mutated in place.
        n_cluster: Number of clusters, i.e. number of new nodes to add.

    Returns:
        (words_n, KB): a numpy string array interleaving cluster-node names
        with the words, and the mutated graph.
    """
    # NOTE(review): `words` is re-bound inside the loop while the loop still
    # walks the original sequence, so `index + 1` drifts after the first
    # insertion -- verify this is intended before relying on the spliced order.
    for index, word in enumerate(words[:-1]):
        if KB.has_node(word) and KB.has_node(words[index+1]):
            if KB.has_edge(word, words[index+1]):
                node_name = KB.edge[word][words[index+1]]['node']
                words = np.insert(words, index+1, str(node_name))

    data = [KB.node[word]['number'] for word in words if word in KB.node]
    embed_data = np.array([embeddings[wordnum] for wordnum in data])
    # NOTE(review): the original also computed the consecutive differences
    # embed_data[1:] - embed_data[:-1] but never used them; clustering runs on
    # the embeddings themselves.  Confirm which one was meant to be clustered.

    # batch_size must be an integer: len(words) * .05 is a float, and a float
    # batch size is not a valid MiniBatchKMeans parameter.
    mbatch = MiniBatchKMeans(n_clusters=n_cluster,
                             batch_size=int(max(len(words) * .05, n_cluster + 1)),
                             max_iter=100000)
    next_lvl_cent = mbatch.fit(embed_data)

    vocab_size = KB.number_of_nodes()

    # One new node per cluster, numbered after the existing vocabulary.
    for num in range(vocab_size, vocab_size+n_cluster):
        KB.add_node(str(num), number=num)

    # NOTE(review): labels_ is indexed by position in `data`, which skips words
    # unknown to KB, while `i` indexes `words`; the two only line up when every
    # word is in KB.  The pair (words[0], words[1]) is also not linked.
    words_n = np.array([words[0]])
    for i in range(1, len(words)-1):
        t = next_lvl_cent.labels_[i-1]
        words_n = np.append(words_n, [str(t+vocab_size), words[i+1]])
        KB.add_edge(words[i], words[i+1], node=str(t+vocab_size))

    return words_n, KB

# Testing That Word2Vec State Is Maintained Across Runs (Checkpoint Save/Restore)

In [13]:
def W2V2(KB, batch_size, embedding_size, skip_window, num_skips, valid_size,
         valid_window, valid_examples, num_sampled, vocabulary_size,
         num_steps, filenames, num_files):
    """Train skip-gram embeddings (NCE loss) on the abstracts of the first files.

    The abstracts of ``filenames[0:num_files]`` are loaded with ``load2``,
    split on single spaces, and every distinct token is given an integer id in
    order of first appearance.  The model is trained for ``num_steps`` batches
    produced by ``gen_batch``, writing summaries to ``./summaries/train`` and
    saving/restoring its variables at ``./tmp/nips1.ckpt``.

    Args:
        KB: Returned unchanged; not used by the training itself.
        batch_size: Number of (input, label) pairs per step.
        embedding_size: Dimension of each embedding vector.
        skip_window: Context size on each side of the center word.
        num_skips: Context words sampled per center word.
        valid_size: Number of validation ids reported in the neighbour log.
        valid_window: Accepted for interface compatibility; not used here.
        valid_examples: Integer ids whose nearest neighbours are logged.
        num_sampled: Number of negative samples for the NCE loss.
        vocabulary_size: Number of rows of the embedding / NCE matrices.
        num_steps: Number of training steps.
        filenames: Paths readable by ``load2``.
        num_files: How many entries of ``filenames`` to read (from index 0).

    Returns:
        (final_embeddings, KB, fit): the row-normalised embedding matrix, the
        ``KB`` argument, and a freshly constructed (unfitted) ``Birch`` model.

    Raises:
        IndexError: if ``filenames`` has fewer than ``num_files`` entries.

    NOTE(review): this uses the pre-1.0 TensorFlow API (``tf.scalar_summary``,
    ``tf.merge_all_summaries``, ``tf.train.SummaryWriter``,
    ``tf.initialize_all_variables``) -- confirm the installed version.
    NOTE(review): nothing checks that the number of distinct tokens stays
    below ``vocabulary_size``; larger ids would index past the embedding table.
    """
    # One checkpoint path for the existence check, the restore and the save.
    # (Previously the check looked for 'nips2.ckpt' while restore/save used
    # 'nips1.ckpt', so a saved model was never restored.)
    ckpt_path = './tmp/nips1.ckpt'

    graph = tf.Graph()

    with graph.as_default():
        def weight_summary(var, name):
            """Attach mean/stddev/max/min scalar summaries and a histogram to a tensor."""
            with tf.name_scope('summaries'):
                mean = tf.reduce_mean(var)
                tf.scalar_summary('mean/' + name, mean)
                with tf.name_scope('stddev'):
                    stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
                tf.scalar_summary('stddev/' + name, stddev)
                tf.scalar_summary('max/' + name, tf.reduce_max(var))
                tf.scalar_summary('min/' + name, tf.reduce_min(var))
                tf.histogram_summary(name, var)

        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        with tf.device('/cpu:0'):
            embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),
                name="emb")
            weight_summary(embeddings, 'embeddings')
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)),
            name="nw")
        weight_summary(nce_weights, 'nce_weights')
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name="nb")
        weight_summary(nce_biases, 'nce_biases')

        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                           num_sampled, vocabulary_size))
        optimizer = tf.train.AdamOptimizer().minimize(loss)

        # Row-wise L2 norm, used to turn dot products into cosine similarity.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),
                                     1, keep_dims=True))

        normalized_embeddings = embeddings / norm
        weight_summary(normalized_embeddings, 'normalized_embeddings')
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        weight_summary(valid_embeddings, 'valid_embeddings')
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
        weight_summary(similarity, 'similarity')

        merged = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

    with tf.Session(graph=graph) as session:
        saver = tf.train.Saver()
        train_writer = tf.train.SummaryWriter('./summaries' + '/train',
                                              session.graph)

        # Initialise first; a restored checkpoint then overwrites the values.
        init.run()
        fit = Birch(threshold=.1, branching_factor=20, n_clusters=None)
        if os.path.isfile(ckpt_path):
            saver.restore(session, ckpt_path)
            print("Restored")
        else:
            print("Initialized")

        # Concatenate the tokens of the requested abstracts.
        dq = []
        for i in range(num_files):
            dq += load2(filenames[i]).abstract.split(" ")

        # Token -> id in order of first appearance, plus the id sequence.
        dictionary = {}
        dat = []
        for word in dq:
            if word not in dictionary:
                dictionary[word] = len(dictionary)
            dat.append(dictionary[word])
        # Reverse lookup for the neighbour report below (it used to reference
        # an undefined name and raised NameError when the report ran).
        revdic = {idx: word for word, idx in dictionary.items()}

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = gen_batch(
                dat, batch_size, skip_window, num_skips)
            feed_dict = {train_inputs: batch_inputs,
                         train_labels: batch_labels}

            if step % 10 == 0:
                # Every 10th step also evaluates and records the summaries.
                _, loss_val, summary = session.run(
                    [optimizer, loss, merged], feed_dict=feed_dict)
                average_loss += loss_val
                train_writer.add_summary(summary, step)
            else:
                _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0 and step > 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    # Rows beyond the observed vocabulary have no word; show
                    # the raw id instead of failing on the lookup.
                    valid_id = valid_examples[i]
                    valid_word = revdic.get(valid_id, '<id %d>' % valid_id)
                    top_k = 8  # number of nearest neighbors
                    # Position 0 is the word itself, so skip it.
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log_str = "Nearest to %s:" % valid_word
                    for k in xrange(top_k):
                        close_word = revdic.get(nearest[k], '<id %d>' % nearest[k])
                        log_str = "%s %s," % (log_str, close_word)
                    print(log_str)

        # Flush and release the event file once training is done.
        train_writer.close()

        # Make sure the checkpoint directory exists before saving.
        ckpt_dir = os.path.dirname(ckpt_path)
        if ckpt_dir and not os.path.isdir(ckpt_dir):
            os.makedirs(ckpt_dir)
        saver.save(session, ckpt_path)
        final_embeddings = normalized_embeddings.eval()
        return final_embeddings, KB, fit

In [15]:
# Collect the input text files and set the hyper-parameters for this run
# (these override the defaults from the earlier parameter cell).
# NOTE: W2V2 indexes filenames[0 .. num_files-1], so the glob must match at
# least num_files files or the call below raises IndexError.
filenames = [fn for fn in glob.iglob('./text/Part*/*/*/*.txt', recursive=False)]
num_files = 100          # Number of files whose abstracts are used for training.
vocabulary_size = 10000  # Rows in the embedding / NCE weight matrices.
KB = nx.DiGraph()
# NOTE(review): the literal 10000 presumably mirrors vocabulary_size -- keep
# the two in sync if either changes.
KB.add_nodes_from(np.arange(10000))
embedding_size = 100  # Dimension of the embedding vector.
skip_window = 3      # How many words to consider left and right.
num_skips = 4         # How many times to reuse an input to generate a label.
#batch = batch[:num_skips * (len(batch)//num_skips)]
batch_size = 256#len(batch)
# We pick a random validation set to sample nearest neighbors, restricted to
# words with a low numeric ID.
# NOTE(review): W2V2 assigns IDs in order of first appearance, so a low ID
# means "seen early", not necessarily "most frequent" -- confirm intent.
valid_size = 5    # Random set of words to evaluate similarity on.
valid_window = 10  # Only pick dev samples in the head of the distribution.
num_sampled = 20    # Number of negative examples to sample.
num_steps = 8000    # Number of training steps.
# valid_size distinct ids drawn from range(valid_window).
valid_examples = np.random.choice(valid_window, 
                                      valid_size, replace=False)

# W2V2 returns a 3-tuple (final_embeddings, KB, fit); `embed` holds the tuple.
embed = W2V2(KB, batch_size, embedding_size, skip_window, num_skips, 
                   valid_size, valid_window, valid_examples, num_sampled,
                   vocabulary_size, num_steps, filenames, num_files)


Initialized
Average loss at step  0 :  93.8474197388
Average loss at step  2000 :  58.8172911773
Average loss at step  4000 :  29.4206732301


KeyboardInterrupt: 

In [20]:
# Sanity check: how many files the glob pattern matched.
len(filenames) 

0

### Adding Levels

In [8]:
def add_level2(words, embeddings, KB, n_cluster, fit=None):
    """Cluster the embedded word sequence with Birch and add the clusters to ``KB``.

    Works like ``add_level`` but uses ``Birch`` and can continue from an
    existing model: when ``fit`` is given, its ``n_clusters`` is raised by
    ``n_cluster`` and the model is updated with ``partial_fit``; otherwise a
    new ``Birch`` model is fitted from scratch.

    NOTE(review): ``KB.edge`` / ``KB.node`` look like the networkx 1.x
    attribute API -- confirm the installed networkx version supports them.

    Args:
        words: Sequence (list or 1-D array) of word strings.
        embeddings: Indexable by the integer stored in each node's
            ``'number'`` attribute.
        KB: Graph whose nodes carry a ``'number'`` attribute; mutated in place.
        n_cluster: Number of clusters / new nodes to add at this level.
        fit: Optional existing ``Birch`` model to extend.

    Returns:
        (words_n, KB, model): a numpy string array interleaving cluster-node
        names with the words, the mutated graph, and the fitted model.
    """
    # NOTE(review): `words` is re-bound inside the loop while the loop still
    # walks the original sequence, so `index + 1` drifts after the first
    # insertion -- verify this is intended.
    for index, word in enumerate(words[:-1]):
        if KB.has_node(word) and KB.has_node(words[index+1]):
            if KB.has_edge(word, words[index+1]):
                node_name = KB.edge[word][words[index+1]]['node']
                words = np.insert(words, index+1, str(node_name))

    data = [KB.node[word]['number'] for word in words if word in KB.node]
    embed_data = np.array([embeddings[wordnum] for wordnum in data])
    # NOTE(review): the original also computed the consecutive differences of
    # embed_data but never used them; clustering runs on the embeddings.

    if fit is None:
        mbatch = Birch(n_clusters=n_cluster, branching_factor=200)
        # Bind the fitted model to the name used below; the original stored it
        # under a different name, so this branch always ended in NameError.
        next_lvl_cent = mbatch.fit(embed_data)
    else:
        # A model built with n_clusters=None has no count to add to yet.
        fit.set_params(n_clusters=(fit.n_clusters or 0) + n_cluster)
        next_lvl_cent = fit.partial_fit(embed_data)
    vocab_size = KB.number_of_nodes()

    # One new node per cluster, numbered after the existing vocabulary.
    for num in range(vocab_size, vocab_size+n_cluster):
        KB.add_node(str(num), number=num)

    # NOTE(review): labels_ is indexed by position in `data`, which skips words
    # unknown to KB, while `i` indexes `words`; they only line up when every
    # word is in KB.
    words_n = np.array([words[0]])
    for i in range(1, len(words)-1):
        t = next_lvl_cent.labels_[i-1]
        words_n = np.append(words_n, [str(t+vocab_size), words[i+1]])
        KB.add_edge(words[i], words[i+1], node=str(t+vocab_size))

    return words_n, KB, next_lvl_cent