In [1]:
import pandas as pd
import random
import numpy as np
import networkx as nx
import nltk
import tensorflow as tf
import collections
import math
import os.path
from six.moves import xrange
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
import glob 

In [9]:
Doc = collections.namedtuple('Doc',['investigator', 'amount', 'abstract'])

def load2(fn):
    doc = nltk.data.load(fn)
    wrds = nltk.tokenize.wordpunct_tokenize(doc)
    investigator = " ".join(wrds[wrds.index("Investigator")+2:
                          min(wrds.index('@' if '@' in wrds else "Abstract", 
                                          wrds.index("Investigator"))-1, 
                              wrds.index('(', wrds.index("Investigator")))])
    amount = int(wrds[wrds.index("Amt") + 4: wrds.index('(', wrds.index("Amt"))][0])
    abstract = " ".join(wrds[wrds.index('Abstract')+2:])
    return Doc(investigator, amount, abstract)



In [10]:
def complete(KB, data):
    data_c = list([data[0]])
    for index, clust in enumerate(data[:-1]):
            i = 0
            ed = KB.get_edge_data(clust, data[index+1])
            while ed and i < 5:
                data_c.append(ed['number'])
                ed = KB.get_edge_data(ed['number'], data[index+1])
                i += 1
            data_c.append(data[index+1])
    return data_c

In [23]:
def add_level(KB, embeds, data_c, fit):
    embed_data = np.array([embeds[i] for i in data_c])
    flatten = lambda l: [item for sublist in l for item in sublist]
    nl_raw = [embed_data[0]] + flatten(
                np.array([[(sel - embed_data[ind-1]), sel] for ind, sel 
                       in enumerate(embed_data[1:], start=1)]))
    
    fit.partial_fit(nl_raw)
    nl_data = fit.predict(nl_raw)
    
    for i in range(0, len(nl_data)-1, 2):
        KB.add_edge(nl_data[i], nl_data[i+2], number=nl_data[i+1])

    return KB, list(nl_data), fit

In [24]:
f_index = 0
def gen_batch2(KB, embeds, dictionary, filenames, 
               window, fit):
    global f_index
    ab = []
    for i in range(f_index, f_index+4):
        ab += load2(filenames[f_index]).abstract.split(" ")
    data = []
    for word in ab:
        if word in dictionary:
            data.append(dictionary[word])
        else:
            dictionary[word] = random.randint(0, len(embeds)-1)
            data.append(dictionary[word])
    
    data_c = complete(KB, data)
    KB, nl_data, fit = add_level(KB, embeds, data_c, fit)
    labels = [nl_data[i-window: i] + nl_data[i+1: i+window+1] 
              for i in range(window, len(nl_data)-window)]
    f_index += 4
    return nl_data, labels, dictionary, KB, fit
    

In [33]:
def W2V2(KB, embedding_size, skip_window, num_skips, valid_size,
       valid_window, valid_examples, num_sampled, vocabulary_size,
       num_steps, filenames):
    graph = tf.Graph()
    
    with graph.as_default():
        def weight_summary(var, name):
          """Attach a lot of summaries to a Tensor."""
          with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.scalar_summary('mean/' + name, mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.scalar_summary('stddev/' + name, stddev)
            tf.scalar_summary('max/' + name, tf.reduce_max(var))
            tf.scalar_summary('min/' + name, tf.reduce_min(var))
            tf.histogram_summary(name, var)

                
        train_inputs = tf.placeholder(tf.int32)
        train_labels = tf.placeholder(tf.int32, shape=[None, skip_window*2])
        

        with tf.device('/cpu:0'):
            gaussians = tf.random_normal((vocabulary_size, embedding_size))

            embeddings = tf.Variable(tf.mul(tf.div(1.0, tf.sqrt(tf.reduce_sum(
                        tf.square(gaussians), reduction_indices=1,
                        keep_dims=True))), gaussians), name="emb")
#             embeddings = tf.Variable(
#                             tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), name="emb")
            weight_summary(embeddings, 'embeddings')
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], 
                               stddev=1.0 / math.sqrt(embedding_size)), name="nw")
        weight_summary(nce_weights, 'nce_weights')
        
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name="nb")
        weight_summary(nce_biases, 'nce_biases')
        
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                          num_sampled, vocabulary_size, num_true=skip_window*2))
        #scalar_summary(loss, 'loss')
        optimizer = tf.train.AdamOptimizer().minimize(loss)
        
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 
                                     1, keep_dims=True))
        #scalar_summary(norm, 'norm')
        
        normalized_embeddings = embeddings / norm
        weight_summary(normalized_embeddings, 'normalized_embeddings')
        
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        weight_summary(valid_embeddings, 'valid_embeddings')
        
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
        weight_summary(similarity, 'similarity')
        
        merged = tf.merge_all_summaries()
        
        init = tf.initialize_all_variables()
    
    
    with tf.Session(graph=graph) as session:
        
        
        saver = tf.train.Saver([embeddings, nce_weights, nce_biases])
        train_writer = tf.train.SummaryWriter('./summaries' + '/train',
                                      session.graph)
        dictionary = {}
        init.run()
        fit = MiniBatchKMeans(n_clusters=vocabulary_size, 
                              init=embeddings.eval()).fit(embeddings.eval())
        if os.path.isfile('./tmp/nips2.ckpt'):
            saver.restore(session, './tmp/nips1.ckpt')
            print("Restored")
        else:
            print("Initialized")      
        
        average_loss = 0
        for step in xrange(num_steps):
            print(step)
            batch_inputs, batch_labels, dictionary, KB, fit = gen_batch2(
                KB, embeddings.eval(), dictionary, filenames, skip_window, fit)
            batch_inputs = np.array(batch_inputs[:int(len(batch_inputs)/skip_window)])
            batch_labels = np.array(batch_labels[:len(batch_inputs)])
            if len(batch_inputs) > 100:
                while len(batch_inputs) > batch_size:
                    feed_dict = {train_inputs : batch_inputs[:batch_size], 
                         train_labels : batch_labels[:batch_size]}
                    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                    average_loss += loss_val
                    batch_inputs = batch_inputs[batch_size:]
                    batch_labels = batch_labels[batch_size:]
                
                feed_dict = {train_inputs : batch_inputs, 
                         train_labels : batch_labels}
                
                if step % 10 == 0: 
                    _, loss_val, summary = session.run([optimizer, loss, merged], feed_dict=feed_dict)
                    average_loss += loss_val
                    #train_writer.add_summary(summary, step)
                else:
                    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                    average_loss += loss_val
        
                if step % 50 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    print("Average loss at step ", step, ": ", average_loss)
                    average_loss = 0

                # Note that this is expensive (~20% slowdown if computed every 500 steps)
                if step % 10000 == 0 and step > 0:
                    sim = similarity.eval()
                    for i in xrange(valid_size):
                        valid_word = revdic[valid_examples[i]]
                        top_k = 8 # number of nearest neighbors
                        nearest = (-sim[i, :]).argsort()[1:top_k+1]
                        log_str = "Nearest to %s:" % valid_word
                        for k in xrange(top_k):
                            close_word = revdic[nearest[k]]
                            log_str = "%s %s," % (log_str, close_word)
                        print(log_str)
        
        saver.save(session, './tmp/nips1.ckpt')
        final_embeddings = normalized_embeddings.eval()
        return final_embeddings, KB, dictionary

In [34]:

filenames = [fn for fn in glob.iglob('./text/Part*/*/*/*.txt', recursive=False)]
vocabulary_size = 10000
KB = nx.DiGraph()
KB.add_nodes_from(np.arange(10000))
embedding_size = 100  # Dimension of the embedding vector.
skip_window = 2      # How many words to consider left and right.
num_skips = 4         # How many times to reuse an input to generate a label.
#batch = batch[:num_skips * (len(batch)//num_skips)]
batch_size = 256#len(batch)
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 5    # Random set of words to evaluate similarity on.
valid_window = 10  # Only pick dev samples in the head of the distribution.
num_sampled = 20    # Number of negative examples to sample.
num_steps = 1000
valid_examples = np.random.choice(valid_window, 
                                      valid_size, replace=False)

embed = W2V2(KB, embedding_size, skip_window, num_skips, 
                   valid_size, valid_window, valid_examples, num_sampled,
                   vocabulary_size, num_steps, filenames)


  init_size=init_size)


Initialized
0
Average loss at step  0 :  402.736419678
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
Average loss at step  50 :  12.8072569847
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Average loss at step  100 :  13.1081203041
101
102
103


KeyboardInterrupt: 

In [477]:
final_embeddings = embed[0]

In [478]:
KB = embed[1]

In [479]:
dictionary = embed[2]

In [480]:
dictionary

{'similarites': 9380,
 '': 3445,
 'awards': 9390,
 'HRU': 1757,
 'governing': 8197,
 'Syowa': 4142,
 'diffeomorphisms': 2429,
 'Specific': 8772,
 'Tarski': 6915,
 'Electron': 3675,
 'utility': 111,
 'Serrano': 1677,
 'Gastropoda': 3376,
 'tensile': 6347,
 'examined': 3429,
 'latex': 2049,
 'EAWAG': 9582,
 'Systematic': 2157,
 'principle': 7765,
 'biomedical': 9649,
 'inverted': 375,
 'investigating': 5567,
 'War': 631,
 'extant': 6817,
 'transformation': 8558,
 'affected': 7353,
 'standards': 8442,
 'age': 7737,
 'eigenvalues': 2123,
 'Microelectronics': 9369,
 'asks': 3971,
 'internation': 9755,
 'fraction': 6036,
 'huge': 7006,
 'Saclay': 3960,
 'catalogs': 1372,
 'student': 96,
 'encoder': 6902,
 'tidal': 5593,
 'decays': 1817,
 'odor': 544,
 'propagation': 8503,
 'purchase': 9728,
 'difficulty': 4689,
 'avenue': 6108,
 'norms': 6690,
 'precipitated': 4960,
 'axial': 9536,
 'generalization': 6437,
 'duality': 612,
 'micellar': 7430,
 'model': 5546,
 'scattered': 75,
 'nad': 5163,
 '

In [482]:
nx.write_gpickle(KB, './KBNSF1.gpickle')

In [490]:
pd_dictionary = pd.DataFrame.from_dict(dictionary, orient='index').reset_index()

In [491]:
pd_dictionary.to_pickle('./NIFDict1.pickle')

In [492]:
np.save('./embedNIF1', final_embeddings)

In [495]:
test = pd_dictionary.groupby(0)

In [507]:
test.groups

{0: Int64Index([7921], dtype='int64'),
 1: Int64Index([511, 14362], dtype='int64'),
 2: Int64Index([1016, 2588, 3071, 4319], dtype='int64'),
 3: Int64Index([3349, 8849], dtype='int64'),
 4: Int64Index([6206, 7084, 7432], dtype='int64'),
 5: Int64Index([524, 10766, 11094], dtype='int64'),
 6: Int64Index([2747], dtype='int64'),
 7: Int64Index([1255], dtype='int64'),
 8: Int64Index([4411, 14250], dtype='int64'),
 9: Int64Index([3782, 5769, 6508], dtype='int64'),
 11: Int64Index([5813, 6256], dtype='int64'),
 12: Int64Index([8373], dtype='int64'),
 16: Int64Index([7286, 10206], dtype='int64'),
 17: Int64Index([9083], dtype='int64'),
 20: Int64Index([10666, 13240], dtype='int64'),
 21: Int64Index([684, 5254, 6696, 6744], dtype='int64'),
 22: Int64Index([591], dtype='int64'),
 24: Int64Index([460, 3839], dtype='int64'),
 25: Int64Index([1390, 3541, 3978, 7912, 10027, 10525], dtype='int64'),
 26: Int64Index([11557], dtype='int64'),
 27: Int64Index([4557, 9796, 11131], dtype='int64'),
 28: Int

In [505]:
test.get_group(2)

Unnamed: 0,index,0
12969,molybdenum,277
