In [10]:
import pandas as pd
import random
import numpy as np
import networkx as nx
import nltk
import tensorflow as tf
import collections
import math
from six.moves import xrange
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans

In [11]:
from functools import partial

class Infix(object):
    def __init__(self, func):
        self.func = func
    def __or__(self, other):
        return self.func(other)
    def __ror__(self, other):
        return Infix(partial(self.func, other))
    def __call__(self, v1, v2):
        return self.func(v1, v2)

## Read In

In [12]:
def load():
    ppp = nltk.data.load('../../Downloads/ppp.txt', encoding='utf8')
    words_p = nltk.tokenize.wordpunct_tokenize(ppp)[130:]
    alw = nltk.data.load('../../Downloads/alw.txt', encoding='utf8')
    words_a = nltk.tokenize.wordpunct_tokenize(alw)[143:]
    words = words_a + words_p
    return words

## Turn into list of Sentences

In [13]:
def get_sentences(words):
    sentences_word  = []
    sent_word = []
    for index, word in enumerate(words):
        if word in ['?','.','!']:
            sent_word += [word]
            sentences_word += [sent_word]
            sent_word = []
        else:
            sent_word += [word]
    return sentences_word

## Turn Vocab into Indices
-- This was process, but I think this is useful regardless. I will need to change it to accomodate ..... stuff?

In [14]:
def build_vocab(words, vocabulary_size):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, dictionary, reverse_dictionary

## Set Up Graph Building

Basically we want to make sure that things are being added to the graph. How do we do this. Well. I think we should take them as if they were read in line and then only add things like that. This way we can do all our learning right good without having to stop and add to the graph

Adding node likelihood : http://ieeexplore.ieee.org/document/7266560/

In [15]:
def graph(KB, embeddings, sentence, max_nodes):
    def next_lvl(sentence):
        @Infix
        def to(a, b):
            x = embeddings[sentence[b]] - embeddings[sentence[a]]
            return x
        
        nl = []
        ln = []
        for index in range(len(sentence[:-1])):
            if KB.has_edge(sentence[index], sentence[index+1]):
                nl.append(KB[sentence[index]][sentence[index+1]]['node'])
            else:
                if KB.number_of_nodes()+1 < max_nodes:
                    KB.add_node(KB.number_of_nodes(), exp=.5)
                    KB.add_edge(sentence[index], sentence[index+1],
                               node=KB.number_of_nodes())
                    embeddings[KB.number_of_nodes()] = index |to| (index+1)
                else:
                    x = index |to| (index+1)
                    similarity = cosine_similarity(embeddings, [x])
                    nearest = np.argmax(similarity)
                    exp = KB.node[nearest]['exp']
                    embeddings[nearest] = embeddings[nearest]*exp + x*(1-exp)
                    KB.node[nearest]['exp'] += (1-exp)**3
                    if 1-similarity.max() > 1-exp:
                        nl.append(nearest)
                    
                    if not KB.has_edge(sentence[index], sentence[index+1]):
                        KB.add_edge(sentence[index], sentence[index+1], 
                                    node=nearest)
        return nl
    
    
    l_all = [sentence]
    l_next = sentence
    while l_next:
        l_next = next_lvl(l_next)
        print(l_next)
        l_all += l_next
    return KB, embeddings, l_all[:-1]

In [17]:
def prep_graph(words):
    KB2 = nx.Graph()
    for word in words:
        if not KB2.has_node(word):
            KB2.add_node(word)
        
    KB3 = nx.convert_node_labels_to_integers(KB2, label_attribute='word')
    KB4 = nx.Graph()
    
    for node in KB3.nodes(True):
        KB4.add_node(node[1]['word'], number=node[0])
    return KB4

In [138]:
def add_level(words, embeddings, KB, n_cluster):
    
    for index, word in enumerate(words[:-1]):
        if KB.has_node(word) and KB.has_node(words[index+1]):
            if KB.has_edge(word, words[index+1]):
                node_name = KB.edge[word][words[index+1]]['node']
                words.insert(index+1, str(node_name))
                
    data = [KB.node[word]['number'] for word in words if word in KB.node]
    
    embed_data = np.array([embeddings[wordnum] for wordnum in data])
    
    next_lvl_raw = embed_data[1:] - embed_data[:-1]
    next_lvl_cent = TFKMC(next_lvl_raw, n_clusters=n_cluster)

    words_n = [words[0]]
    vocab_size = KB.number_of_nodes()
    
    for num in range(vocab_size, vocab_size+n_cluster):
        KB.add_node(str(num), number=num)
    
    for i in range(len(next_lvl_cent.labels_)):
        t = next_lvl_cent.labels_[i]
        words_n.extend([str(t+vocab_size), words[i+1]])
        KB.add_edge(words[i], words[i+1], node=str(t+vocab_size))
        
    
    
    return words_n, KB     
    
    
    

## Generate Each Batch
-- a useful thing would be for batches to be made of few words, a couple sentences, and then be taken from there. idk.

In [19]:
data_index = 0
def gen_batch(data, batch_size, skip_window, num_skips):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels


## Do the Actual Word2Vec Algorithm in TF

In [37]:
def W2V(batch_size, embedding_size, skip_window, num_skips, valid_size,
       valid_window, valid_examples, num_sampled, vocabulary_size,
       num_steps, data, revdic):
    graph = tf.Graph()
    
    with graph.as_default():
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        
        with tf.device('/cpu:0'):
            
            embeddings = tf.Variable(
                            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], 
                               stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                          num_sampled, vocabulary_size))
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 
                                     1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
        
        init = tf.initialize_all_variables()
    
    with tf.Session(graph=graph) as session:
        
        init.run()
        print("Initialized")
        #saver = tf.train.Saver({'In'})
        
        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = gen_batch(
                data, batch_size, skip_window, num_skips)
            feed_dict = {train_inputs : batch_inputs, 
                         train_labels : batch_labels}
            
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val

        
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0 and step > 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = revdic[valid_examples[i]]
                    top_k = 8 # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k+1]
                    log_str = "Nearest to %s:" % valid_word
                    for k in xrange(top_k):
                        close_word = revdic[nearest[k]]
                        log_str = "%s %s," % (log_str, close_word)
                    print(log_str)

        final_embeddings = normalized_embeddings.eval()
        return final_embeddings
        
    

## Set Up Running

In [139]:
prev = 0

words = load()
KB = prep_graph(words)
number_words = KB.number_of_nodes()
batch_size = 256
embedding_size = 100  # Dimension of the embedding vector.
skip_window = 3      # How many words to consider left and right.
num_skips = 4         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 200    # Random set of words to evaluate similarity on.
valid_window = number_words  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 100    # Number of negative examples to sample.
num_steps = 20000
assert batch_size % num_skips == 0

In [140]:
data = []
vocab_size = KB.number_of_nodes()
revdic = {node[1]['number']: node[0] for node in KB.nodes(True)}

words_n = words[prev: prev+10000]
data = [KB.node[word]['number'] for word in words_n]

data, dic, revdic = build_vocab(words, vocab_size)
embeddings = W2V(batch_size, embedding_size, skip_window,
            num_skips, valid_size, valid_window,
            valid_examples, num_sampled, vocab_size,
            num_steps, data, revdic)

prev += 10000
#words_n, KB = add_level(words[prev: prev+10000], embeddings, KB, 500)

Initialized
Average loss at step  0 :  327.263061523
Average loss at step  2000 :  37.4282845838
Average loss at step  4000 :  6.95995280075
Average loss at step  6000 :  5.64904360628
Average loss at step  8000 :  5.35014919043
Average loss at step  10000 :  5.26306878948
Nearest to folding: www, _food_, wherever, _You_, pages, moments, insolence, hoarse,
Nearest to compass: confiding, defects, advise, Every, distress, V, ,, unaffected,
Nearest to affirmative: mad, reproaches, if, vouch, deprive, re, nearly, told,
Nearest to board: appearance, tolerably, invited, graces, pass, exact, thinks, teeth,
Nearest to conveying: 24, Hearts, abrupt, hundred, DISTRIBUTOR, Dawson, )(, barely,
Nearest to resisting: agree, intimacy, Suppose, Ugh, passed, rope, caprice, eight,
Nearest to study: promised, lives, superciliousness, characters, comes, attending, till, exhibiting,
Nearest to approached: exists, pointed, grin, warned, Caterpillar, draughts, facts, _more_,
Nearest to Engaged: pianoforte, B

In [183]:
words_n = words[100000:150000]
words_n, KB = add_level(words_n, embeddings, KB, 1000)
vocab_size = KB.number_of_nodes()
revdic = {node[1]['number']: node[0] for node in KB.nodes(True)}
data = [KB.node[word]['number'] for word in words_n]


  distances = np.zeros(self.batch_size, dtype=np.float64)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples -

In [184]:
data_index = 0
embeddings = W2V(batch_size, embedding_size, skip_window,
            num_skips, valid_size, valid_window,
            valid_examples, num_sampled, vocab_size,
            num_steps, data, revdic)

Initialized
Average loss at step  0 :  387.60723877
Average loss at step  2000 :  69.9865728245
Average loss at step  4000 :  9.01119960183
Average loss at step  6000 :  3.6344371556
Average loss at step  8000 :  2.63510841748
Average loss at step  10000 :  2.29744554491
Nearest to among: beauty, 18863, 19699, 19781, 19408, 19461, evil, pay,
Nearest to facts: breast, 19323, 16217, 19179, appeared, 18915, 10342, 19072,
Nearest to ladder: 11590, bedrooms, obliging, extras, 15786, 19490, bowed, 16736,
Nearest to upper: 18882, 19383, 8991, 9769, own, colouring, 9642, 9119,
Nearest to concealment: 16416, 15912, 18557, 9305, 17676, 13565, $, commended,
Nearest to pet: policy, cool, talked, 18731, 15075, 15025, stronger, 17875,
Nearest to coquetry: if, 11978, am, 9118, mistakes, individual, 12574, 19855,
Nearest to certainty: civilities, 18296, care, 12077, 8828, 17858, 11783, Though,
Nearest to connivance: 15070, 18240, 15875, 14404, 8668, 15272, enjoying, 14316,
Nearest to soothing: 14516, 

In [40]:
def TFKMC(vectors, n_clusters=1000, max_iter=100000):
    mbatch = MiniBatchKMeans(n_clusters=n_clusters, batch_size=len(vectors)*.1, max_iter=max_iter)
    centroids = mbatch.fit(vectors)
    return centroids

In [186]:
for index, word in enumerate(words[:-1]):
        if KB.has_node(word) and KB.has_node(words[index+1]):
            if KB.has_edge(word, words[index+1]):
                node_name = KB.edge[word][words[index+1]]['node']
                words.insert(index+1, str(node_name))

In [224]:
data = [KB.node[word]['number'] for word in words]

In [196]:
counts = collections.Counter()

In [197]:
for word in words:
    counts.update([word])

In [227]:
non_word = list(filter(lambda datum: revdic[datum] == str(datum), data))

In [230]:
nw_revdic = {node[1]['number']: node[0] for node in KB.nodes(True) 
             if str(node[1]['number']) == node[0]}

In [234]:
nw = np.array(list(map(int, non_word)))

In [237]:
nx.to_pandas_dataframe(KB)

Unnamed: 0,12344,15176,11660,fourthly,equipment,frighten,16932,junior,13816,19222,...,listener,governed,tureen,tickets,screamed,9992,11899,16122,19186,10868
12344,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11660,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fourthly,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
equipment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
frighten,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
junior,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [239]:
nx.write_graphml(KB, './L1.graphml')

In [240]:
np.save('./L!', nw)

In [241]:
np.save('./embedL!', embeddings)

## What To Do

### 1. Read Gradient Descent Site
### 2. Read the Candidate Sampling Thing
### 3. Implement KMeans MiniBatch in TensorFlow
### 4. Write Graph Adding Code
### 5. Write Processing for Transitions into Graph
### 6. Implement Word2Vec Using Graphs
