### Gazetteer Stats

In [1]:
import sys
import itertools
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

def get_tokens(filename):
    """Return every whitespace-separated token of ``filename``, lower-cased.

    The tokens come back as one flat list in file order; repeated tokens
    are kept.
    """
    with open(filename, 'r') as handle:
        return [
            token
            for line in handle
            for token in line.strip().lower().split()
        ]
    
def get_word_counts(filename):
    """Count how often each token returned by ``get_tokens`` occurs.

    Returns a ``collections.Counter`` mapping token -> number of occurrences
    in ``filename``; tokens that never occur report a count of 0.
    """
    return Counter(get_tokens(filename))
    
def plot_word_frequencies(filename):
    """Return the log "frequency of frequencies" for the tokens in ``filename``.

    Despite the name, nothing is plotted here. The result is a list of 499
    values: entry ``i - 1`` (for ``i`` from 1 to 499) is ``np.log`` of the
    number of distinct tokens that occur exactly ``i`` times, or the integer
    ``0`` when no token has that count.
    """
    occurrences = get_word_counts(filename)
    # Histogram over the per-token counts: count value -> number of tokens.
    tokens_per_count = Counter(occurrences.values())
    return [
        np.log(tokens_per_count[i]) if i in tokens_per_count else 0
        for i in range(1, 500)
    ]

# Token frequency tables: first the general word list, then the name list.
word_count, name_count = map(
    get_word_counts,
    ('../data/words.txt', '../data/names.txt'),
)

In [2]:
# Multiplier applied to the raw word-list count before it is compared with the
# name-list count.
# NOTE(review): the origin of this value is not shown in the notebook --
# presumably a corpus-size ratio; confirm before changing it.
WORD_COUNT_SCALE = 32.4179104

# Factor that shrinks the raw pseudo-counts before they are used as priors.
PRIOR_SCALE = 0.01

# word -> (alpha, beta) pseudo-counts: "seen as a name" vs. "seen as a word".
priors = {}

# Read the vocabulary as UTF-8 so the keys match the tokens produced from the
# UTF-8-decoded sentence files, regardless of the platform's default encoding.
with open('../data/ner_on_html/vocab.words.txt', encoding='utf-8') as f:
    for line in f:
        word = line.strip().lower()
        a = name_count[word]
        b = round(word_count[word] * WORD_COUNT_SCALE)
        # Remove the name occurrences from the word-side count; when the name
        # count exceeds the scaled word count, fall back to 1.
        b = 1 if a > b else b - a
        # Scale priors; the +1 keeps both pseudo-counts strictly positive.
        priors[word] = (round(PRIOR_SCALE * a) + 1, round(PRIOR_SCALE * b) + 1)

In [3]:
def map_est(k, n, a, b):
    """Return ``(k + a - 1) / (n + a + b - 2)``.

    This is the posterior mode (MAP estimate) of a success probability with
    a Beta(a, b) prior after observing ``k`` successes in ``n`` trials.
    Raises ``ZeroDivisionError`` when ``n + a + b == 2``.
    """
    numerator = k + a - 1
    denominator = n + a + b - 2
    return numerator / denominator

# One [k, n] tally per word that has a prior, all starting at zero.
counts = {p: [0, 0] for p in priors}

# word -> current estimate of P(label == 1); filled in by update_probs().
probs = {}


def update_probs():
    """Recompute ``probs`` for every word in ``priors`` from ``counts``.

    ``counts[word]`` supplies the ``(k, n)`` pair and ``priors[word]`` the
    ``(alpha, beta)`` pair. Passing ``1 + alpha`` / ``1 + beta`` to
    ``map_est`` makes the estimate ``(k + alpha) / (n + alpha + beta)``.
    Entries of the module-level ``probs`` dict are overwritten in place.
    """
    for word, (alpha, beta) in priors.items():
        k, n = counts[word]
        probs[word] = map_est(k, n, 1 + alpha, 1 + beta)


update_probs()

In [7]:
import tensorflow as tf
import numpy as np
from random import shuffle

def get_sentences(f):
    """Parse a blank-line-separated token file into sentences.

    Args:
        f: Path of a UTF-8 text file with one token per line (whitespace
            separated columns) and blank lines between sentences.

    Returns:
        A list of sentences; each sentence is a list of rows and each row is
        the list of whitespace-separated fields of one line.

    Blank or whitespace-only lines only ever act as separators, so runs of
    several blank lines never produce empty rows or empty sentences (which
    would make ``sentence[0][0]`` fail for callers).
    """
    sentences = []
    current = []
    with open(f, 'r', encoding="utf-8") as handle:
        for line in handle:
            fields = line.split()
            if fields:
                current.append(fields)
            elif current:
                # Blank line: close the sentence collected so far.
                sentences.append(current)
                current = []
    # The file may end without a trailing blank line.
    if current:
        sentences.append(current)
    return sentences
    
# Load the 'test' split, drop -DOCSTART- marker sentences, and reduce every
# token row to (token, label) where label is 0 for tag 'O' and 1 otherwise.
sentences = [
    [(t[0], 0 if t[1] == 'O' else 1) for t in s]
    for s in get_sentences('../data/ner_on_html/test')
    if s[0][0] != '-DOCSTART-'
]

def get_random_labels(sentence):
    """Sample one 0/1 label per token of ``sentence``.

    Each token ``t`` is drawn independently with ``np.random.choice`` and is
    1 with probability ``probs[t]`` (the module-level table). Every token
    must therefore be a key of ``probs``; an unknown token raises ``KeyError``.
    """
    return [
        np.random.choice(a=[0, 1], p=[1 - probs[token], probs[token]])
        for token in sentence
    ]

# Graph inputs for ONE sentence at a time (all three are rank-1 placeholders):
#   words            - the token strings
#   label_ids        - labels used only for the accuracy metric
#   train_label_ids  - labels used only for the training loss
words = tf.placeholder(tf.string, shape=(None,), name='words')
label_ids = tf.placeholder(tf.int32, shape=(None,), name='label_ids')
train_label_ids = tf.placeholder(tf.int32, shape=(None,), name='train_label_ids')

# Two-class one-hot encoding of the training labels (target of the loss below).
labels = tf.one_hot(train_label_ids, 2)

# String -> integer id lookup built from the vocabulary file; tokens missing
# from the file are mapped into the single extra OOV bucket.
vocab_words = tf.contrib.lookup.index_table_from_file(
    '../data/ner_on_html/vocab.words.txt', num_oov_buckets=1
)

word_ids = vocab_words.lookup(words)
glove = np.load('../data/ner_on_html/glove.npz')['embeddings']
# Append one all-zero row of width 300 to the pretrained matrix.
# NOTE(review): presumably this row serves the OOV bucket id, which assumes
# glove has exactly one 300-d row per vocabulary line -- confirm.
variable = np.vstack([glove, [[0.] * 300]])
# Embeddings are frozen (trainable=False).
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
word_embs = tf.nn.embedding_lookup(variable, word_ids)

with tf.variable_scope('lstm', reuse=tf.AUTO_REUSE):
    # Forward and backward LSTM cells with 100 units each.
    lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(100)
    lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(100)

    # Add a leading axis of size 1 so the sentence is fed as a batch of one.
    word_embs = tf.reshape(word_embs, [1, -1, 300])
    (output_fw, output_bw), (_, _) = tf.nn.bidirectional_dynamic_rnn(
        lstm_cell_fw, lstm_cell_bw, word_embs,
        dtype=tf.float32
    )

    # Concatenate both directions (100 + 100) and flatten back to one
    # 200-wide row per token.
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.reshape(output, [-1, 200])

    # Per-token scores for the two classes.
    logits = tf.layers.dense(output, 2)

    # Uncomment to predict based on priors only.
    # pred_ids = tf.argmax(labels, axis=-1)    
    pred_ids = tf.argmax(logits, axis=-1)
    # Accuracy is measured against label_ids, not against the training labels.
    correct = tf.equal(tf.to_int32(pred_ids), label_ids)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    # The loss is computed against `labels`, i.e. the one-hot train_label_ids.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))
    train_step = tf.train.AdamOptimizer().minimize(loss)
    
# Reset the per-word [positive predictions, total predictions] tallies.
counts = {p: [0, 0] for p in priors}

with tf.Session() as sess:
    tf.tables_initializer().run()
    tf.initializers.global_variables().run()

    for _ in range(30):
        # Tallies gathered during this pass; they replace `counts` afterwards.
        new_counts = {p: [0, 0] for p in priors}
        a = []

        shuffle(sentences)
        for s in sentences:
            words_ = [token.lower() for token, _label in s]
            labels_ = [label for _token, label in s]
            # Training targets are sampled from the current per-word
            # probabilities; labels_ is fed only for the accuracy metric.
            train_labels = get_random_labels(words_)
            feed = {
                words: words_,
                label_ids: labels_,
                train_label_ids: train_labels,
            }
            acc, preds, _ = sess.run([accuracy, pred_ids, train_step], feed_dict=feed)
            a.append(acc)

            # Every prediction adds 5 to the word's total, and another 5 to
            # its positive tally when the model predicted class 1.
            for word, pred in zip(words_, preds):
                tally = new_counts[word]
                if pred == 1:
                    tally[0] += 5
                tally[1] += 5

        print('Acc:', sum(a)/float(len(a)))
        # Feed this pass's predictions back into the sampling probabilities.
        counts = new_counts
        update_probs()

print('All other: 0.89')
print('All prior: 0.8054')

Acc: 0.821717194559
Acc: 0.838790433871
Acc: 0.841318442645
Acc: 0.841968317872
Acc: 0.84158705471
Acc: 0.841719799912
Acc: 0.841105207079


KeyboardInterrupt: 

In [59]:
# Inspect a single word: its (alpha, beta) prior and its latest [k, n] tally.
n = 'ronald'

print(priors[n], counts[n], sep='\n')

# Total prior pseudo-count relative to the word's total tally.
print(sum(priors[n]) / counts[n][1])

(13, 8)
[20, 30]
0.7


In [32]:
# Scratch experiment: for every matrix in a batch, mark which pairs of rows
# are identical. x has shape (2, 3, 3): 2 matrices of 3 rows with 3 values.
x = tf.constant([
    [
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
    ],
    [
        [1, 2, 3],
        [4, 5, 6],
        [6, 7, 9],
    ]
], dtype=tf.int64)

# Move the row axis to the front: both Q and K have shape (3, 2, 3),
# indexed as [row, batch, value].
Q = tf.transpose(x, [1, 0, 2]) 
K = tf.transpose(x, [1, 0, 2]) 

# map_fn iterates over the first (row) axis of K. Each k has shape (2, 3) and
# broadcasts against Q, so tf.equal yields (3, 2, 3); reduce_all over the last
# axis leaves a (3, 2) table of "row of Q equals this row of K, per batch".
# Stacked over K's rows, A has shape (3, 3, 2) = [k_row, q_row, batch].
A = tf.map_fn(
    lambda k: tf.cast(tf.reduce_all(tf.equal(Q, k), axis=-1), tf.int64),
    K
)

# Reorder to [batch, q_row, k_row]: z[b, i, j] is 1.0 when rows i and j of
# x[b] are identical, else 0.0.
z = tf.cast(tf.transpose(A, [2, 1, 0]), tf.float32)
# NOTE(review): the alternative below refers to `y`, which is not defined in
# this cell.
# z = tf.cast(tf.reduce_sum(tf.cast(tf.equal(x, y), tf.int32), axis=-1), tf.float32)


with tf.Session() as sess:
    sess.run([tf.initializers.global_variables(), tf.tables_initializer()])
    res = sess.run(z)
    print(res)

[[[ 1.  1.  1.]
  [ 1.  1.  1.]
  [ 1.  1.  1.]]

 [[ 1.  0.  0.]
  [ 0.  1.  0.]
  [ 0.  0.  1.]]]
