In [1]:
text = "Input text looks something like this. I thinkso..."

In [3]:
from twokenize import tokenize

In [4]:
print tokenize(text)

['Input', 'text', 'looks', 'something', 'like', 'this', '.', 'I', 'thinkso', '...']


In [1]:
def chargram(token, n=3):
    """ Convert word into character level ngrams.
    
    We pad both ends of the word with _ tokens on both ends for `wide` ngrams
    
    Eg:
        
        ['__i', '_in', 'inp', 'npu', 'put', 'ut_', 't__'] = str_to_char_ngrams('input', 3)
    """
    token = '_'*(n-1) + token + '_'*(n-1)
    chargram = []
    for i in range(len(token[:-(n-1)])):
        chargram.append(token[i:i+n])
    return chargram

def chargrams(tokens, n=3):
    return [ chargram(token, n=n) for token in tokens ]
chargrams(['input', 'text'])

[['__i', '_in', 'inp', 'npu', 'put', 'ut_', 't__'],
 ['__t', '_te', 'tex', 'ext', 'xt_', 't__']]

In [2]:
from collections import Counter

class Vocab():
    def __init__(self, all_tokens=None, min_count=5):
        self.min_count=min_count
        self.count_index = Counter()
        self._vocab2idx = {'<PAD>':0,
                           '<UNK>':1}
        self._idx2vocab = {0:'<PAD>',
                           1:'<UNK>'}
        self.vocabset = set(self._vocab2idx.keys())
        self.idxset = set(self._idx2vocab.keys())
        
        if all_tokens:
            self.use(all_tokens)
        
    @property
    def n(self):
        return sum( count for count in self.count_index.values() )
    
    @property
    def pad(self):
        return '<PAD>'
    
    @property
    def ipad(self):
        return 0
    
    def idx(self, token):
        if token in self.vocabset:
            return self._vocab2idx[token]
        else:
            return self._vocab2idx['<UNK>']
        
    def token(self, idx):
        if idx in self.idxset:
            return self._idx2vocab[idx]
        else:
            return self._idx2vocab['<UNK>']
    
    def use(self, tokens):
        self.count_index = Counter()
        self.add(tokens)        
    
    def add(self, tokens):
        for token in tokens:
            self.count_index[token] += 1
        self._vocab2idx = {'<UNK>':0}
        self._vocab2idx.update({token:i+1 for i, (token, count) in enumerate(self.count_index.most_common())
                                if count >= self.min_count})
        self._idx2vocab = {i:token for token, i in self._vocab2idx.items()}
        self.vocabset = set(self._vocab2idx.keys())
        self.idxset = set(self._idx2vocab.keys())
        
    def count(self, token):
        return self.count_index[token]
vocab = Vocab(min_count=1)

In [99]:
vocab.use(tokenize(text))

In [100]:
vocab.n

10

In [101]:
vocab.idx('a')

0

In [3]:
data_fname = '../wnut_ner_evaluation/data/train_notypes'
xs, ys = [], []
with open(data_fname, 'r') as f:
    x, y = [], []
    for i, line in enumerate(f):
        split = line.split()
        if split:
            x.append(split[0])
            y.append(split[1])
        else: 
            xs.append(x)
            ys.append(y)
            x, y = [], []

data_fname = '../wnut_ner_evaluation/data/dev_notypes'
dev_xs, dev_ys = [], []
with open(data_fname, 'r') as f:
    x, y = [], []
    for i, line in enumerate(f):
        split = line.split()
        if split:
            x.append(split[0])
            y.append(split[1])
        else: 
            dev_xs.append(x)
            dev_ys.append(y)
            x, y = [], []

In [4]:
all_chargrams = [ c for x in xs for g in chargrams(x) for c in g ]
xvocab = Vocab(all_chargrams, min_count=0)
yvocab = Vocab([l for y in ys for l in y ], min_count=0)

In [5]:
def pad_tensor(tensor, pad_symbol):
    if not isinstance(tensor[0], (list,tuple)):
        return tensor

    
    pad_len = max(len(sub_tensor) for sub_tensor in tensor)
    tensor = type(tensor)([ type(sub_tensor)(pad_tensor(
                                [element for element in sub_tensor]
                                +[[pad_symbol]*pad_len]*(pad_len-len(sub_tensor)), pad_symbol))
                            for sub_tensor in tensor])
    return tensor

def sentences_to_chargrams(sentences, vocab):
    all_xs, all_ws = [], []
    for sentence in sentences:
        xs, ws = [], []
        for token in sentence:
            grams = chargram(token)
            x = [ vocab.idx(cgram) for cgram in grams ]
            w = [ 1 for cgram in grams ]
            xs.append(x)
            ws.append(w)
        all_xs.append(xs)
        all_ws.append(ws)
    return all_xs, all_ws

def pad_chargrams(sentences, ws, pad):
    max_sent_len = max(len(sent) for sent in sentences)
    max_word_len = max(len(word) for sent in sentences for word in sent)
    padded_sentences, padded_ws = [], []
    for sentence, weights in zip(sentences, ws):
        padded_sentence = []
        padded_weights = []
        for token, weight in zip(sentence, weights):
            token += [pad]*(max_word_len-len(token))
            weight += [0]*(max_word_len-len(weight))
            padded_sentence.append(token)
            padded_weights.append(weight)
        for _ in range(max_sent_len - len(sentence)):
            padded_sentence.append([pad]*max_word_len)
            padded_weights.append([0]*max_word_len)
        padded_sentences.append(padded_sentence)
        padded_ws.append(padded_weights)
    return padded_sentences, padded_ws, max_sent_len, max_word_len

def sentence_to_index(sentence, vocab):
    return [ vocab.idx(token) for token in sentence ]

def sentences_to_index(sentences, vocab):
    return [ sentence_to_index(sentence, vocab) for sentence in sentences ]

def pad_sentence(sentence, pad, pad_len):
    return sentence + [pad]*(pad_len-len(sentence))

def pad_sentences(sentences, pad):
    pad_len = max(len(sentence) for sentence in sentences)
    return [ pad_sentence(sentence, pad, pad_len) for sentence in sentences ]

# Load actual data

In [18]:
all_xs = xs + dev_xs
all_ys = ys + dev_ys

all_xs, all_ws = sentences_to_chargrams(all_xs, xvocab)
all_xs, all_ws, max_time, max_chargrams = pad_chargrams(all_xs, all_ws, xvocab.ipad)
all_ys = sentences_to_index(all_ys, yvocab)
all_ys = pad_sentences(all_ys, yvocab.ipad)

train_xs, test_xs = all_xs[:len(xs)], all_xs[len(xs):]
train_ws, test_ws = all_ws[:len(xs)], all_ws[len(xs):]
train_ys, test_ys = all_ys[:len(xs)], all_ys[len(xs):]

print max_time, max_chargrams
print len(train_xs), len(test_xs)

41 123
2394 1000


In [56]:
import tensorflow as tf
import random
import numpy as np

In [65]:
class EntitySegmenter():
    def __init__(self, session, **hyperparams):
        print "Model Loading...",
        self.hyperparams = hyperparams
        for var, val in hyperparams.items():
            setattr(self, var, val)
        
        self.manipulate_params()
        self.build_forward()
        self.build_loss()
        self.build_optimizer()
        
        self.session = session
        self.session.run(tf.initialize_all_variables())
        print "Done"
        
    def manipulate_params(self):
        pass
    
    def build_forward(self):
        # inputs
        self.x_input = tf.placeholder(tf.int32, [None, self.max_time, self.max_chargrams])
        self.x_weight = tf.placeholder(tf.float32, [None, self.max_time, self.max_chargrams])
        self.y_input = tf.placeholder(tf.int32, [None, self.max_time])
        self.dropout_keep = tf.placeholder(tf.float32)
        
        # embed and take weighted sum of character grams as word embedding
        self.chargram_vectors = tf.Variable(tf.random_uniform([self.xvocab.n, self.char_embed_size], 
                                                             -.1, .1, tf.float32))
        self.embedded_chargrams = tf.nn.embedding_lookup(self.chargram_vectors, 
                                                         self.x_input)
        tile_weights = tf.tile(tf.expand_dims(self.x_weight, [3]), [1,1,1,self.char_embed_size])
        self.embedded_words = tf.reduce_sum(tile_weights * self.embedded_chargrams, [2])
        
    
        # mlp
        self.W = tf.Variable(tf.random_uniform([self.char_embed_size, self.yvocab.n], 
                                              -.1, .1, tf.float32))
        self.b = tf.Variable(tf.zeros([self.yvocab.n]))
        
        logits = tf.matmul(tf.reshape(self.embedded_words, [-1, self.char_embed_size]), self.W) + self.b
        self.logits = tf.reshape(logits, [-1, self.max_time, self.yvocab.n])
        self.labels = tf.argmax(self.logits, 2)
        
    def build_loss(self):
        word_weights = tf.minimum(tf.reduce_sum(self.x_weight, [2]), 1.)
        # convert logits to lists
        logits_list = [tf.squeeze(t, [1]) for t in tf.split(1, self.max_time, self.logits)]
        targets_list = [tf.squeeze(t, [1]) for t in tf.split(1, self.max_time, self.y_input)]
        weights_list = [tf.squeeze(t, [1]) for t in tf.split(1, self.max_time, word_weights)]
        seq_loss = tf.nn.seq2seq.sequence_loss_by_example(logits_list,
                                                          targets_list,
                                                          weights_list)
        self.loss = tf.reduce_mean(seq_loss)
    
    def build_optimizer(self):
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)
        
    def batch_generator(self, x, y, batch_size=64):
        # shuffle the data
        xy = zip(x,y)
        random.shuffle(xy)
        x, y = zip(*xy)
        
        # create minibatches
        num_batches = len(y) // batch_size + 1
        for batch_i in range(num_batches):
            start, end = batch_i*batch_size, (batch_i+1)*batch_size
            yield batch_i, x[start:end], y[start:end]
    
    def print_progress(self, epoch_i, n_epoch, batch_i, n_batch, loss):
        if batch_i == 0:
            print
        print "\r Epoch {0}/{1} : Batch {2}/{3}: Loss {4:.4f}".format(
              epoch_i+1, n_epoch, batch_i+1, n_batch, loss),
        
    def fit(self, x, y, n_epoch=1, batch_size=64, **partial_fit_kwargs):
        n_batch = len(y) // batch_size + 1
        for epoch_i in range(n_epoch):
            for batch_i, batch_x, batch_y in self.batch_generator(x, y, batch_size): 
                loss = self.partial_fit(batch_x, batch_y)
                self.print_progress(epoch_i, n_epoch, batch_i, n_batch, loss)
                
    def partial_fit(self, x, y, measure_only=False):
        x_input, x_weight = zip(*x)
        feed_dict = {
            self.x_input:x_input,
            self.x_weight:x_weight,
            self.y_input:y,
            self.dropout_keep:1.-self.dropout
        }
        
        if measure_only:
            loss = self.session.run(self.loss, feed_dict)
        else:
            loss, _ = self.session.run([self.loss, self.train_op], feed_dict)
        return loss
    
    def predict(self, x):
        x_input, x_weight = zip(*x)
        feed_dict = {
            self.x_input:x_input,
            self.x_weight:x_weight,
            self.dropout_keep:1.
        }
        predictions = self.session.run(self.labels, feed_dict)
        return predictions
    
    def save(self):
        pass
    
    def load(self):
        pass

In [66]:
tf.reset_default_graph()
session = tf.InteractiveSession()

params = {
    'learning_rate':.001,
    'xvocab':xvocab,
    'yvocab':yvocab,
    'max_time':max_time,
    'max_chargrams':max_chargrams,
    'char_embed_size':50,
    'dropout':.0
}
ner = EntitySegmenter(session, **params)

Model Loading... Done


Exception AssertionError: AssertionError("Nesting violated for default stack of <type 'weakref'> objects",) in <bound method InteractiveSession.__del__ of <tensorflow.python.client.session.InteractiveSession object at 0x133d59090>> ignored


In [67]:
ner.fit(zip(train_xs, train_ws), train_ys, batch_size=128, n_epoch=5)


 Epoch 1/5 : Batch 19/19: Loss 8.7038
 Epoch 2/5 : Batch 19/19: Loss 4.1111
 Epoch 3/5 : Batch 19/19: Loss 1.2432
 Epoch 4/5 : Batch 19/19: Loss 0.6888
 Epoch 5/5 : Batch 19/19: Loss 0.5211


In [68]:
preds = ner.predict(zip(test_xs, test_ws))

In [69]:
preds = preds.tolist()

In [70]:
def depad_sequence(sequence, pad):
    if isinstance(pad, list):
        return [ s for s, p in zip(sequence, pad) if p != 0 ]
    else:
        return [ s for s in sequence if s !=pad ]

def depad_sequences(sequences, pad):
    if isinstance(pad, list):
        return [ depad_sequence(sequence, p) for sequence, p in zip(sequences, pad) ]
    else:
        return [ depad_sequence(sequence, pad) for sequence in sequences ]

target_ys = depad_sequences(test_ys, yvocab.ipad)
target_ws = [ [ min(sum(w),1) for w in ws ] for ws in test_ws ]
predictions = depad_sequences(preds, target_ws)

In [71]:
def macro_f1(flat_targets, flat_preds):
    scores = {}
    for target, pred in zip(flat_targets, flat_preds):
        if target not in scores:
            scores[target] = {'tp':0,
                              'fp':0,
                              'fn':0}
        if pred not in scores:
            scores[pred] = {'tp':0,
                            'fp':0,
                            'fn':0}
        if target == pred:
            scores[pred]['tp'] += 1
        
        else:
            scores[pred]['fp'] += 1
            scores[target]['fn'] += 1
        stats = {'scores':{}}
        for target, score in scores.items():
            precision = score['tp'] / float(score['tp'] + score['fp'] +1e-15)
            recall = score['tp'] / float(score['tp'] + score['fn'] + 1e-15)
            f1 = 2*precision*recall / (precision + recall + 1e-15)
            support = score['tp'] + score['fn']
            stats['scores'][target] = {'precision':precision,
                             'recall':recall,
                             'f1':f1,
                             'support':support}
        stats['macro_precision'] = np.mean([s['precision'] for s in stats['scores'].values()])
        stats['macro_recall'] = np.mean([s['recall'] for s in stats['scores'].values()]) 
        stats['macro_f1'] = np.mean([s['f1'] for s in stats['scores'].values()])
    return stats

In [72]:
flat_targets = [ t for y in target_ys for t in y ]
flat_predictions = [ p for y in predictions for p in y ]
stats = macro_f1(flat_targets, flat_predictions)

In [73]:
print stats['macro_f1']

0.321356522478


In [74]:
print stats

{'macro_recall': 0.33333333333333331, 'macro_f1': 0.32135652247775565, 'macro_precision': 0.31021052415800587, 'scores': {1: {'recall': 1.0, 'support': 15133, 'precision': 0.9306315724740176, 'f1': 0.964069567433267}, 2: {'recall': 0.0, 'support': 661, 'precision': 0.0, 'f1': 0.0}, 3: {'recall': 0.0, 'support': 467, 'precision': 0.0, 'f1': 0.0}}}
