## Baseline Neural Network Model

This model represents each word in the vocabulary with a learned embedding; the embeddings are then passed through a BiLSTM to predict the NER labels.

In [1]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
# defines dicts to convert words and tags into indices;
# a key that has not been seen before is assigned the dict's current size as its index
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
# first lookup on the empty dict, so the unknown-word token "<unk>" gets index 0
UNK = w2i["<unk>"]

In [3]:
# maps a word index back to the word (reverse lookup in w2i)
def i2w(index):
    """Return the first word in ``w2i`` whose index equals ``index``.

    Raises ValueError if no word carries that index.
    """
    words = list(w2i)
    indices = list(w2i.values())
    return words[indices.index(index)]

In [4]:
# maps a tag index back to the tag (reverse lookup in t2i)
def i2t(index):
    """Return the first tag in ``t2i`` whose index equals ``index``.

    Raises ValueError if no tag carries that index.
    """
    tags = list(t2i)
    indices = list(t2i.values())
    return tags[indices.index(index)]

In [5]:
def read_dataset(filename):
    '''
    Reads a file with one "word<TAB>tag" pair per line, where blank lines
    separate sentences. Words and tags are converted to indices through the
    module-level w2i and t2i dicts.
    Raises ValueError if a non-blank line does not split into exactly two
    tab-separated fields.
    @return list of sentences, each a list of (word index, tag index)
    '''
    data_list = []
    sent_list = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                word, tag = line.split("\t")
                sent_list.append((w2i[word], t2i[tag]))
            elif sent_list:
                # blank line closes the current sentence
                data_list.append(sent_list)
                sent_list = []
    # keep the final sentence when the file does not end with a blank line
    if sent_list:
        data_list.append(sent_list)
    return data_list

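# Example of one sentence as returned by read_dataset, i.e. a list of (word index, tag index) pairs:
# [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 1), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0)]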

In [6]:
# load the training and dev splits (this also fills w2i and t2i)
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
# the dev sentences are appended to the training data, so the model is trained on both
train = train + dev

In [7]:
# freezes the w2i dict: all existing entries are copied, and from now on a word
# that is not already a key maps to UNK instead of receiving a fresh index
w2i = defaultdict(lambda: UNK, w2i)

In [8]:
# vocabulary size, taken as the largest assigned word index + 1
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
# number of distinct tags registered in t2i so far
ntags = len(t2i)

In [9]:
# load the test split; w2i is frozen by now, so words not seen before map to UNK
test = read_dataset("wnut17/data/test")

In [10]:
# Start DyNet and define trainer
# `model` is the container the parameters below are added to;
# the Adam trainer updates the parameters registered in it
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [11]:
# Define the model
EMB_SIZE = 64  # dimension of each word embedding
HID_SIZE = 64  # hidden size of each LSTM direction
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings, one row per word index
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
# output layer: its input is the concatenation of the fwd and bwd states, hence 2 * HID_SIZE
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [12]:
def build_tagging_graph(sent):
    '''
    Builds a fresh comp graph for one sentence with:
    * Embeddings
    * BiLSTM
    * linear layer + negative log softmax against the gold tag
    @param sent list of (word index, tag index)
    @return one expression: the sum of the per-token losses
    '''
    dy.renew_cg()
    fwd_state, bwd_state = (builder.initial_state() for builder in lstm_builders)
    embeddings = [dy.lookup(W_emb, w) for w, _ in sent]

    # run the sentence left-to-right and right-to-left
    fwd_out = [state.output() for state in fwd_state.add_inputs(embeddings)]
    bwd_out = [state.output() for state in bwd_state.add_inputs(reversed(embeddings))]

    weights = dy.parameter(W_sm)
    bias = dy.parameter(b_sm)

    # bwd_out is in reversed token order, so flip it back to line up with sent
    losses = [
        dy.pickneglogsoftmax(weights * dy.concatenate([f, b]) + bias, gold)
        for (_, gold), f, b in zip(sent, fwd_out, reversed(bwd_out))
    ]
    return dy.esum(losses)

In [13]:
def tag_sent(sent):
    '''
    Tags one sentence with the current model using:
    * Embeddings
    * BiLSTM
    * linear layer, then argmax over the tag scores
    The gold tags contained in sent are not used.
    @param sent list of (word index, tag index)
    @return list of (word index, predicted tag index)
    '''
    dy.renew_cg()
    fwd_state, bwd_state = (builder.initial_state() for builder in lstm_builders)
    embeddings = [dy.lookup(W_emb, w) for w, _ in sent]

    # run the sentence left-to-right and right-to-left
    fwd_out = [state.output() for state in fwd_state.add_inputs(embeddings)]
    bwd_out = [state.output() for state in bwd_state.add_inputs(reversed(embeddings))]

    weights = dy.parameter(W_sm)
    bias = dy.parameter(b_sm)

    # bwd_out is in reversed token order, so flip it back to line up with sent
    return [
        (w, np.argmax((weights * dy.concatenate([f, b]) + bias).npvalue()))
        for (w, _), f, b in zip(sent, fwd_out, reversed(bwd_out))
    ]

In [14]:
N_ITERS = 50  # number of passes over the training data
for ITER in range(N_ITERS):
    # predictions are only written to disk after the final pass
    is_last_iter = ITER == N_ITERS - 1

    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))

    # Evaluate on the test data: mean over sentences of the per-sentence token accuracy
    total_acc = 0.0
    out_lines = []
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for (word, predicted), gold in zip(p_labels, g_labels):
            if predicted == gold:
                test_correct += 1
            if is_last_iter:
                # one "word gold predicted" line per token
                out_lines.append(i2w(word) + ' ' + i2t(gold) + ' ' + i2t(predicted) + '\n')
        if is_last_iter:
            # a blank line after the last token separates sentences
            out_lines.append('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))

    if is_last_iter:
        # the context manager guarantees the file is flushed and closed
        with open('predicted1.txt', 'w', encoding='utf-8') as out:
            out.writelines(out_lines)

iter 0: train loss/sent=7.5156, time=8.37s
iter 0: test acc=0.9074
iter 1: train loss/sent=5.2781, time=8.24s
iter 1: test acc=0.9098
iter 2: train loss/sent=4.2173, time=8.49s
iter 2: test acc=0.9097
iter 3: train loss/sent=3.4142, time=8.47s
iter 3: test acc=0.9040
iter 4: train loss/sent=2.7021, time=8.30s
iter 4: test acc=0.8822
iter 5: train loss/sent=2.1130, time=8.32s
iter 5: test acc=0.8814
iter 6: train loss/sent=1.5663, time=8.26s
iter 6: test acc=0.8674
iter 7: train loss/sent=1.1498, time=8.29s
iter 7: test acc=0.8549
iter 8: train loss/sent=0.7723, time=8.28s
iter 8: test acc=0.8688
iter 9: train loss/sent=0.4873, time=8.33s
iter 9: test acc=0.8754
iter 10: train loss/sent=0.3345, time=8.36s
iter 10: test acc=0.8574
iter 11: train loss/sent=0.2257, time=8.27s
iter 11: test acc=0.8764
iter 12: train loss/sent=0.1604, time=8.31s
iter 12: test acc=0.8803
iter 13: train loss/sent=0.1199, time=8.35s
iter 13: test acc=0.8729
iter 14: train loss/sent=0.1060, time=8.38s
iter 14: t

### Glove Embeddings

This model uses the 200-dimensional pre-trained GloVe embeddings to represent the words, which are then passed through a BiLSTM to predict the NER labels.

In [None]:
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [None]:
# dicts to convert words and tags into indices;
# a key that has not been seen before is assigned the dict's current size as its index
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
# first lookup on the empty dict, so the token "unk" gets index 0
UNK = w2i["unk"]

In [None]:
# maps a word index back to the word (reverse lookup in w2i)
def i2w(index):
    """Return the first word in ``w2i`` whose index equals ``index``.

    Raises ValueError if no word carries that index.
    """
    words = list(w2i)
    indices = list(w2i.values())
    return words[indices.index(index)]

In [None]:
# maps a tag index back to the tag (reverse lookup in t2i)
def i2t(index):
    """Return the first tag in ``t2i`` whose index equals ``index``.

    Raises ValueError if no tag carries that index.
    """
    tags = list(t2i)
    indices = list(t2i.values())
    return tags[indices.index(index)]

In [None]:
def read_dataset(filename):
    '''
    Reads a file with one "word<TAB>tag" pair per line, where blank lines
    separate sentences. Tags are converted to indices through the module-level
    t2i dict; words are kept as the raw strings read from the file.
    Raises ValueError if a non-blank line does not split into exactly two
    tab-separated fields.
    @return list of sentences, each a list of (word, tag index)
    '''
    data_list = []
    sent_list = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                word, tag = line.split("\t")
                sent_list.append((word, t2i[tag]))
            elif sent_list:
                # blank line closes the current sentence
                data_list.append(sent_list)
                sent_list = []
    # keep the final sentence when the file does not end with a blank line
    if sent_list:
        data_list.append(sent_list)
    return data_list

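# Example sentence in (word index, tag index) form. NOTE: the read_dataset above keeps each word as a string, so its output holds (word, tag index) pairs rather than the integer pairs shown here.
# [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 1), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0)]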

In [None]:
# load the dev and training splits; here they are kept as two separate lists
dev = read_dataset("wnut17/data/dev")
train = read_dataset("wnut17/data/train")

In [None]:
# inspect the tag -> index mapping built while reading the data
print(t2i)

In [None]:
# freezes the w2i dict: existing entries are copied, and a word that is not
# already a key now maps to UNK instead of receiving a fresh index
w2i = defaultdict(lambda: UNK, w2i)

In [None]:
# load the test split
test = read_dataset("wnut17/data/test")

In [None]:
# number of entries in the word and tag dicts
# NOTE(review): the read_dataset defined in this section never looks words up in w2i,
# so w2i may contain only "unk" at this point; confirm nwords is the intended vocabulary size
nwords = len(w2i)
ntags = len(t2i)

In [None]:
%%time
# load the pre-trained Glove embeddings
# vocab maps word -> row index, and vectors[row index] is that word's embedding.
GLOVE_DIM = 200
vocab = defaultdict(lambda: len(vocab))
# Reserve row 0 for unknown words so that vocab indices and vectors rows stay
# aligned (previously the first word also got index 0 and every lookup was off by one).
vocab["<unk>"]
vectors = [[0.0] * GLOVE_DIM]  # 200 zeros or random?
with open("glove.twitter.27B/glove.twitter.27B.200d.txt", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            print(i)
        fields = line.strip().split(" ")
        if i == 0 and len(fields) == 2:
            # word2vec-style "<count> <dim>" header; a file without a header
            # starts directly with an embedding, which must not be discarded
            continue
        word = fields[0]
        if len(fields) != GLOVE_DIM + 1 or word in vocab:
            # skip malformed rows and repeated words: either would break the
            # vocab/vectors alignment or the rectangular shape of the matrix
            continue
        vocab[word]
        vectors.append(list(map(float, fields[1:])))
# freeze the vocabulary: words without a pre-trained vector map to row 0
# instead of being assigned an index beyond the end of vectors
vocab = defaultdict(lambda: 0, vocab)

In [None]:
# inspect the first stored vector (the all-zeros row appended before reading the file)
print(vectors[0])

In [None]:
# create a DyNet model with a lookup table of shape (number of vectors, vector dimension)
model = dy.Model()
lookup = model.add_lookup_parameters((len(vectors), len(vectors[0])))
# fill the lookup table with the pre-trained vectors
lookup.init_from_array(np.array(vectors))
# sanity check: print the embedding stored at the index vocab assigns to "hello"
print(lookup[vocab["hello"]].value())

In [None]:
# Start DyNet and define trainer
# NOTE(review): this rebinds `model` to a new, empty dy.Model(), so the `lookup`
# parameters added to the previous `model` object are not part of the model this
# trainer is built on; confirm that is intended
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [None]:
def build_tagging_graph(sent):
    '''
    Builds a fresh comp graph for one sentence with:
    * Embeddings
    * BiLSTM
    * linear layer + negative log softmax against the gold tag
    NOTE(review): W_emb, lstm_builders, W_sm and b_sm are not defined in the
    visible part of this section; presumably reused from earlier cells, confirm.
    @param sent list of (word, tag index)
    @return one expression: the sum of the per-token losses
    '''
    dy.renew_cg()
    fwd_state, bwd_state = (builder.initial_state() for builder in lstm_builders)
    embeddings = [dy.lookup(W_emb, w) for w, _ in sent]

    # run the sentence left-to-right and right-to-left
    fwd_out = [state.output() for state in fwd_state.add_inputs(embeddings)]
    bwd_out = [state.output() for state in bwd_state.add_inputs(reversed(embeddings))]

    weights = dy.parameter(W_sm)
    bias = dy.parameter(b_sm)

    # bwd_out is in reversed token order, so flip it back to line up with sent
    losses = [
        dy.pickneglogsoftmax(weights * dy.concatenate([f, b]) + bias, gold)
        for (_, gold), f, b in zip(sent, fwd_out, reversed(bwd_out))
    ]
    return dy.esum(losses)

In [None]:
def tag_sent(sent):
    '''
    Tags one sentence with the current model using:
    * Embeddings (looked up with update=False)
    * BiLSTM
    * linear layer, then argmax over the tag scores
    The gold tags contained in sent are not used.
    @param sent list of (word, tag index)
    @return list of predicted tag indices, one per token
    '''
    dy.renew_cg()
    fwd_state, bwd_state = (builder.initial_state() for builder in lstm_builders)
    embeddings = [dy.lookup(W_emb, w, update=False) for w, _ in sent]

    # run the sentence left-to-right and right-to-left
    fwd_out = [state.output() for state in fwd_state.add_inputs(embeddings)]
    bwd_out = [state.output() for state in bwd_state.add_inputs(reversed(embeddings))]

    weights = dy.parameter(W_sm)
    bias = dy.parameter(b_sm)

    # bwd_out is in reversed token order, so flip it back to line up with sent
    return [
        np.argmax((weights * dy.concatenate([f, b]) + bias).npvalue())
        for _, f, b in zip(sent, fwd_out, reversed(bwd_out))
    ]

In [None]:
for ITER in range(50):
    # Perform training: one update per sentence, in a new random order each pass
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))

    # Evaluate on the dev data: mean over sentences of the per-sentence token accuracy
    total_acc = 0.0
    for sent in dev:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        dev_correct = sum(1 for predicted, gold in zip(p_labels, g_labels) if predicted == gold)
        total_acc += dev_correct / len(g_labels)
    # the score is computed on dev, so it is reported as dev accuracy (was labelled "test acc")
    print("iter %r: dev acc=%.4f" % (ITER, total_acc / len(dev)))