# Twitter Neural Network Experiments

The following notebook represents the various experiments we conducted in order to see what parameters and modifications worked best. Some of the experiments below may not be reported in the paper due to the space constraint and/or due to its low performance.

All of the models below are structured roughly the same where embeddings are passed through the bi-LSTM and the output of the bi-LSMT is passed to the softmax layer. Thus each model is referred to by its embedding type; a short description is written under each model for clarity. Most of the models below are also the 10-label variant of the challenge unless indicated.

## Word embeddings (trained from training set alone)

The system below is the most basic system that utilizes just word embeddings initialized and trained solely on the training data (e.g. not pre-trained embeddings).

In [1]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [3]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [4]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [5]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [6]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [7]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [8]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [9]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [10]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [11]:
# Define the model
EMB_SIZE = 64
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [12]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [13]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [14]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-1.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=7.4537, time=8.34s
iter 0: test acc=0.9074
iter 1: train loss/sent=5.3288, time=7.71s
iter 1: test acc=0.9085
iter 2: train loss/sent=4.2602, time=8.04s
iter 2: test acc=0.9113
iter 3: train loss/sent=3.4010, time=8.43s
iter 3: test acc=0.9111
iter 4: train loss/sent=2.6634, time=7.99s
iter 4: test acc=0.9122
iter 5: train loss/sent=2.0325, time=8.69s
iter 5: test acc=0.9126
iter 6: train loss/sent=1.4901, time=8.86s
iter 6: test acc=0.9123
iter 7: train loss/sent=1.0708, time=8.37s
iter 7: test acc=0.9130
iter 8: train loss/sent=0.7558, time=8.27s
iter 8: test acc=0.9131
iter 9: train loss/sent=0.4685, time=8.61s
iter 9: test acc=0.9125
iter 10: train loss/sent=0.3234, time=8.53s
iter 10: test acc=0.9127
iter 11: train loss/sent=0.2138, time=8.52s
iter 11: test acc=0.9129
iter 12: train loss/sent=0.1567, time=8.28s
iter 12: test acc=0.9130
iter 13: train loss/sent=0.1143, time=8.58s
iter 13: test acc=0.9136
iter 14: train loss/sent=0.1108, time=8.30s
iter 14: t

In [67]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-1.txt > predicted/10labels/model-1-eval.txt

### 2 labels model

The following is the two labels (NER vs. No NER) version of the above system. 

In [33]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [34]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [35]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [36]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [37]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [38]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train_notypes")
dev = read_dataset("wnut17/data/dev_notypes")
train = train + dev

In [39]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [40]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [41]:
# reads in the test data;
test = read_dataset("wnut17/data/test_notypes")

In [42]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [43]:
# Define the model
EMB_SIZE = 64
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [44]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [45]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [46]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/2labels/model-1.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=4.5196, time=9.37s
iter 0: test acc=0.9084
iter 1: train loss/sent=2.9940, time=8.65s
iter 1: test acc=0.9136
iter 2: train loss/sent=2.2171, time=8.09s
iter 2: test acc=0.9187
iter 3: train loss/sent=1.5927, time=7.80s
iter 3: test acc=0.9162
iter 4: train loss/sent=1.0688, time=7.77s
iter 4: test acc=0.9167
iter 5: train loss/sent=0.6863, time=7.91s
iter 5: test acc=0.9117
iter 6: train loss/sent=0.4192, time=8.08s
iter 6: test acc=0.9127
iter 7: train loss/sent=0.2398, time=7.74s
iter 7: test acc=0.9139
iter 8: train loss/sent=0.1514, time=7.77s
iter 8: test acc=0.9084
iter 9: train loss/sent=0.1105, time=7.61s
iter 9: test acc=0.9107
iter 10: train loss/sent=0.0718, time=7.61s
iter 10: test acc=0.9151
iter 11: train loss/sent=0.0741, time=8.16s
iter 11: test acc=0.9099
iter 12: train loss/sent=0.0668, time=9.51s
iter 12: test acc=0.9122
iter 13: train loss/sent=0.0627, time=8.36s
iter 13: test acc=0.9144
iter 14: train loss/sent=0.0720, time=8.75s
iter 14: t

In [69]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/2labels/model-1.txt > predicted/2labels/model-1-eval.txt

### Glove Embeddings

This model uses the 50 dimension pre-trained Glove embeddings to represent the words which are then passed through the BiLSTM to predict the NER labels.

In [44]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [45]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [46]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [47]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [48]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [49]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [50]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [51]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [52]:
%%time
# load the pre-trained Glove embeddings
embeddings = {}
with open("glove.twitter.27B/glove.twitter.27B.200d.txt") as f:
    for line in f:
        split = line.split()
        word = split[0]
        vec = split[1:]
        embeddings[word] = vec
    embedding_dim = len(embeddings[list(embeddings.keys())[0]])
    out = np.random.uniform(-0.8, 0.8, (nwords, embedding_dim))
    for word, embed in embeddings.items():
        embed_np = np.array(embed)
        if word in w2i.keys():
            out[w2i[word]] = embed_np

CPU times: user 1min 37s, sys: 1min 33s, total: 3min 10s
Wall time: 3min 21s


In [53]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [54]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [55]:
# Define the model
EMB_SIZE = len(out[0])
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
W_emb.init_from_array(out)
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [60]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [61]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [62]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-2.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=0.0332, time=13.32s
iter 0: test acc=0.8939
iter 1: train loss/sent=0.0278, time=12.99s
iter 1: test acc=0.9003
iter 2: train loss/sent=0.0377, time=12.98s
iter 2: test acc=0.8917
iter 3: train loss/sent=0.0283, time=13.00s
iter 3: test acc=0.8963
iter 4: train loss/sent=0.0419, time=12.96s
iter 4: test acc=0.8949
iter 5: train loss/sent=0.0348, time=12.98s
iter 5: test acc=0.8970
iter 6: train loss/sent=0.0321, time=12.96s
iter 6: test acc=0.9037
iter 7: train loss/sent=0.0303, time=13.07s
iter 7: test acc=0.9046
iter 8: train loss/sent=0.0246, time=13.02s
iter 8: test acc=0.9041
iter 9: train loss/sent=0.0271, time=13.05s
iter 9: test acc=0.9078
iter 10: train loss/sent=0.0295, time=12.99s
iter 10: test acc=0.9077
iter 11: train loss/sent=0.0276, time=12.97s
iter 11: test acc=0.9073
iter 12: train loss/sent=0.0276, time=12.98s
iter 12: test acc=0.9088
iter 13: train loss/sent=0.0256, time=12.98s
iter 13: test acc=0.9107
iter 14: train loss/sent=0.0368, time=12

In [70]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-2.txt > predicted/10labels/model-2-eval.txt

## Character + Word Embeddings

This model uses a combination of character embeddings and word embeddings trained solely from the dataset (e.g. no pre-trained embeddings).

In [1]:
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
w2i = defaultdict(lambda: len(w2i))   # word: its position in the doc; all unknown words(UNK) are in the position 0
t2i = defaultdict(lambda: len(t2i))   # tag: its position {'b-sportsteam': 11, 'i-musicartist': 19...}
char2i = defaultdict(lambda: len(char2i))

In [3]:
UNK = w2i["<unk>"]
UNK_char = char2i["<unk_char>"]
pad_char = char2i["<*>"]

In [4]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [5]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [6]:
# function to read and preprocess data for training
# return: processed data
def read_dataset(filename):
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
                for char in word:
                    char2i[char]
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [7]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [8]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)
char2i = defaultdict(lambda: UNK_char, char2i)

In [9]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
nchar = max(char2i.values()) + 1
ntags = len(t2i)

In [10]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [11]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [12]:
# Define the model
# parameter sizes are not correct; input_dim = word_embedding_dim + 128?
CH_EMB_SIZE = 30
EMB_SIZE = 64
HID_SIZE = 64

chW_emb = model.add_lookup_parameters((nchar, CH_EMB_SIZE))  # char embeddings
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings

char_lstm_builders = [dy.LSTMBuilder(1, 30, EMB_SIZE, model), 
                 dy.LSTMBuilder(1, 30, EMB_SIZE, model)] # fwd and bwd LSTM

lstm_builders = [dy.LSTMBuilder(1, 128 + HID_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, 128 + HID_SIZE, HID_SIZE, model)] # fwd and bwd LSTM

W_sm = model.add_parameters((ntags, 128))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [None]:
# Define the model
# parameter sizes are not correct; input_dim = word_embedding_dim + 128?
CH_EMB_SIZE = 30
EMB_SIZE = 64
HID_SIZE = 64

chW_emb = model.add_lookup_parameters((nchar, CH_EMB_SIZE))  # char embeddings
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings

char_lstm_builders = [dy.LSTMBuilder(1, CH_EMB_SIZE, EMB_SIZE, model), 
                 dy.LSTMBuilder(1, CH_EMB_SIZE, EMB_SIZE, model)] # fwd and bwd LSTM

lstm_builders = [dy.LSTMBuilder(1, (2 * EMB_SIZE) + EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, (2 * EMB_SIZE) + EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM

W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [None]:
# W_EMB = 128
# HID = 128

In [13]:
def word_representation(word):
    fwd_init, bwd_init = [b.initial_state() for b in char_lstm_builders]
    word_embs = dy.lookup(W_emb, word)
    char_ids = [pad_char] + [char2i[c] for c in i2w(word)] + [pad_char]
    char_embs = [dy.lookup(chW_emb, cid) for cid in char_ids]
    
    fwd_embs = fwd_init.transduce(char_embs)
    bwd_embs = bwd_init.transduce(reversed(char_embs))
    complete_char_rep = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) 
    
    return dy.concatenate([word_embs, complete_char_rep])

In [14]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    word_embs = [word_representation(w) for w,t in sent]
    
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
 
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [15]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    
    word_embs = [word_representation(w) for w,t in sent]
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [16]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-3.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=6.2222, time=90.29s
iter 0: test acc=0.9064
iter 1: train loss/sent=4.4422, time=89.93s
iter 1: test acc=0.9113
iter 2: train loss/sent=3.6548, time=88.29s
iter 2: test acc=0.9121
iter 3: train loss/sent=3.0750, time=86.21s
iter 3: test acc=0.9119
iter 4: train loss/sent=2.5578, time=93.24s
iter 4: test acc=0.9127
iter 5: train loss/sent=2.0375, time=88.39s
iter 5: test acc=0.9119
iter 6: train loss/sent=1.5580, time=88.01s
iter 6: test acc=0.9048
iter 7: train loss/sent=1.1370, time=89.03s
iter 7: test acc=0.9111
iter 8: train loss/sent=0.8220, time=89.47s
iter 8: test acc=0.9027
iter 9: train loss/sent=0.5662, time=87.63s
iter 9: test acc=0.9128
iter 10: train loss/sent=0.3877, time=83.59s
iter 10: test acc=0.9103
iter 11: train loss/sent=0.2605, time=87.40s
iter 11: test acc=0.9122
iter 12: train loss/sent=0.1785, time=85.34s
iter 12: test acc=0.9102
iter 13: train loss/sent=0.1526, time=85.91s
iter 13: test acc=0.9061
iter 14: train loss/sent=0.1087, time=85

In [18]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-3.txt > predicted/10labels/model-3-eval.txt

## Character + GloVe Embeddings

The following model uses character embeddings trained from the dataset and the 50 dimensional pre-trained GloVe embeddings.

In [1]:
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
w2i = defaultdict(lambda: len(w2i))   # word: its position in the doc; all unknown words(UNK) are in the position 0
t2i = defaultdict(lambda: len(t2i))   # tag: its position {'b-sportsteam': 11, 'i-musicartist': 19...}
char2i = defaultdict(lambda: len(char2i))

In [3]:
UNK = w2i["<unk>"]
UNK_char = char2i["<unk_char>"]
pad_char = char2i["<*>"]

In [4]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [5]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [6]:
# function to read and preprocess data for training
# return: processed data
def read_dataset(filename):
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
                for char in word:
                    char2i[char]
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [7]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [8]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)
char2i = defaultdict(lambda: UNK_char, char2i)

In [9]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
nchar = max(char2i.values()) + 1
ntags = len(t2i)

In [10]:
%%time
# load the pre-trained Glove embeddings
embeddings = {}
with open("glove.twitter.27B/glove.twitter.27B.50d.txt") as f:
    for line in f:
        split = line.split()
        word = split[0]
        vec = split[1:]
        embeddings[word] = vec
    embedding_dim = len(embeddings[list(embeddings.keys())[0]])
    out = np.random.uniform(-0.8, 0.8, (nwords, embedding_dim))
    for word, embed in embeddings.items():
        embed_np = np.array(embed)
        if word in w2i.keys():
            out[w2i[word]] = embed_np

CPU times: user 23.3 s, sys: 2.57 s, total: 25.9 s
Wall time: 26.2 s


In [11]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [12]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [13]:
# Define the model
# parameter sizes are not correct; input_dim = word_embedding_dim + 128?
CH_EMB_SIZE = 30
EMB_SIZE = len(out[0])
HID_SIZE = 64

chW_emb = model.add_lookup_parameters((nchar, CH_EMB_SIZE))  # char embeddings
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
W_emb.init_from_array(out)

char_lstm_builders = [dy.LSTMBuilder(1, CH_EMB_SIZE, EMB_SIZE, model), 
                 dy.LSTMBuilder(1, CH_EMB_SIZE, EMB_SIZE, model)] # fwd and bwd LSTM

lstm_builders = [dy.LSTMBuilder(1, (2 * EMB_SIZE) + EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, (2 * EMB_SIZE) + EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM

W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [14]:
def word_representation(word):
    fwd_init, bwd_init = [b.initial_state() for b in char_lstm_builders]
    word_embs = dy.lookup(W_emb, word)
    char_ids = [pad_char] + [char2i[c] for c in i2w(word)] + [pad_char]
    char_embs = [dy.lookup(chW_emb, cid) for cid in char_ids]
    
    fwd_embs = fwd_init.transduce(char_embs)
    bwd_embs = bwd_init.transduce(reversed(char_embs))
    complete_char_rep = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) 
    
    return dy.concatenate([word_embs, complete_char_rep])

In [15]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    word_embs = [word_representation(w) for w,t in sent]
    
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
 
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [16]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    
    word_embs = [word_representation(w) for w,t in sent]
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [17]:
%%time
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-4.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=5.7647, time=87.53s
iter 0: test acc=0.9082
iter 1: train loss/sent=4.3639, time=88.14s
iter 1: test acc=0.9087
iter 2: train loss/sent=3.7422, time=88.66s
iter 2: test acc=0.9006
iter 3: train loss/sent=3.2336, time=86.89s
iter 3: test acc=0.9042
iter 4: train loss/sent=2.6910, time=83.75s
iter 4: test acc=0.9046
iter 5: train loss/sent=2.1805, time=84.21s
iter 5: test acc=0.8774
iter 6: train loss/sent=1.6962, time=83.79s
iter 6: test acc=0.8477
iter 7: train loss/sent=1.2387, time=84.02s
iter 7: test acc=0.8324
iter 8: train loss/sent=0.8904, time=83.74s
iter 8: test acc=0.8232
iter 9: train loss/sent=0.6249, time=83.88s
iter 9: test acc=0.8614
iter 10: train loss/sent=0.4012, time=83.89s
iter 10: test acc=0.7747
iter 11: train loss/sent=0.3070, time=81.96s
iter 11: test acc=0.8565
iter 12: train loss/sent=0.1929, time=81.70s
iter 12: test acc=0.8666
iter 13: train loss/sent=0.1584, time=82.31s
iter 13: test acc=0.8048
iter 14: train loss/sent=0.1459, time=82

In [18]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-4.txt > predicted/10labels/model-4-eval.txt

## Orthographic character embeddings + orthographic word embeddings + regular character embeddings + regular word embeddings

The following system utilizes a combination of the orthographic character embeddings, orthographic word embeddings, regular character embeddings and regular word embeddings.  

To give a concrete example, if we have 'Beyonce' as a word in a sentence, then the orthorgraphic representation would be 'Cccccc', and thus we would look up the embeddings of each individual character in the representation, as well as the embeddings of 'Cccccc' as a whole. Then like normal, we would look up the embeddings of each individual character in 'Beyonce' and 'Beyonce' itself (should it be in the look up dictionary and training set).

In [1]:
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
w2i = defaultdict(lambda: len(w2i))   # word: its position in the doc; all unknown words(UNK) are in the position 0
t2i = defaultdict(lambda: len(t2i))   # tag: its position {'b-sportsteam': 11, 'i-musicartist': 19...}

char2i = defaultdict(lambda: len(char2i))

ortho_char2i = defaultdict(lambda: len(ortho_char2i))
ortho_word2i = defaultdict(lambda: len(ortho_word2i))

In [3]:
UNK = w2i["<unk>"]

UNK_char = char2i["<unk_char>"]
pad_char = char2i["<*>"]

pad_ortho_char = ortho_char2i["<*>"]
UNK_ortho_word = ortho_word2i["<unk_ortho_word>"]

In [4]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

def i2ortho_char(index):
    """Takes a character index and returns a character."""
    return list(ortho_char2i.keys())[list(ortho_char2i.values()).index(index)]

def i2ortho_word(index):
    """Takes a character index and returns a character."""
    return list(ortho_word2i.keys())[list(ortho_word2i.values()).index(index)]

In [5]:
def get_ortho_crep(char):
    if char.isupper():
        return "C"
    if char.islower():
        return "c"
    if char.isdigit():
        return "n"
    else:
        return "p"

In [6]:
# function to read and preprocess data for training
# return: processed data [(1, 0, 0), (2, 1, 0), (3, 2, 0), (4, 2, 0),
def read_dataset(filename):
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        ortho_sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                #sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
                ortho_word = str()
                for char in word:
                    char2i[char]
                    ortho_char = get_ortho_crep(char)
                    ortho_word += ortho_char
                    ortho_char2i[ortho_char]
                sent_list.append((w2i[word], ortho_word2i[ortho_word], t2i[tag])) # (word index, orthographic word index, tag index)

            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list


In [7]:
train = read_dataset("train")
dev = read_dataset("dev")
train = train + dev

In [8]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)
char2i = defaultdict(lambda: UNK_char, char2i)
ortho_word2i = defaultdict(lambda: UNK_ortho_word, ortho_word2i)

In [9]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
nchar = max(char2i.values()) + 1

n_ortho_char = len(ortho_char2i)
n_ortho_words = max(ortho_word2i.values()) + 1

ntags = len(t2i)

In [10]:
print(nwords, nchar, n_ortho_char, n_ortho_words, ntags)

14879 94 5 2227 21


In [11]:
len(ortho_word2i)

2227

In [12]:
# reads in the test data;
test = read_dataset("test")

In [13]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [14]:
# Define the model
# parameter sizes are not correct; input_dim = word_embedding_dim + 128?
W_EMB_SIZE = 64
CH_EMB_SIZE = 30
ORTHO_CH_EMB_SIZE = 30
ORTHO_EMB_SIZE = 50
HID_SIZE = 64
#CHAR_LSTM_OUTPUT_SIZE = 200

W_emb = model.add_lookup_parameters((nwords, W_EMB_SIZE))  # Word embeddings
chW_emb = model.add_lookup_parameters((nchar, CH_EMB_SIZE))  # char embeddings

orthoW_emb = model.add_lookup_parameters((n_ortho_words, ORTHO_EMB_SIZE)) # ortho word embed
orthoChar_emb = model.add_lookup_parameters((n_ortho_char, CH_EMB_SIZE)) # ortho char embed

# Word-based biLSTM + WO_EMBED_SIZE + 4*CHAR_LSTM_OUTPUT_SIZE
lstm_builders = [dy.LSTMBuilder(1, W_EMB_SIZE + ORTHO_EMB_SIZE + 4*ORTHO_CH_EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, W_EMB_SIZE + ORTHO_EMB_SIZE + 4*ORTHO_CH_EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM

# Char-based biLSTM
char_lstm_builders = [dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model), 
                     dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model)] # fwd and bwd LSTM
    
# Ortho features biLSTM
ortho_lstm_builders = [dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model), 
                     dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model)] # fwd and bwd LSTM
    
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [15]:
def orthoword_representation(word):
    fwd_init, bwd_init = [b.initial_state() for b in ortho_lstm_builders]
    for b in ortho_lstm_builders:
        b.set_dropouts(0.5, 0.5)
        
    ortho_word_embs = dy.lookup(orthoW_emb, word)
    char_ids = [pad_ortho_char] + [ortho_char2i[c] for c in i2ortho_word(word)] + [pad_ortho_char]
    char_embs = [dy.lookup(orthoChar_emb, cid) for cid in char_ids] # orth ch embeddings
    
    fwd_embs = fwd_init.transduce(char_embs)
    bwd_embs = bwd_init.transduce(reversed(char_embs))
    #complete_char_rep = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) 
    
    #return dy.concatenate([ortho_word_embs, complete_char_rep])
    return ortho_word_embs, fwd_embs[-1], bwd_embs[-1]

In [16]:
def word_representation(word, ortho_word):
    fwd_init, bwd_init = [b.initial_state() for b in char_lstm_builders]
    for b in char_lstm_builders:
        b.set_dropouts(0.5, 0.5)    
    
    word_embs = dy.lookup(W_emb, word) # word embeddings
    char_ids = [pad_char] + [char2i[c] for c in i2w(word)] + [pad_char] # word characters
    char_embs = [dy.lookup(chW_emb, cid) for cid in char_ids] # word ch embeddings
    
    fwd_embs = fwd_init.transduce(char_embs)
    bwd_embs = bwd_init.transduce(reversed(char_embs))
    #complete_char_rep = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) 
    
    #get the ortho-char plus ortho-word rep
    orthow_embs, ortho_fwd_embs, ortho_bwd_embs = orthoword_representation(ortho_word)
    #orthow_embs, ortho_fwd_embs, ortho_bwd_embs = [orthoword_representation(ow) for w, ow, t in sent]

    return dy.concatenate([word_embs, fwd_embs[-1], bwd_embs[-1], orthow_embs, ortho_fwd_embs, ortho_bwd_embs])
    #return word_embs, fwd_embs[-1], bwd_embs[-1]

In [17]:
def build_tagging_graph(sent):
    dy.renew_cg()
    
    #char_plus_word_embs = [word_representation(w, ow) for w, ow, t in sent]
    #word_embs, ch_fwd_embs, ch_bwd_embs = [word_representation(w) for w, ow, t in sent]
    #orthochar_plus_orthoword_embs = [orthoword_representation(ow) for w, ow, t in sent]
    #orthow_embs, ortho_fwd_embs, ortho_bwd_embs = [orthoword_representation(ow) for w, ow, t in sent]
    word_plus_ortho_rep_emb = [word_representation(w, ow) for w, ow, t in sent]
    
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    for b in lstm_builders:
        b.set_dropouts(0.5, 0.5) 
    
    fwd_embs = fwd_init.transduce(word_plus_ortho_rep_emb)
    bwd_embs = bwd_init.transduce(reversed(word_plus_ortho_rep_emb))
 
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, oword, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [18]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    
    word_embs = [word_representation(w, ow) for w,ow,t in sent]
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, oword, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [19]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-5.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, oword, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=6.3065, time=145.71s
iter 0: test acc=0.9078
iter 1: train loss/sent=4.8337, time=143.21s
iter 1: test acc=0.9087
iter 2: train loss/sent=4.2075, time=145.96s
iter 2: test acc=0.9119
iter 3: train loss/sent=3.7988, time=144.18s
iter 3: test acc=0.9117
iter 4: train loss/sent=3.3812, time=144.50s
iter 4: test acc=0.9142
iter 5: train loss/sent=2.9973, time=148.52s
iter 5: test acc=0.9141
iter 6: train loss/sent=2.7645, time=148.46s
iter 6: test acc=0.9152
iter 7: train loss/sent=2.4300, time=145.52s
iter 7: test acc=0.9146
iter 8: train loss/sent=2.1645, time=148.27s
iter 8: test acc=0.9155
iter 9: train loss/sent=1.9521, time=138.29s
iter 9: test acc=0.9151
iter 10: train loss/sent=1.7231, time=141.85s
iter 10: test acc=0.9159
iter 11: train loss/sent=1.5715, time=138.68s
iter 11: test acc=0.9155
iter 12: train loss/sent=1.4081, time=143.40s
iter 12: test acc=0.9155
iter 13: train loss/sent=1.2548, time=144.46s
iter 13: test acc=0.9160
iter 14: train loss/sent=1

In [20]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-5.txt > predicted/10labels/model-5-eval.txt

processed 61908 tokens with 3621 phrases; found: 2163 phrases; correct: 657.
accuracy:  91.22%; precision:  30.37%; recall:  18.14%; FB1:  22.72
          company: precision:  55.95%; recall:  14.71%; FB1:  23.30  168
         facility: precision:  17.82%; recall:  13.53%; FB1:  15.38  202
          geo-loc: precision:  48.89%; recall:  34.72%; FB1:  40.61  630
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  45
      musicartist: precision:   8.33%; recall:   5.10%; FB1:   6.33  120
            other: precision:  23.57%; recall:  17.15%; FB1:  19.85  454
           person: precision:  29.83%; recall:  14.14%; FB1:  19.19  238
          product: precision:  13.91%; recall:   5.63%; FB1:   8.02  115
       sportsteam: precision:   9.43%; recall:   9.93%; FB1:   9.68  159
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  32


## Orthographic character embeddings + orthographic word embeddings + regular character embeddings + regular word embeddings + 50% dropout

The following system utilizes a combination of the orthographic character embeddings, orthographic word embeddings, regular character embeddings and regular word embeddings (like the previous system) + 50% dropout.

To give a concrete example, if we have 'Beyonce' as a word in a sentence, then the orthorgraphic representation would be 'Cccccc', and thus we would look up the embeddings of each individual character in the representation, as well as the embeddings of 'Cccccc' as a whole. Then like normal, we would look up the embeddings of each individual character in 'Beyonce' and 'Beyonce' itself (should it be in the look up dictionary and training set).

In [1]:
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
w2i = defaultdict(lambda: len(w2i))   # word: its position in the doc; all unknown words(UNK) are in the position 0
t2i = defaultdict(lambda: len(t2i))   # tag: its position {'b-sportsteam': 11, 'i-musicartist': 19...}

char2i = defaultdict(lambda: len(char2i))

ortho_char2i = defaultdict(lambda: len(ortho_char2i))
ortho_word2i = defaultdict(lambda: len(ortho_word2i))

In [3]:
UNK = w2i["<unk>"]

UNK_char = char2i["<unk_char>"]
pad_char = char2i["<*>"]

pad_ortho_char = ortho_char2i["<*>"]
UNK_ortho_word = ortho_word2i["<unk_ortho_word>"]

In [4]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

def i2ortho_char(index):
    """Takes a character index and returns a character."""
    return list(ortho_char2i.keys())[list(ortho_char2i.values()).index(index)]

def i2ortho_word(index):
    """Takes a character index and returns a character."""
    return list(ortho_word2i.keys())[list(ortho_word2i.values()).index(index)]

In [5]:
def get_ortho_crep(char):
    if char.isupper():
        return "C"
    if char.islower():
        return "c"
    if char.isdigit():
        return "n"
    else:
        return "p"

In [6]:
# function to read and preprocess data for training
# return: processed data [(1, 0, 0), (2, 1, 0), (3, 2, 0), (4, 2, 0),
def read_dataset(filename):
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        ortho_sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                #sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
                ortho_word = str()
                for char in word:
                    char2i[char]
                    ortho_char = get_ortho_crep(char)
                    ortho_word += ortho_char
                    ortho_char2i[ortho_char]
                sent_list.append((w2i[word], ortho_word2i[ortho_word], t2i[tag])) # (word index, orthographic word index, tag index)

            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list


In [7]:
train = read_dataset("train")
dev = read_dataset("dev")
train = train + dev

In [8]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)
char2i = defaultdict(lambda: UNK_char, char2i)
ortho_word2i = defaultdict(lambda: UNK_ortho_word, ortho_word2i)

In [9]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
nchar = max(char2i.values()) + 1

n_ortho_char = len(ortho_char2i)
n_ortho_words = max(ortho_word2i.values()) + 1

ntags = len(t2i)

In [10]:
print(nwords, nchar, n_ortho_char, n_ortho_words, ntags)

14879 94 5 2227 21


In [11]:
len(ortho_word2i)

2227

In [12]:
# reads in the test data;
test = read_dataset("test")

In [13]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [14]:
# Define the model
# parameter sizes are not correct; input_dim = word_embedding_dim + 128?
W_EMB_SIZE = 64
CH_EMB_SIZE = 30
ORTHO_CH_EMB_SIZE = 30
ORTHO_EMB_SIZE = 50
HID_SIZE = 64
#CHAR_LSTM_OUTPUT_SIZE = 200

W_emb = model.add_lookup_parameters((nwords, W_EMB_SIZE))  # Word embeddings
chW_emb = model.add_lookup_parameters((nchar, CH_EMB_SIZE))  # char embeddings

orthoW_emb = model.add_lookup_parameters((n_ortho_words, ORTHO_EMB_SIZE)) # ortho word embed
orthoChar_emb = model.add_lookup_parameters((n_ortho_char, CH_EMB_SIZE)) # ortho char embed

# Word-based biLSTM + WO_EMBED_SIZE + 4*CHAR_LSTM_OUTPUT_SIZE
lstm_builders = [dy.LSTMBuilder(1, W_EMB_SIZE + ORTHO_EMB_SIZE + 4*ORTHO_CH_EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, W_EMB_SIZE + ORTHO_EMB_SIZE + 4*ORTHO_CH_EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM

# Char-based biLSTM
char_lstm_builders = [dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model), 
                     dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model)] # fwd and bwd LSTM
    
# Ortho features biLSTM
ortho_lstm_builders = [dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model), 
                     dy.LSTMBuilder(1, CH_EMB_SIZE, ORTHO_CH_EMB_SIZE, model)] # fwd and bwd LSTM
    
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [15]:
def orthoword_representation(word):
    fwd_init, bwd_init = [b.initial_state() for b in ortho_lstm_builders]
    for b in ortho_lstm_builders:
        b.set_dropouts(0.5, 0.5)
        
    ortho_word_embs = dy.lookup(orthoW_emb, word)
    char_ids = [pad_ortho_char] + [ortho_char2i[c] for c in i2ortho_word(word)] + [pad_ortho_char]
    char_embs = [dy.lookup(orthoChar_emb, cid) for cid in char_ids] # orth ch embeddings
    
    fwd_embs = fwd_init.transduce(char_embs)
    bwd_embs = bwd_init.transduce(reversed(char_embs))
    #complete_char_rep = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) 
    
    #return dy.concatenate([ortho_word_embs, complete_char_rep])
    return ortho_word_embs, fwd_embs[-1], bwd_embs[-1]

In [16]:
def word_representation(word, ortho_word):
    fwd_init, bwd_init = [b.initial_state() for b in char_lstm_builders]
    for b in char_lstm_builders:
        b.set_dropouts(0.5, 0.5)    
    
    word_embs = dy.lookup(W_emb, word) # word embeddings
    char_ids = [pad_char] + [char2i[c] for c in i2w(word)] + [pad_char] # word characters
    char_embs = [dy.lookup(chW_emb, cid) for cid in char_ids] # word ch embeddings
    
    fwd_embs = fwd_init.transduce(char_embs)
    bwd_embs = bwd_init.transduce(reversed(char_embs))
    #complete_char_rep = dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) 
    
    #get the ortho-char plus ortho-word rep
    orthow_embs, ortho_fwd_embs, ortho_bwd_embs = orthoword_representation(ortho_word)
    #orthow_embs, ortho_fwd_embs, ortho_bwd_embs = [orthoword_representation(ow) for w, ow, t in sent]

    return dy.concatenate([word_embs, fwd_embs[-1], bwd_embs[-1], orthow_embs, ortho_fwd_embs, ortho_bwd_embs])
    #return word_embs, fwd_embs[-1], bwd_embs[-1]

In [17]:
def build_tagging_graph(sent):
    dy.renew_cg()
    
    #char_plus_word_embs = [word_representation(w, ow) for w, ow, t in sent]
    #word_embs, ch_fwd_embs, ch_bwd_embs = [word_representation(w) for w, ow, t in sent]
    #orthochar_plus_orthoword_embs = [orthoword_representation(ow) for w, ow, t in sent]
    #orthow_embs, ortho_fwd_embs, ortho_bwd_embs = [orthoword_representation(ow) for w, ow, t in sent]
    word_plus_ortho_rep_emb = [word_representation(w, ow) for w, ow, t in sent]
    
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    for b in lstm_builders:
        b.set_dropouts(0.5, 0.5) 
    
    fwd_embs = fwd_init.transduce(word_plus_ortho_rep_emb)
    bwd_embs = bwd_init.transduce(reversed(word_plus_ortho_rep_emb))
 
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, oword, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [18]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    
    word_embs = [word_representation(w, ow) for w,ow,t in sent]
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, oword, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [19]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-6.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, oword, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=6.3065, time=145.71s
iter 0: test acc=0.9078
iter 1: train loss/sent=4.8337, time=143.21s
iter 1: test acc=0.9087
iter 2: train loss/sent=4.2075, time=145.96s
iter 2: test acc=0.9119
iter 3: train loss/sent=3.7988, time=144.18s
iter 3: test acc=0.9117
iter 4: train loss/sent=3.3812, time=144.50s
iter 4: test acc=0.9142
iter 5: train loss/sent=2.9973, time=148.52s
iter 5: test acc=0.9141
iter 6: train loss/sent=2.7645, time=148.46s
iter 6: test acc=0.9152
iter 7: train loss/sent=2.4300, time=145.52s
iter 7: test acc=0.9146
iter 8: train loss/sent=2.1645, time=148.27s
iter 8: test acc=0.9155
iter 9: train loss/sent=1.9521, time=138.29s
iter 9: test acc=0.9151
iter 10: train loss/sent=1.7231, time=141.85s
iter 10: test acc=0.9159
iter 11: train loss/sent=1.5715, time=138.68s
iter 11: test acc=0.9155
iter 12: train loss/sent=1.4081, time=143.40s
iter 12: test acc=0.9155
iter 13: train loss/sent=1.2548, time=144.46s
iter 13: test acc=0.9160
iter 14: train loss/sent=1

In [20]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-6.txt > predicted/10labels/model-6-eval.txt

processed 61908 tokens with 3621 phrases; found: 2163 phrases; correct: 657.
accuracy:  91.22%; precision:  30.37%; recall:  18.14%; FB1:  22.72
          company: precision:  55.95%; recall:  14.71%; FB1:  23.30  168
         facility: precision:  17.82%; recall:  13.53%; FB1:  15.38  202
          geo-loc: precision:  48.89%; recall:  34.72%; FB1:  40.61  630
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  45
      musicartist: precision:   8.33%; recall:   5.10%; FB1:   6.33  120
            other: precision:  23.57%; recall:  17.15%; FB1:  19.85  454
           person: precision:  29.83%; recall:  14.14%; FB1:  19.19  238
          product: precision:  13.91%; recall:   5.63%; FB1:   8.02  115
       sportsteam: precision:   9.43%; recall:   9.93%; FB1:   9.68  159
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  32


## Word2Vec Embeddings

This model uses the 200 dimension pre-trained word2vec embeddings from Godin et al. 2015 to represent the words which are then passed through the BiLSTM to predict the NER labels.

This was not fully ran as the 200d embeddings ran too slowly per iteration.

In [1]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [3]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [4]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [5]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [6]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [7]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [8]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [9]:
from word2vec_twitter_model import word2vecReader
model = word2vecReader.Word2Vec().load_word2vec_format("word2vec_twitter_model/word2vec_twitter_model.bin", binary=True)

In [11]:
len(model.vocab)

3039345

In [81]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [54]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [55]:
# Define the model
EMB_SIZE = len(out[0])
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
W_emb.init_from_array(out)
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [60]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [61]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [None]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-X.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

In [70]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-X.txt > predicted/10labels/model-X-eval.txt