## Basic Neural Network Model

This model uses an embedding representation of the words in the vocabulary which is then passed through a BiLSTM to predict the NER labels.

### 10 labels model

In [1]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [2]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [3]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [4]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [5]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [6]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [7]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [8]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [9]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [10]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [11]:
# Define the model
EMB_SIZE = 64
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [12]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [13]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [14]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-1.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=7.4537, time=8.34s
iter 0: test acc=0.9074
iter 1: train loss/sent=5.3288, time=7.71s
iter 1: test acc=0.9085
iter 2: train loss/sent=4.2602, time=8.04s
iter 2: test acc=0.9113
iter 3: train loss/sent=3.4010, time=8.43s
iter 3: test acc=0.9111
iter 4: train loss/sent=2.6634, time=7.99s
iter 4: test acc=0.9122
iter 5: train loss/sent=2.0325, time=8.69s
iter 5: test acc=0.9126
iter 6: train loss/sent=1.4901, time=8.86s
iter 6: test acc=0.9123
iter 7: train loss/sent=1.0708, time=8.37s
iter 7: test acc=0.9130
iter 8: train loss/sent=0.7558, time=8.27s
iter 8: test acc=0.9131
iter 9: train loss/sent=0.4685, time=8.61s
iter 9: test acc=0.9125
iter 10: train loss/sent=0.3234, time=8.53s
iter 10: test acc=0.9127
iter 11: train loss/sent=0.2138, time=8.52s
iter 11: test acc=0.9129
iter 12: train loss/sent=0.1567, time=8.28s
iter 12: test acc=0.9130
iter 13: train loss/sent=0.1143, time=8.58s
iter 13: test acc=0.9136
iter 14: train loss/sent=0.1108, time=8.30s
iter 14: t

In [28]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-1.txt

processed 61908 tokens with 3621 phrases; found: 1471 phrases; correct: 497.
accuracy:  91.24%; precision:  33.79%; recall:  13.73%; FB1:  19.52
          company: precision:  53.66%; recall:  10.33%; FB1:  17.32  123
         facility: precision:   7.21%; recall:   3.01%; FB1:   4.24  111
          geo-loc: precision:  51.20%; recall:  28.86%; FB1:  36.91  500
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
      musicartist: precision:  30.30%; recall:   5.10%; FB1:   8.73  33
            other: precision:  23.99%; recall:  14.26%; FB1:  17.89  371
           person: precision:  21.98%; recall:  10.16%; FB1:  13.90  232
          product: precision:  25.81%; recall:   5.63%; FB1:   9.25  62
       sportsteam: precision:   3.23%; recall:   0.66%; FB1:   1.10  31
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  5


### 2 labels model

In [33]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [34]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [35]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [36]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [37]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [38]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train_notypes")
dev = read_dataset("wnut17/data/dev_notypes")
train = train + dev

In [39]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [40]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [41]:
# reads in the test data;
test = read_dataset("wnut17/data/test_notypes")

In [42]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [43]:
# Define the model
EMB_SIZE = 64
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [44]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [45]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [46]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/2labels/model-1.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=4.5196, time=9.37s
iter 0: test acc=0.9084
iter 1: train loss/sent=2.9940, time=8.65s
iter 1: test acc=0.9136
iter 2: train loss/sent=2.2171, time=8.09s
iter 2: test acc=0.9187
iter 3: train loss/sent=1.5927, time=7.80s
iter 3: test acc=0.9162
iter 4: train loss/sent=1.0688, time=7.77s
iter 4: test acc=0.9167
iter 5: train loss/sent=0.6863, time=7.91s
iter 5: test acc=0.9117
iter 6: train loss/sent=0.4192, time=8.08s
iter 6: test acc=0.9127
iter 7: train loss/sent=0.2398, time=7.74s
iter 7: test acc=0.9139
iter 8: train loss/sent=0.1514, time=7.77s
iter 8: test acc=0.9084
iter 9: train loss/sent=0.1105, time=7.61s
iter 9: test acc=0.9107
iter 10: train loss/sent=0.0718, time=7.61s
iter 10: test acc=0.9151
iter 11: train loss/sent=0.0741, time=8.16s
iter 11: test acc=0.9099
iter 12: train loss/sent=0.0668, time=9.51s
iter 12: test acc=0.9122
iter 13: train loss/sent=0.0627, time=8.36s
iter 13: test acc=0.9144
iter 14: train loss/sent=0.0720, time=8.75s
iter 14: t

In [47]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/2labels/model-1.txt

processed 61908 tokens with 3621 phrases; found: 2904 phrases; correct: 1021.
accuracy:  91.16%; precision:  35.16%; recall:  28.20%; FB1:  31.30
                 : precision:  35.16%; recall:  28.20%; FB1:  31.30  2904


### Glove Embeddings

This model uses the 200 dimension pre-trained Glove embeddings to represent the words which are then passed through the BiLSTM to predict the NER labels.

In [44]:
# import necessary packages
from collections import defaultdict
import time
import random
import dynet as dy
import numpy as np

In [45]:
# defines dicts to convert words and tags into indices
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [46]:
def i2w(index):
    """Takes a word index and returns word.""" 
    return list(w2i.keys())[list(w2i.values()).index(index)]

In [47]:
def i2t(index):
    """Takes a tag index and returns a tag."""
    return list(t2i.keys())[list(t2i.values()).index(index)]

In [48]:
def read_dataset(filename):
    """Reads a file from the WNUT17 dataset.

    Returns:
        A list containing each sentence from the dataset in a separate list.
        Each element inside the sentence list is a tuple containing the
        word index and tag index.
        
    For example:
    [[(1, 0), (2, 1), (3, 2), (4, 0)],[(2,1), (13,2), (14, 0), (15,0)]]
"""
    with open(filename, "r") as f:
        data_list = []
        sent_list = []
        for line in f:
            if len(line.strip()) != 0:
                word, tag = line.strip().split("\t")
                sent_list.append((w2i[word], t2i[tag])) # (word index, tag index)
            else:
                if len(sent_list) != 0:
                    data_list.append(sent_list)
                sent_list = []
        return data_list

In [49]:
# reads in the training and dev data; combines them both as train
train = read_dataset("wnut17/data/train")
dev = read_dataset("wnut17/data/dev")
train = train + dev

In [50]:
# freezes the w2i dict
w2i = defaultdict(lambda: UNK, w2i)

In [51]:
# number of words and number of tags
nwords = max(w2i.values()) + 1 # used to exclude extra UNK
ntags = len(t2i)

In [52]:
%%time
# load the pre-trained Glove embeddings
embeddings = {}
with open("glove.twitter.27B/glove.twitter.27B.200d.txt") as f:
    for line in f:
        split = line.split()
        word = split[0]
        vec = split[1:]
        embeddings[word] = vec
    embedding_dim = 200
    out = np.random.uniform(-0.8, 0.8, (nwords, embedding_dim))
    for word, embed in embeddings.items():
        embed_np = np.array(embed)
        if word in w2i.keys():
            out[w2i[word]] = embed_np

CPU times: user 1min 37s, sys: 1min 33s, total: 3min 10s
Wall time: 3min 21s


In [53]:
# reads in the test data;
test = read_dataset("wnut17/data/test")

In [54]:
# Start DyNet and define trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

In [55]:
# Define the model
EMB_SIZE = len(out[0])
HID_SIZE = 64
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # Word embeddings
W_emb.init_from_array(out)
lstm_builders = [dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model), 
                 dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)] # fwd and bwd LSTM
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))  # Softmax weights
b_sm = model.add_parameters((ntags))  # Softmax bias

In [60]:
def build_tagging_graph(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @return list of error for each tag
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    errs = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of a single word from LSTM
        predicted = W_sm_exp * complete_rep + b_sm_exp
        err = dy.pickneglogsoftmax(predicted, tag)
        errs.append(err)
    return dy.esum(errs)

In [61]:
def tag_sent(sent):
    """
    Builds the comp graph for the model with:
    * Embeddings
    * BiLSTM
    @ return list of (word, predicted labels)
    """
    dy.renew_cg()
    fwd_init, bwd_init = [b.initial_state() for b in lstm_builders]
    word_embs = [dy.lookup(W_emb, word) for word, tag in sent]
    
    fwd_embs = [x.output() for x in fwd_init.add_inputs(word_embs)]
    bwd_embs = [x.output() for x in bwd_init.add_inputs(reversed(word_embs))]
    
    W_sm_exp = dy.parameter(W_sm)
    b_sm_exp = dy.parameter(b_sm)
    
    predicted_labels = []
    for (word, tag), f_rep, b_rep in zip(sent, fwd_embs, reversed(bwd_embs)):
        complete_rep = dy.concatenate([f_rep, b_rep]) # complete rep of word from LSTM
        scores = (W_sm_exp * complete_rep + b_sm_exp).npvalue()
        predict = np.argmax(scores)
        predicted_labels.append((word, predict))
    return predicted_labels

In [62]:
iter_max = 50 # max num. of iterations

for ITER in range(iter_max):
    """
    Trains a neural network using the defined computational graph.
    Prints the accuracy per iteration and outputs the predictions in the final
    iteration.
    
    Args:
        iter_max: the maximum number of iterations to train the network.
    """
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    if ITER == iter_max - 1: # final iteration
        out = open('predicted/10labels/model-2.txt', 'w')
    for sent in train:
        sent_error = build_tagging_graph(sent)
        train_loss += sent_error.value()
        sent_error.backward()
        trainer.update()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start))
    
    total_acc = 0.0
    for sent in test:
        p_labels = tag_sent(sent)
        g_labels = [tags for word, tags in sent]
        test_correct = 0
        for i, p_g in enumerate(zip(p_labels, g_labels)):
            word = p_g[0][0]
            predicted = p_g[0][1]
            gold = p_g[1]
            if predicted == gold:
                test_correct += 1
            if ITER == iter_max - 1:
                out.write(i2w(word) + '\t' + i2t(gold) + '\t' + i2t(predicted) + '\n') # writes the word, gold tag, and predicted tag
                if i == (len(p_g) - 1):
                    out.write('\n')
        total_acc += test_correct / len(g_labels)
    print("iter %r: test acc=%.4f" % (ITER, total_acc / len(test)))
out.close()

iter 0: train loss/sent=0.0332, time=13.32s
iter 0: test acc=0.8939
iter 1: train loss/sent=0.0278, time=12.99s
iter 1: test acc=0.9003
iter 2: train loss/sent=0.0377, time=12.98s
iter 2: test acc=0.8917
iter 3: train loss/sent=0.0283, time=13.00s
iter 3: test acc=0.8963
iter 4: train loss/sent=0.0419, time=12.96s
iter 4: test acc=0.8949
iter 5: train loss/sent=0.0348, time=12.98s
iter 5: test acc=0.8970
iter 6: train loss/sent=0.0321, time=12.96s
iter 6: test acc=0.9037
iter 7: train loss/sent=0.0303, time=13.07s
iter 7: test acc=0.9046
iter 8: train loss/sent=0.0246, time=13.02s
iter 8: test acc=0.9041
iter 9: train loss/sent=0.0271, time=13.05s
iter 9: test acc=0.9078
iter 10: train loss/sent=0.0295, time=12.99s
iter 10: test acc=0.9077
iter 11: train loss/sent=0.0276, time=12.97s
iter 11: test acc=0.9073
iter 12: train loss/sent=0.0276, time=12.98s
iter 12: test acc=0.9088
iter 13: train loss/sent=0.0256, time=12.98s
iter 13: test acc=0.9107
iter 14: train loss/sent=0.0368, time=12

In [63]:
# runs conll evaluation script; minor change for python3 in file (line 155)
!python3 conlleval.py predicted/10labels/model-2.txt

processed 61908 tokens with 3621 phrases; found: 1488 phrases; correct: 362.
accuracy:  90.25%; precision:  24.33%; recall:  10.00%; FB1:  14.17
          company: precision:  59.20%; recall:  11.58%; FB1:  19.37  125
         facility: precision:   3.12%; recall:   1.13%; FB1:   1.66  96
          geo-loc: precision:  60.85%; recall:  16.12%; FB1:  25.49  235
            movie: precision:   0.00%; recall:   0.00%; FB1:   0.00  14
      musicartist: precision:  10.71%; recall:   3.06%; FB1:   4.76  56
            other: precision:  36.52%; recall:  10.42%; FB1:  16.21  178
           person: precision:   8.64%; recall:  12.55%; FB1:  10.24  729
          product: precision:  24.14%; recall:   2.46%; FB1:   4.47  29
       sportsteam: precision:   4.76%; recall:   0.66%; FB1:   1.16  21
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  5
