In [175]:
from __future__ import print_function
from collections import defaultdict, namedtuple
import fileinput
import sys


In [176]:
def get_sents(lines):
    """
    Args:
        lines (Iterable[str]): the lines

    Yields:
        List[str]: the lines delimited by an empty line
    """
    sent = []
    stripped_lines = (line.strip() for line in lines)
    for line in stripped_lines:
        if line == '':
            yield sent
            sent = []
        else:
            sent.append(line)
    yield sent

In [177]:
Token = namedtuple('Token', 'sent_id word_id word bio tag')
wnut_bio = ('B', 'I', 'O')
wnut_tags = ('corporation', 'creative-work', 'group', 'location', 'person', 'product')


In [178]:
def make_tok(word, bio_tag, sent_id=-1, word_id=-1):
    """
    Args:
        word (str): the surface form of the word
        bio_tag (str): the tag with BIO annotation
        sent_id (int): the sentence ID
        word_id (int): the word ID

    Returns:
        Token

    Raises:
        ValueError
    """
    if bio_tag == 'O':
        bio, tag = 'O', 'O'
    else:
        bio, tag = bio_tag.split('-', 1)
        if bio not in wnut_bio or tag not in wnut_tags:
            raise ValueError('Invalid tag: %s %s %d %d' % (word, bio_tag, sent_id, word_id))
    return Token(sent_id, word_id, word, bio, tag)


def token_to_conll(tok):
    """
    Args:
        tok (Token): 

    Returns:
        str:
    """
    return '%s\t%s' % (tok.word, tok.tag if tok.tag == 'O' else '%s-%s' % (tok.bio, tok.tag))

In [179]:
def line_to_toks(line, sent_id=-1, word_id=-1):
    """
    Args:
        line (str): the input line
        sent_id (int): the current sentence ID
        word_id (int): the current word ID

    Returns:
        Dictionary[str,Token]: the gold and guess tokens stored in a dict with keys for gold and guess

    Raises:
        ValueError
    """
    def make_lbl(i):
        return 'gold' if i == 0 else 'sys_%d' % i

    try:
        fields = line.split('\t')
        word = fields[0]
        return {make_lbl(i): make_tok(word, bio_tag, sent_id, word_id)
                for i, bio_tag in enumerate(fields[1:])}
    except ValueError:
        raise ValueError('Invalid line: %s %d %d' % (line, sent_id, word_id))


def sent_to_toks(sent, sent_id=-1):
    """
    Args:
        sent (Iterator[str]): the lines that comprise a sentence
        sent_id (int): the sentence ID

    Returns:
        Dictionary[str,List[Token]]: the gold and guess tokens for each word in the sentence,
        stored in a dict with keys for gold and guess
    """
    toks = defaultdict(list)
    for word_id, line in enumerate(sent):
        for src, tok in line_to_toks(line, sent_id, word_id).items():
            toks[src].append(tok)
    return toks


In [180]:
Entity = namedtuple('Entity', 'words sent_id word_id_start word_id_stop tag')

In [181]:
def entity_to_tokens(entity):
    """
    Args:
        entity (Entity): 

    Returns:
        List[Token]: 
    """
    def get_bio(_i):
        if entity.tag == 'O':
            return 'O'
        elif _i == 0:
            return 'B'
        else:
            return 'I'

    return [Token(entity.sent_id, entity.word_id_start + i, word, get_bio(i), entity.tag)
            for i, word in enumerate(entity.words)]


In [182]:
def entity_to_conll(entity):
    """
    Args:
        entity (Entity): 

    Returns:
        List[str]: a conll-formatted token tag
    """
    return [token_to_conll(tok) for tok in entity_to_tokens(entity)]


def get_phrases(entities):
    """
    Args:
        entities (Iterable[Entity]): 

    Returns:
        Set[Tuple[str]]
    """
    return {entity.words for entity in entities}

In [183]:
def get_phrases_and_tags(entities):
    """
    Args:
        entities (Iterable[Entity]): 

    Returns:
        Set[Tuple[Tuple[str],str]]:
    """
    return {(entity.words, entity.tag) for entity in entities}


def toks_to_entities(toks):
    """
    Args:
        toks (Iterable[Token]): the tokens in a sentence

    Returns:
        Iterable[Entity]: the corresponding entities in a sentence

    Raises:
        ValueError
    """
    def make_entity(tok):
        return Entity((tok.word, ), tok.sent_id, tok.word_id, tok.word_id+1, tok.tag)

    def extend_entity(entity, tok):
        return Entity(entity.words + (tok.word, ), entity.sent_id, entity.word_id_start, tok.word_id+1, entity.tag)

    def reducer(_entities, tok):
        last = _entities.pop()
        if tok.bio == 'I' and tok.tag == last.tag:
            entity = extend_entity(last, tok)
            _entities.append(entity)
        elif tok.bio == 'B' or (tok.bio == 'O' and tok.tag == 'O'):
            entity = make_entity(tok)
            _entities.extend([last, entity])
        # invalid token sequence tag1 => I-tag2: interpret as tag1 => B-tag2
        elif tok.bio == 'I' and tok.tag != last.tag:
            print('Invalid tag sequence: %s => %s' % (last, tok), file=sys.stderr)
            entity = make_entity(tok)
            _entities.extend([last, entity])
        else:
            raise ValueError('Invalid tag sequence: %s %s' % (last, tok))
        return _entities

    return reduce(reducer, toks[1:], [make_entity(toks[0]), ])


In [184]:
def non_other(entity):
    # type: (Entity) -> bool
    """
    Args:
        entity (Entity): 

    Returns:
        bool
    """
    return entity.tag != 'O'


def filter_entities(entities, p):
    """
    Args:
        entities (Iterable[Entity]): the entities in a sentence
        p (Call[[Entity],bool): the predicate

    Returns:
        List(Entity): the entities filtered by predicate p
    """
    return [entity for entity in entities if p(entity)]


def drop_other_entities(entities):
    """
    Args:
        entities (Iterable[Entity]): 

    Returns:
        Iterator[Entity]
    """
    return filter_entities(entities, non_other)


In [185]:

def doc_to_tokses(lines):
    """
    Args:
        lines (Iterable[str]): the lines in a document

    Returns:
        Dictionary[str,List[List[Tokens]]]: a nested list of list of tokens,
        with one list for each sentence, stored in a dict with keys for gold and guess
    """
    
    sents = get_sents(lines)
    tokses = defaultdict(list)
    for sent_id, sent in enumerate(sents):
        for src, toks in sent_to_toks(sent, sent_id).items():
            tokses[src].append(toks)
    return tokses


def flatten(nested):
    """
    Args:
        nested (Iterable[Iterable[T]]): a nested iterator

    Returns:
        List[T]: the iterator flattened into a list
    """
    return [x for xs in nested for x in xs]


def doc_to_toks(lines):
    """
    Args:
        lines (Iterator[str]): the lines in a document

    Returns:
        Dictionary[str,List[Tokens]]: a lists of all tokens in the document,
        stored in a dict with keys for gold and guess
    """
    return {src: flatten(nested)
            for src, nested in doc_to_tokses(lines).items()}



In [186]:
def doc_to_entitieses(lines):
    """
    Args:
        lines (Iterator[str]): the lines in a document

    Returns:
        Dictionary[str,List[List[Entity]]]: a nested list of lists of entities,
        stored in a dict with keys for gold and guess

    """
    entitieses = defaultdict(list)
    for src, tokses in doc_to_tokses(lines).items():
        entitieses[src] = [toks_to_entities(toks) for toks in tokses]
    return entitieses


def doc_to_entities(lines):
    """
    Args:
        lines (Iterator[str]): the lines in a document

    Returns:
        Dictionary[str,List[Entities]]: a lists of all entities in the document,
        stored in a dict with keys for gold and guess
    """
    return {src: flatten(nested)
            for src, nested in doc_to_entitieses(lines).items()}


def get_tags(entities):
    """
    Args:
        entities (Iterable[Entity]): the entities in a sentence

    Returns:
        Set[str]: a set of their tags, excluding 'O'
    """
    return {entity.tag for entity in entities} - {'O'}




In [187]:
Results = namedtuple('Results', 'gold guess correct p r f')

In [188]:
def get_tagged_entities(entities):
    """
    Args:
        entities (Dict[str,List[Entity]]): 

    Returns:
        Dict[str,List[Entity]]
    """
    return {src: drop_other_entities(entities)
            for src, entities in entities.items()}


def get_correct(gold, guess):
    """
    Args:
        gold (Iterable[T]): 
        guess (Iterable[T]): 

    Returns:
        Set[T]
    """
    return set(gold) & set(guess)


def get_tp(gold, guess):
    """
    Args:
        gold (Iterable[T]): 
        guess (Iterable[T]): 

    Returns:
        Set[T]
    """
    return get_correct(gold, guess)


def get_fn(gold, guess):
    """
    Args:
        gold (Iterable[T]): 
        guess (Iterable[T]): 

    Returns:
        Set[T]
    """
    return set(gold) - set(guess)


def get_fp(gold, guess):
    """
    Args:
        gold (Iterable[T]): 
        guess (Iterable[T]): 

    Returns:
        Set[T]
    """
    return set(guess) - set(gold)


def get_tn(tp, fp, fn, _all):
    """
    Args:
        tp (Set[T]): 
        fp (Set[T]): 
        fn (Set[T]):
        _all (Iterable[T]):

    Returns:
        Set[T]
    """
    return set(_all) - tp - fp - fn


def get_tp_fp_fn_tn(gold, guess, _all):
    """
    Args:
        gold (Iterator[T]): 
        guess (Iterator[T]): 
        _all (Iterator[T]):

    Returns:
        Tuple[Set[str],Set[str],Set[str],Set[str]]:
    """
    tp = get_tp(gold, guess)
    fp = get_fp(gold, guess)
    fn = get_fn(gold, guess)
    tn = get_tn(tp, fp, fn, _all)
    return tp, fp, fn, tn


def get_tp_fp_fn_tn_phrases(gold, guess, _all):
    """
    Args:
        gold: List[Entity]
        guess: List[Entity]
        _all: List[Entity]

    Returns:
        Tuple[Set[str],Set[str],Set[str],Set[str]]:
    """
    all_phrases = get_phrases(_all)
    gold_phrases = get_phrases(gold)
    guess_phrases = get_phrases(guess)
    correct_phrases = get_phrases(get_correct(gold, guess))
    tp = correct_phrases
    fp = guess_phrases - tp
    fn = gold_phrases - tp
    tn = get_tn(tp, fp, fn, all_phrases)
    return tp, fp, fn, tn

In [189]:

def calc_results(gold_entities, guess_entities, surface_form=False):
    """
    Args:
        gold_entities (List[Entity]): the gold standard entity annotations
        guess_entities (List[Entity]): a system's entity guesses
        surface_form (bool): whether or not to calculate f1-scores on the entity surface forms

    Returns:
        Results: the results stored in a namedtuple
    """
    # get the correct system guesses by taking the intersection of gold and guess entities,
    # taking into account tags and document locations
    correct_entities = get_correct(gold_entities, guess_entities)
    if surface_form:  # count only unique surface forms when True
        correct_entities = get_phrases_and_tags(correct_entities)
        gold_entities = get_phrases_and_tags(gold_entities)
        guess_entities = get_phrases_and_tags(guess_entities)

    gold = len(gold_entities)
    guess = len(guess_entities)
    correct = len(correct_entities)

    try:
        p = correct / float(guess)
    except ZeroDivisionError:
        p = 0.0
    try:
        r = correct / float(gold)
    except ZeroDivisionError:
        r = 0.0
    try:
        f = 2.0 * p * r / (p + r)
    except ZeroDivisionError:
        f = 0.0

    return Results(gold, guess, correct, p, r, f)

In [190]:
def fmt_results(tokens, all_entities, surface_form=False):
    """
    Args:
        tokens (Dict[str,List[Tokens]): a dictionary of gold and guess tokens
        all_entities (Dict[str,List[Entity]): a dictionary of gold and guess entities
        surface_form (bool): whether or not to calculate f1-scores on the entity surface forms

    Yield:
        str: (near) W-NUT format evaluation results
    """
    _sys = 'sys_1'
    # throw out 'O' tags to get overall p/r/f
    tagged_entities = get_tagged_entities(all_entities)
    results = {'all': calc_results(all_entities['gold'], all_entities[_sys], surface_form=False),
               'tagged': calc_results(tagged_entities['gold'], tagged_entities[_sys], surface_form),
               'tokens': calc_results(tokens['gold'], tokens[_sys], surface_form=False)}

    yield('processed %d tokens with %d phrases; ' %
          (results['tokens'].gold, results['tagged'].gold))
    yield('found: %d phrases; correct: %d.\n' %
          (results['tagged'].guess, results['tagged'].correct))

    if results['tokens'].gold > 0:
        # only use token counts for accuracy
        yield('accuracy: %6.2f%%; ' %
              (100. * results['tokens'].correct / results['tokens'].gold))
        yield('precision: %6.2f%%; ' % (100. * results['tagged'].p))
        yield('recall: %6.2f%%; ' % (100. * results['tagged'].r))
        yield('FB1: %6.2f\n' % (100. * results['tagged'].f))

    # get results for each entity category
    tags = get_tags(all_entities['gold'])
    for tag in sorted(tags):
        entities = {src: filter_entities(entities, lambda e: e.tag == tag)
                    for src, entities in all_entities.items()}
        results = calc_results(entities['gold'], entities[_sys], surface_form)
        yield('%17s: ' % tag)
        yield('precision: %6.2f%%; ' % (100. * results.p))
        yield('recall: %6.2f%%; ' % (100. * results.r))
        yield('FB1: %6.2f  %d\n' % (100. * results.f, results.correct))


# Training NER - WNUT17 Dataset

In [4]:
import sys, os
# Make sure you put the mitielib folder into the python search path.  There are
# a lot of ways to do this, here we do it programmatically with the following
# two statements:
parent = os.path.dirname(os.path.realpath('__file__'))
sys.path.append(parent + '/MITIE/mitielib')
#print(sys.path)

from mitie import *

In [None]:
### my code ###
words = []
tags = []
with open('/home/saad/Lab_Proj/emerging.dev.conll') as file:

        for line in file:
            s = (line.rstrip('\n')).split('\t')
            if s[0]!='':
                words.append(s[0])
                tags.append(s[1])
                #print(s[1])
#print(type(t))
#print(type(words))
print(len(words))

In [None]:
sample = ner_training_instance(words[0:9999])
s = [] 
for i in range(0,9999):
    #print(tags[i])
    #sample.add_entity(xrange(0,len(words[i])), tags[i])
    #print("%d:%d : %s" % (i,i+1,tags[i]))
    sample.add_entity(xrange(i,i+1), tags[i])
#sample.add_entity(xrange(3,5), "person")
#sample.add_entity(xrange(9,10), "org")




# And we add another training example
#sample2 = ner_training_instance(["The", "other", "day", "at", "work", "I", "saw", "Brian", "Smith", "from", "CMU", "."])
#sample2.add_entity(xrange(7,9), "person")
#sample2.add_entity(xrange(10,11), "org")

In [None]:
trainer = ner_trainer("/home/saad/MITIE/MITIE-models/english/total_word_feature_extractor.dat")
trainer.add(sample)
trainer.num_threads = 4

In [None]:
#ner = trainer.train()
#ner.save_to_disk("new_ner_model.dat")

In [None]:
print ("tags:", ner.get_possible_ner_tags())

In [None]:
tokens = ["I", "met", "with", "John", "Becker", "at", "HBU", "."]
entities = ner.extract_entities(tokens)
# Happily, it found the correct answers, "John Becker" and "HBU" in this case which we
# print out below.
print ("\nEntities found:", entities)
print ("\nNumber of entities detected:", len(entities))
for e in entities:
    range = e[0]
    tag = e[1]
    entity_text = " ".join(tokens[i] for i in range)
    print ("    " + tag + ": " + entity_text)

## Loading Trained Model and testing on WNut test dataset

In [8]:
import sys, os
# Make sure you put the mitielib folder into the python search path.  There are
# a lot of ways to do this, here we do it programmatically with the following
# two statements:
parent = os.path.dirname(os.path.realpath('__file__'))
sys.path.append(parent + '/MITIE/mitielib')

from mitie import *
from collections import defaultdict
import numpy as np

In [38]:


print("loading MITIE's OWN NER model...")
ner_model = named_entity_extractor('MITIE/MITIE-models/english/ner_model.dat')


loading MITIE's OWN NER model...
loading MITIE's WNUT Trained NER model...
('Tags output by this NER model:', ['O', 'B-location', 'I-location', 'B-person', 'I-person', 'B-corporation', 'I-corporation'])


#### Wnut - model

In [None]:
print("loading MITIE's WNUT Trained NER model...")
ner_w_model = named_entity_extractor('MITIE/MITIE-models/english/new_ner_model.dat')

print("Tags output by this NER model:", ner_w_model.get_possible_ner_tags())

In [25]:
tokens = []
with open('data/emerging.test.conll') as file:
        for line in file:
            s = (line.rstrip('\n')).split('\t')
            if s[0]!='':
                tokens.append(s[0])

#### Extracting Entities MITIE NER 

In [29]:
mitie_entities = ner_model.extract_entities(tokens)

#print("\nEntities found:", entities[0])
print("Number of entities detected by mitie model excluding 'O' :", len(mitie_entities))




("Number of entities detected by mitie model excluding 'O' :", 655)


#### Extracting Entities MITIE WNUT17 NER 

In [32]:
model_w_entities = ner_w_model.extract_entities(tokens)
print("\nNumber of entities detected by mitienut17 model:", len(model_w_entities))


('\nNumber of entities detected by mitienut17 model:', 22672)


In [35]:
with open('w_output.txt', 'w') as the_file:
    for e in model_w_entities:
        range = e[0]
        tag = e[1]
        score = e[2]
        score_text = "{:0.3f}".format(score)
        entity_text = " ".join(tokens[i] for i in range)   
        the_file.write(entity_text+"\t"+tag+"\n")
        print("   Score: " + score_text + ": " + tag + ": " + entity_text)
        #print("   Score: " + score_text + ": " + tag )


   Score: 0.974: O: &
   Score: 0.839: O: gt
   Score: 1.222: O: ;
   Score: 1.073: O: *
   Score: 0.820: O: The
   Score: 0.774: O: soldier
   Score: 1.155: O: was
   Score: 1.226: O: killed
   Score: 1.431: O: when
   Score: 1.141: O: another
   Score: 1.196: O: avalanche
   Score: 1.422: O: hit
   Score: 1.312: O: an
   Score: 1.001: O: army
   Score: 1.317: O: barracks
   Score: 1.454: O: in
   Score: 1.213: O: the
   Score: 1.217: O: northern
   Score: 1.164: O: area
   Score: 1.062: O: of
   Score: 0.406: O: Sonmarg
   Score: 1.365: O: ,
   Score: 1.182: O: said
   Score: 1.176: O: a
   Score: 1.297: O: military
   Score: 0.832: O: spokesman
   Score: 1.191: O: .
   Score: 1.292: O: &
   Score: 0.958: O: gt
   Score: 1.144: O: ;
   Score: 0.984: O: *
   Score: 0.804: O: Police
   Score: 0.911: O: last
   Score: 0.956: O: week
   Score: 1.042: O: evacuated
   Score: 0.947: O: 80
   Score: 0.830: O: villagers
   Score: 1.405: O: from
   Score: 0.550: O: Waltengoo
   Score: 0.572: O

   Score: 1.503: O: or
   Score: 1.304: O: undermined
   Score: 1.375: O: their
   Score: 1.171: O: interests
   Score: 1.539: O: ?
   Score: 1.318: O: "
   Score: 0.961: O: He
   Score: 1.501: O: got
   Score: 1.571: O: sent
   Score: 1.687: O: off
   Score: 1.342: O: in
   Score: 1.039: O: the
   Score: 1.125: O: last
   Score: 1.485: O: game
   Score: 1.097: O: vs
   Score: 1.449: O: .
   Score: 1.029: O: https://www.reddit.com/r/newzealand/comments/5p0edk/comment/dcnrf9t?st=IY5U4XSO&amp;sh=9fd7c3a9Ijustcouldn'tseetheedgeoftheNewplymouthradarcircleonOP'spic.
   Score: 0.855: O: COYB
   Score: 1.362: O: !
   Score: 1.036: O: Oh
   Score: 0.761: O: well
   Score: 1.216: O: .
   Score: 1.521: O: /
   Score: 1.105: O: r
   Score: 1.381: O: /
   Score: 0.957: O: politics
   Score: 1.088: O: yesterday
   Score: 1.259: O: :
   Score: 0.741: B-person: Trump
   Score: 1.520: O: has
   Score: 1.449: O: n
   Score: 1.348: O: '
   Score: 1.290: O: t
   Score: 1.598: O: had
   Score: 1.243: O: a

   Score: 1.068: O: ]
   Score: 1.156: O: (
   Score: 0.802: O: https://www.reddit.com/r/worldnews/comments/54khw0/saudi_women_file_petition_to_end_male/d82ygzg/
   Score: 1.088: O: )
   Score: 1.326: O: .
   Score: 1.484: O: I
   Score: 1.172: O: remember
   Score: 1.528: O: around
   Score: 1.224: O: a
   Score: 1.199: O: year
   Score: 1.118: O: ago
   Score: 1.093: O: there
   Score: 1.579: O: was
   Score: 1.163: O: a
   Score: 1.605: O: post
   Score: 1.614: O: about
   Score: 1.264: O: how
   Score: 0.721: B-person: Ronaldo
   Score: 1.249: O: had
   Score: 1.271: O: scored
   Score: 1.484: O: over
   Score: 1.219: O: 40
   Score: 1.244: O: %
   Score: 1.168: O: of
   Score: 1.244: O: '
   Score: 1.433: O: s
   Score: 1.351: O: goals
   Score: 1.553: O: since
   Score: 0.998: O: 2009
   Score: 1.626: O: .
   Score: 1.325: O: They
   Score: 1.377: O: actually
   Score: 1.340: O: regularly
   Score: 0.957: O: upvote
   Score: 1.421: O: posts
   Score: 1.459: O: to
   Score: 1.190:

   Score: 1.066: O: format
   Score: 1.046: O: ]
   Score: 1.089: O: (
   Score: 0.610: O: https://www.reddit.com/r/soccer/comments/5n5fg7/oc_an_innovative_format_to_save_the_world_cup_and/
   Score: 1.210: O: )
   Score: 1.266: O: Thanks
   Score: 0.910: O: to
   Score: 0.982: O: u
   Score: 1.225: O: /
   Score: 0.475: B-person: BordNaMonaLisa
   Score: 1.400: O: for
   Score: 1.180: O: reminding
   Score: 1.396: O: me
   Score: 1.098: O: of
   Score: 1.338: O: this
   Score: 1.590: O: .
   Score: 1.576: O: I
   Score: 1.441: O: just
   Score: 1.335: O: wrote
   Score: 1.158: O: a
   Score: 1.438: O: fucking
   Score: 1.342: O: novel
   Score: 1.173: O: regarding
   Score: 0.507: O: BLM
   Score: 1.248: O: '
   Score: 1.442: O: s
   Score: 1.337: O: use
   Score: 1.368: O: of
   Score: 1.085: O: blanket
   Score: 1.294: O: statements
   Score: 1.363: O: [
   Score: 1.239: O: here
   Score: 1.255: O: .
   Score: 1.228: O: '
   Score: 1.217: O: Many
   Score: 1.036: O: wounded
   Score

   Score: 1.382: O: the
   Score: 1.368: O: COM
   Score: 1.542: O: and
   Score: 1.507: O: the
   Score: 1.544: O: moment
   Score: 1.143: O: it
   Score: 1.010: O: produces
   Score: 1.337: O: .
   Score: 1.566: O: And
   Score: 1.131: O: one
   Score: 1.181: O: might
   Score: 1.034: O: also
   Score: 1.140: O: ask
   Score: 1.243: O: .
   Score: 1.265: O: .
   Score: 1.353: O: .
   Score: 1.066: O: *
   Score: 1.374: O: what
   Score: 1.481: O: was
   Score: 1.373: O: his
   Score: 1.628: O: plan
   Score: 1.296: O: there
   Score: 1.577: O: ,
   Score: 1.092: O: anyway
   Score: 1.370: O: ?
   Score: 1.488: O: !
   Score: 1.037: O: *
   Score: 1.359: O: How
   Score: 1.335: O: on
   Score: 0.797: O: earth
   Score: 1.375: O: did
   Score: 1.203: O: he
   Score: 1.320: O: know
   Score: 1.250: O: where
   Score: 1.230: O: '
   Score: 1.247: O: ll
   Score: 1.345: O: hit
   Score: 1.371: O: and
   Score: 1.034: O: who
   Score: 1.306: O: '
   Score: 1.261: O: ll
   Score: 1.128: O: 

   Score: 1.502: O: This
   Score: 1.552: O: isn
   Score: 1.263: O: ’
   Score: 1.208: O: t
   Score: 1.224: O: *
   Score: 1.592: O: that
   Score: 1.035: O: *
   Score: 1.046: O: unrealistic
   Score: 1.529: O: for
   Score: 1.118: O: an
   Score: 1.261: O: item
   Score: 1.119: O: ’
   Score: 0.949: O: s
   Score: 0.943: O: value
   Score: 1.301: O: (
   Score: 1.245: O: as
   Score: 1.024: O: a
   Score: 1.606: O: gift
   Score: 1.516: O: ,
   Score: 1.360: O: it
   Score: 1.162: O: kind
   Score: 0.970: O: of
   Score: 1.582: O: is
   Score: 1.061: O: though
   Score: 1.207: O: )
   Score: 1.089: O: .
   Score: 1.185: O: Do
   Score: 0.706: O: politicians
   Score: 1.420: O: ever
   Score: 1.210: O: admit
   Score: 1.051: O: they
   Score: 1.512: O: made
   Score: 1.366: O: a
   Score: 1.751: O: bad
   Score: 1.460: O: decision
   Score: 1.519: O: ?
   Score: 1.590: O: I
   Score: 1.331: O: still
   Score: 1.712: O: think
   Score: 1.558: O: that
   Score: 1.098: O: with
   Score

   Score: 1.317: O: more
   Score: 1.242: O: prayers
   Score: 1.580: O: or
   Score: 0.969: O: religious
   Score: 1.065: O: speeches
   Score: 1.218: O: happened
   Score: 1.225: O: ,
   Score: 1.574: O: and
   Score: 1.082: O: one
   Score: 1.282: O: of
   Score: 1.169: O: them
   Score: 1.613: O: was
   Score: 1.650: O: by
   Score: 1.329: O: a
   Score: 0.613: O: Rabbi
   Score: 1.498: O: .
   Score: 1.500: O: this
   Score: 1.644: O: goes
   Score: 1.531: O: to
   Score: 1.239: O: the
   Score: 1.268: O: point
   Score: 1.352: O: I
   Score: 1.416: O: made
   Score: 1.533: O: below
   Score: 1.171: O: -
   Score: 1.258: O: there
   Score: 1.351: O: are
   Score: 1.114: O: many
   Score: 1.383: O: different
   Score: 1.098: O: flavors
   Score: 1.144: O: of
   Score: 1.145: O: "
   Score: 1.175: O: secular
   Score: 1.036: O: "
   Score: 1.187: O: ,
   Score: 1.520: O: just
   Score: 1.441: O: as
   Score: 1.341: O: there
   Score: 1.458: O: are
   Score: 1.136: O: many
   Score: 

   Score: 1.220: O: show
   Score: 1.386: O: ?
   Score: 1.382: O: .
   Score: 1.441: O: .
   Score: 1.534: O: .
   Score: 0.778: O: Er
   Score: 1.347: O: ,
   Score: 1.604: O: did
   Score: 1.368: O: n
   Score: 1.566: O: '
   Score: 1.522: O: t
   Score: 1.491: O: it
   Score: 1.390: O: say
   Score: 1.216: O: on
   Score: 1.177: O: the
   Score: 1.398: O: page
   Score: 1.135: O: where
   Score: 1.031: O: it
   Score: 1.123: O: was
   Score: 1.178: O: from
   Score: 1.325: O: ?
   Score: 1.016: O: Looking
   Score: 1.318: O: for
   Score: 1.247: O: a
   Score: 1.313: O: movie
   Score: 1.386: O: where
   Score: 1.070: O: there
   Score: 1.602: O: is
   Score: 1.202: O: a
   Score: 1.127: O: Powerful
   Score: 0.945: O: Board
   Score: 0.937: O: Game
   Score: 1.540: O: played
   Score: 1.537: O: by
   Score: 1.163: O: two
   Score: 0.679: O: brothers
   Score: 1.407: O: ?
   Score: 1.273: O: my
   Score: 1.093: O: concern
   Score: 1.564: O: is
   Score: 1.652: O: about
   Score: 1

   Score: 1.598: O: is
   Score: 1.476: O: going
   Score: 1.446: O: to
   Score: 0.979: O: vindicate
   Score: 0.853: O: someone
   Score: 1.071: O: .
   Score: 0.948: O: @
   Score: 0.597: O: itpastorn
   Score: 0.930: O: yes
   Score: 1.330: O: ,
   Score: 1.450: O: but
   Score: 1.532: O: I
   Score: 1.406: O: think
   Score: 1.679: O: you
   Score: 1.234: O: can
   Score: 1.201: O: also
   Score: 1.403: O: take
   Score: 1.241: O: the
   Score: 1.640: O: ones
   Score: 1.591: O: that
   Score: 1.089: O: are
   Score: 1.390: O: described
   Score: 1.270: O: as
   Score: 1.500: O: evidence
   Score: 1.710: O: that
   Score: 1.243: O: there
   Score: 1.536: O: generally
   Score: 1.179: O: *
   Score: 1.239: O: are
   Score: 1.087: O: *
   Score: 1.065: O: holidays
   Score: 1.275: O: ,
   Score: 1.505: O: probably
   Score: 1.247: O: many
   Score: 1.325: O: different
   Score: 1.599: O: ones
   Score: 1.296: O: in
   Score: 0.962: O: many
   Score: 1.112: O: different
   Score: 1.0

   Score: 1.540: O: Being
   Score: 1.367: O: sexually
   Score: 1.117: O: frustrated
   Score: 1.471: O: is
   Score: 1.545: O: the
   Score: 1.703: O: worst
   Score: 0.894: O: RT
   Score: 1.044: O: @
   Score: 0.750: O: CreepBJ
   Score: 1.187: O: :
   Score: 1.322: O: Wow
   Score: 1.270: O: !
   Score: 1.506: O: !
   Score: 1.211: O: See
   Score: 1.262: O: thru
   Score: 1.132: O: leggings
   Score: 1.433: O: are
   Score: 1.656: O: awesome
   Score: 1.204: O: https://t.co/YPSuF6GgVm
   Score: 1.053: O: via
   Score: 0.996: O: @
   Score: 0.353: O: SexySights
   Score: 0.904: O: @
   Score: 0.537: O: SexyCreeps
   Score: 0.947: O: @
   Score: 0.544: O: CreepShotsLive
   Score: 1.013: O: https://t.co/0TwauVET…
   Score: 1.250: O: Just
   Score: 1.497: O: posted
   Score: 1.343: O: a
   Score: 1.346: O: photo
   Score: 1.238: O: https://t.co/mi0kwHp0vl
   Score: 0.601: O: Legal
   Score: 0.609: O: Assistant
   Score: 0.722: O: Location
   Score: 1.110: O: :
   Score: 0.852: O: Los

   Score: 1.263: O: @
   Score: 1.255: O: 69
   Score: 1.084: O: sexxxo
   Score: 1.092: O: @
   Score: 0.677: O: Teddys
   Score: 1.352: O: _
   Score: 1.365: O: takeover
   Score: 1.126: O: @
   Score: 0.724: O: irinagomez
   Score: 0.916: O: 60
   Score: 1.114: O: @
   Score: 0.559: O: Chicas
   Score: 1.226: O: _
   Score: 0.909: O: Web
   Score: 0.955: O: @
   Score: 0.587: O: VirtuAss
   Score: 0.949: O: @
   Score: 0.866: O: sexotx
   Score: 1.078: O: @
   Score: 0.773: O: tias
   Score: 1.240: O: _
   Score: 0.782: O: twiter
   Score: 0.941: O: @
   Score: 0.373: O: PornHardd
   Score: 1.076: O: @
   Score: 0.567: O: ophelia
   Score: 0.851: O: 231013
   Score: 1.016: O: …
   Score: 1.218: O: @
   Score: 1.058: O: rmathis
   Score: 1.108: O: 15
   Score: 0.577: B-person: Nicole
   Score: 1.267: O: '
   Score: 1.275: O: s
   Score: 1.238: O: sorbet
   Score: 0.948: O: gum
   Score: 1.234: O: ?
   Score: 0.998: O: @
   Score: 0.414: O: grigiosluts
   Score: 0.939: O: @
   Score: 

   Score: 0.993: O: :
   Score: 1.192: O: What
   Score: 1.741: O: if
   Score: 1.456: O: this
   Score: 1.547: O: account
   Score: 1.553: O: that
   Score: 1.608: O: is
   Score: 1.472: O: simply
   Score: 1.481: O: half
   Score: 1.317: O: an
   Score: 0.959: O: onion
   Score: 1.346: O: in
   Score: 1.009: O: a
   Score: 1.230: O: bag
   Score: 1.301: O: ended
   Score: 1.533: O: up
   Score: 1.227: O: with
   Score: 1.386: O: more
   Score: 1.303: O: followers
   Score: 1.539: O: than
   Score: 1.065: O: …
   Score: 0.809: O: https://t.co/E4qbKeQnqK
   Score: 0.888: O: @
   Score: 0.768: O: CNNPolitics
   Score: 1.117: O: this
   Score: 1.600: O: is
   Score: 1.145: O: a
   Score: 1.580: O: joke
   Score: 1.353: O: right
   Score: 0.486: O: RT
   Score: 0.665: O: @
   Score: 0.441: O: HalfOnionInABag
   Score: 0.821: O: :
   Score: 1.390: O: When
   Score: 1.502: O: you
   Score: 1.234: O: realize
   Score: 1.165: O: you
   Score: 1.235: O: would
   Score: 1.234: O: '
   Score: 1.

   Score: 1.419: O: old
   Score: 1.052: O: classic
   Score: 0.958: O: games
   Score: 1.258: O: !
   Score: 1.087: O: https://t.co/BJNEZfJbge
   Score: 1.051: O: Then
   Score: 0.526: O: raoul
   Score: 1.180: O: spoke
   Score: 0.814: O: m
   Score: 1.276: O: ,
   Score: 1.182: O: her
   Score: 1.396: O: eyes
   Score: 1.367: O: in
   Score: 1.152: O: all
   Score: 1.123: O: their
   Score: 1.267: O: pains
   Score: 1.294: O: ,
   Score: 1.057: O: oh
   Score: 0.778: O: madam
   Score: 0.495: O: mina
   Score: 1.343: O: .
   Score: 1.165: O: https://t.co/BJSV3u2Bo1
   Score: 0.713: O: RT
   Score: 1.193: O: @
   Score: 0.993: O: meoss
   Score: 1.095: O: 31
   Score: 1.347: O: :
   Score: 0.826: B-person: Ed
   Score: 1.405: O: '
   Score: 1.400: O: s
   Score: 1.485: O: -
   Score: 1.859: O: you
   Score: 1.602: O: are
   Score: 1.546: O: the
   Score: 1.522: O: one
   Score: 1.152: O: who
   Score: 1.347: O: makes
   Score: 1.219: O: me
   Score: 1.183: O: happy
   Score: 1.157: O

   Score: 1.371: O: always
   Score: 1.263: O: exist
   Score: 1.251: O: Listen
   Score: 1.442: O: to
   Score: 1.343: O: by
   Score: 0.525: O: RichiDenz
   Score: 1.227: O: #
   Score: 1.386: O: np
   Score: 1.413: O: on
   Score: 1.092: O: #
   Score: 0.864: O: SoundCloud
   Score: 1.114: O: https://t.co/5x6wc5sRqA
   Score: 0.841: O: RT
   Score: 1.127: O: @
   Score: 1.134: O: paleofuture
   Score: 1.172: O: :
   Score: 1.258: O: Seriously
   Score: 1.509: O: ,
   Score: 1.302: O: why
   Score: 1.559: O: is
   Score: 0.593: B-person: Ivanka
   Score: 1.484: O: in
   Score: 1.242: O: these
   Score: 1.228: O: meetings
   Score: 1.068: O: with
   Score: 1.037: O: world
   Score: 0.884: O: leaders
   Score: 1.289: O: ?
   Score: 1.085: O: Has
   Score: 1.086: O: she
   Score: 1.127: O: been
   Score: 1.515: O: given
   Score: 1.312: O: a
   Score: 1.332: O: title
   Score: 1.472: O: yet
   Score: 1.197: O: because
   Score: 1.210: O: this
   Score: 1.474: O: is
   Score: 1.413: O: j

   Score: 1.458: O: .
   Score: 1.374: O: #
   Score: 0.902: O: NDP
   Score: 1.090: O: #
   Score: 1.192: O: abpoli
   Score: 1.001: O: #
   Score: 0.858: O: Notley
   Score: 1.176: O: https://t.co/xbGXAlz2q9
   Score: 1.315: O: #
   Score: 1.444: O: resist
   Score: 1.290: O: #
   Score: 1.470: O: respect
   Score: 1.236: O: https://t.co/S33Aqnzc2r
   Score: 1.510: O: I
   Score: 1.629: O: just
   Score: 1.814: O: want
   Score: 1.422: O: a
   Score: 1.510: O: man
   Score: 1.716: O: that
   Score: 1.486: O: loves
   Score: 1.548: O: me
   Score: 1.407: O: the
   Score: 1.706: O: way
   Score: 1.194: B-person: Jack
   Score: 0.655: I-person: Pearson
   Score: 1.076: O: loves
   Score: 0.732: B-person: Rebecca
   Score: 1.211: O: .
   Score: 0.749: O: RT
   Score: 0.953: O: @
   Score: 0.995: O: twentyonepilots
   Score: 1.237: O: :
   Score: 1.240: O: feb
   Score: 1.424: O: 26
   Score: 1.452: O: n
   Score: 0.836: O: charleston
   Score: 1.268: O: #
   Score: 1.314: O: photos
   Sc

In [36]:
d = []
with open('mitie_output.txt', 'w') as the_file:
    for e in mitie_entities:
        range = e[0]
        tag = e[1]
        score = e[2]
        score_text = "{:0.3f}".format(score)
        entity_text = " ".join(tokens[i] for i in range) 
        #print(len(entity_text.split()))
        if tag == 'ORGANIZATION':
            tag = 'corporation'
        if(len(entity_text.split())>1):
            #print(entity_text)
            for i, val in enumerate(entity_text.split(), 0):
                if(i==0):
                    the_file.write(val+"\t"+('B-'+tag)+"\n")
                else:
                    the_file.write(val+"\t"+('I-'+tag)+"\n")
        else:
            the_file.write(entity_text+"\t"+('B-'+tag)+"\n")
        d.append(entity_text.split())

In [17]:
toks = []
l = [x for x in tokens if x not in d]
for item in tokens:
    if item in d:
        print(item)
        pass
    else:
        toks.append(item)


In [20]:
print(len(toks),len(l))

(22672, 22672)


### Evaluating Results of MITIE Trained NER

In [339]:
with open('data/emerging.test.conll', 'r') as the_file:
    eval_set = the_file.read().splitlines()
print(type(eval_set))

<type 'list'>


In [340]:
with open('w_output.txt', 'r') as the_file:
    w_output = the_file.read().splitlines()
print(type(w_output))
#for x in enumerate(eval_set):
#        eval_set = [j + w_output[] for j in eval_set]

<type 'list'>


In [341]:
with open('mtaner_output', 'r') as the_file:
        nerw_output = the_file.read().splitlines()
        


In [354]:
with open('mtaner_output', 'r') as the_file:
        nerw_output = the_file.read().splitlines()
with open('test.conll', 'r') as the_file:
        test_conll = the_file.read().splitlines()


t = [i.split('\t')[0] for i in w_output]
t2 = [i.split('\t')[0] for i in nerw_output]
#print(t)
#s = set(t).intersection(t2)

#s = list(s)
#print(s)


In [355]:
#for i,x in enumerate(eval_set):   
#    eval_set = [j + w_output[i] for j in eval_set]
#print(s)
mta_tags_only = []
tags_only = [i.split('\t')[1] for i in w_output]
mta_tags_only = [i.split('\t')[1] for i in nerw_output]


#for i,x in enumerate(nerw_output):
#        mta_tags_only.append(x.split('\t')[1])
        #print(x.split('\t')[1])
    
print(len(test_conll))
print(len(mta_tags_only))
print(len(tags_only))
print(len(eval_set))

23394
23394
22672
22672


### MITIEWNUT17 EVal set

In [345]:
for i,x in enumerate(eval_set): 
    print(x + '\t'+tags_only[i])
    eval_set[i] = x + '\t'+tags_only[i]
print(len(eval_set))

&	O	O
gt	O	O
;	O	O
*	O	O
The	O	O
soldier	O	O
was	O	O
killed	O	O
when	O	O
another	O	O
avalanche	O	O
hit	O	O
an	O	O
army	O	O
barracks	O	O
in	O	O
the	O	O
northern	O	O
area	O	O
of	O	O
Sonmarg	B-location	O
,	O	O
said	O	O
a	O	O
military	O	O
spokesman	O	O
.	O	O
&	O	O
gt	O	O
;	O	O
*	O	O
Police	O	O
last	O	O
week	O	O
evacuated	O	O
80	O	O
villagers	O	O
from	O	O
Waltengoo	B-location	O
Nar	I-location	O
where	O	O
dozens	O	O
were	O	O
killed	O	O
after	O	O
a	O	O
series	O	O
of	O	O
avalanches	O	O
hit	O	O
the	O	O
area	O	O
in	O	O
2005	O	O
in	O	O
the	O	O
south	O	O
of	O	O
the	O	O
territory	O	O
.	O	O
&	O	O
gt	O	O
;	O	O
*	O	O
The	O	O
army	O	O
on	O	O
Thursday	O	O
recovered	O	O
the	O	O
bodies	O	O
of	O	O
ten	O	O
of	O	O
its	O	O
men	O	O
who	O	O
were	O	O
killed	O	O
in	O	O
an	O	O
avalanche	O	O
the	O	O
previous	O	O
day	O	O
.	O	O
&	O	O
gt	O	O
;	O	O
*	O	O
The	O	O
four	O	O
civilians	O	O
killed	O	O
included	O	O
two	O	O
children	O	O
of	O	O
a	O	O
family	O	O
whose	O	O
house	O	O
was	O	O
hit	O	O
by	O	O
a	O	O
separate	O	O
avala

the	O	O
country	O	O
.	O	O
Did	O	O
you	O	O
miss	O	O
[	O	O
this	O	O
]	O	O
(	O	O
https://www.reddit.com/r/soccer/comments/5lrdkr/ars%C3%A8ne_wenger_in_20_years_it_is_the_most_uneven/?ref=search_posts	O	O
)	O	O
yesterday	O	O
?	O	O
[	O	O
Ritchie	B-person	B-person
McClaw	I-person	O
]	O	O
(	O	O
https://imgur.com/a/xP0Ta	O	O
)	O	O
Crosspost	O	O
do	O	O
r	O	O
/	O	O
worldnews	O	O
,	O	O
https://www.reddit.com/r/worldnews/comments/5nktt8/brazilian_farm_owners_form_militias_to_attack/	O	O
Ca	O	O
serait	O	O
meme	O	O
plutot	O	O
le	O	O
contraire	O	O
,	O	O
sans	O	O
ce	O	O
feu	O	O
,	O	O
il	O	O
n	O	O
'	O	O
y	O	O
aurait	O	O
eu	O	O
probablement	O	O
pas	O	O
de	O	O
survivant	O	O
:	O	O
https://np.reddit.com/r/worldnews/comments/5lf41g/titanic_sank_due_to_enormous_uncontrollable_fire/dbvlteg/?contex=7	O	O
I	O	O
'	O	O
m	O	O
saying	O	O
this	O	O
guy	O	O
is	O	O
assuming	O	O
that	O	O
they	O	O
assumed	O	O
he	O	O
supported	O	O
Trump	B-person	B-person
because	O	O
he	O	O
'	O	O
s	O	O
white	O	O
.	O	O
Hey	O	O
u	B-person	O


cannot	O	O
apply	O	O
Kirchhoff	B-person	O
'	O	O
s	O	O
law	O	O
then	O	O
.	O	O
Looking	O	O
up	O	O
touchscreen	O	O
history	O	O
-	O	O
found	O	O
this	O	O
1981	O	O
computer	O	O
that	O	O
used	O	O
Infrared	O	O
to	O	O
detect	O	O
finger	O	O
movement	O	O
.	O	O
Clearly	O	O
was	O	O
an	O	O
inspiration	O	O
.	O	O
https://en.wikipedia.org/wiki/Touchscreen#/media/File:Platovterm1981.jpg	O	O
I	O	O
'	O	O
m	O	O
referring	O	O
to	O	O
the	O	O
ones	O	O
newly	O	O
introduced	O	O
in	O	O
-	O	O
I	O	O
ca	O	O
n	O	O
'	O	O
t	O	O
remember	O	O
if	O	O
they	O	O
were	O	O
given	O	O
a	O	O
specific	O	O
name	O	O
-	O	O
I	O	O
only	O	O
remember	O	O
them	O	O
being	O	O
referred	O	O
to	O	O
as	O	O
'	O	O
Droids	I-product	O
'	O	O
Religious	O	O
persecution	O	O
is	O	O
hardly	O	O
unique	O	O
to	O	O
communist	O	O
countries	O	O
.	O	O
It	O	O
'	O	O
s	O	O
practically	O	O
universal	O	O
.	O	O
How	O	O
would	O	O
you	O	O
classify	O	O
the	O	O
documentary	O	O
?	O	O
Which	O	O
protocol	O	O
should	O	O
I	O	O
use	O	O
for	O	O
secure	O	O
message	O	O
transfers

him	O	O
,	O	O
but	O	O
I	O	O
do	O	O
n	O	O
'	O	O
t	O	O
if	O	O
that	O	O
counts	O	O
:	O	O
-	O	O
)	O	O
But	O	O
,	O	O
I	O	O
guess	O	O
we	O	O
look	O	O
at	O	O
this	O	O
emphasizing	O	O
very	O	O
different	O	O
principles	O	O
.	O	O
I	O	O
appreciate	O	O
the	O	O
time	O	O
you	O	O
put	O	O
into	O	O
answering	O	O
my	O	O
comments	O	O
.	O	O
Looks	O	O
like	O	O
the	O	O
moderators	O	O
think	O	O
the	O	O
exchange	O	O
is	O	O
too	O	O
long	O	O
,	O	O
so	O	O
may	O	O
be	O	O
we	O	O
can	O	O
return	O	O
to	O	O
this	O	O
in	O	O
the	O	O
context	O	O
of	O	O
some	O	O
thread	O	O
.	O	O
What	O	O
'	O	O
s	O	O
meant	O	O
by	O	O
this	O	O
bill	O	O
(	O	O
)	O	O
text	O	O
?	O	O
What	O	O
was	O	O
funny	O	O
about	O	O
Hanzo	B-person	B-person
'	O	O
s	O	O
hand	O	O
in	O	O
the	O	O
film	O	O
?	O	O
I	O	O
'	O	O
m	O	O
uncomfortable	O	O
with	O	O
this	O	O
kind	O	O
of	O	O
question	O	O
.	O	O
It	O	O
'	O	O
s	O	O
asking	O	O
for	O	O
direct	O	O
help	O	O
in	O	O
cracking	O	O
someone	O	O
'	O	O
s	O	O
wifi	O	O
network	O	O
.	O	O
In	O	O
the	O	O
context	O	O
of	O	O
U	I

this	O	O
?	O	O
?	O	O
Where	O	O
was	O	O
the	O	O
planet	O	O
inhabited	O	O
by	O	O
"	O	O
"	O	O
located	O	O
relative	O	O
to	O	O
Earth	B-location	B-location
?	O	O
Why	O	O
is	O	O
the	O	O
perpendicular	O	O
part	O	O
a	O	O
scalar	O	O
in	O	O
the	O	O
electrodynamic	O	O
boundary	O	O
conditions	O	O
at	O	O
surfaces	O	O
?	O	O
Animated	O	O
TV	O	O
series	O	O
with	O	O
/	O	O
theme	O	O
;	O	O
'	O	O
'	O	O
replaced	O	O
by	O	O
robot	I-product	O
scepter	I-product	O
Is	O	O
the	O	O
story	O	O
plotline	O	O
or	O	O
arc	O	O
for	O	O
the	O	O
series	O	O
going	O	O
to	O	O
be	O	O
overhauled	O	O
with	O	O
Carrie	B-person	B-person
Fisher	I-person	I-person
'	O	O
s	O	O
death	O	O
?	O	O
What	O	O
1980	O	O
'	O	O
s	O	O
kids	O	O
book	O	O
series	O	O
about	O	O
boy	O	O
learning	O	O
magic	O	O
spells	O	O
?	O	O
Why	O	O
was	O	O
n	O	O
'	O	O
t	O	O
the	O	O
line	O	O
of	O	O
succession	O	O
for	O	O
cabinet	O	O
members	O	O
followed	O	O
after	O	O
they	O	O
died	O	O
?	O	O
can	O	O
a	O	O
nation	O	O
encompassing	O	O
a	O	O
diaspora	O	O
with	O	O
no	O	O
geog

you	O	O
mentioned	O	O
makes	O	O
sense	O	O
.	O	O
In	O	O
the	O	O
scene	O	O
where	O	O
Obi-	B-person	O
Wan	I-person	B-person
and	O	O
Anakin	B-person	O
are	O	O
stupid	O	O
enough	O	O
to	O	O
walk	O	O
backwards	O	O
into	O	O
the	O	O
elevator	O	O
,	O	O
the	O	O
could	O	O
have	O	O
just	O	O
shot	O	O
them	O	O
.	O	O
presence	O	O
in	O	O
other	O	O
countries	O	O
soil	O	O
Avoided	O	O
Crossing	O	O
in	O	O
QM	O	B-location
I	O	O
am	O	O
unable	O	O
to	O	O
get	O	O
rid	O	O
of	O	O
extra	O	O
charecters	O	O
with	O	O
comments	O	O
in	O	O
injection	O	O
payload	O	O
Why	O	O
do	O	O
n	O	O
'	O	O
t	O	O
cities	O	O
plant	O	O
more	O	O
fruit	O	O
trees	O	O
?	O	O
Help	O	O
identify	O	O
a	O	O
collection	O	O
of	O	O
short	O	O
stories	O	O
@	O	O
Bebs	B-person	O
That	O	O
'	O	O
s	O	O
possible	O	O
too	O	O
.	O	O
Why	O	O
is	O	O
imposing	O	O
vaccination	O	O
/	O	O
immunization	O	O
so	O	O
hard	O	O
to	O	O
achieve	O	O
?	O	O
Few	O	O
Questions	O	O
About	O	O
Hidden	I-product	O
Services	I-product	O
It	O	O
cannot	O	O
be	O	O
migrated	O	O
with	O	O
an	O

While	O	O
I	O	O
can	O	O
not	O	O
formally	O	O
prove	O	O
it	O	O
,	O	O
I	O	O
find	O	O
it	O	O
very	O	O
hard	O	O
to	O	O
believe	O	O
that	O	O
feeding	O	O
SHA	O	O
1	O	O
with	O	O
unpredictable	O	O
input	O	O
would	O	O
produce	O	O
predictable	O	O
output	O	O
.	O	O
What	O	O
is	O	O
the	O	O
significance	O	O
of	O	O
the	O	O
torn	O	O
off	O	O
button	O	O
scene	O	O
in	O	O
?	O	O
Higgs	O	O
boson	O	O
and	O	O
electroweak	O	O
gauge	O	O
boson	O	O
transformations	O	O
under	O	O
CP	O	O
Relative	O	O
world	O	O
carbon	O	O
footprints	O	O
by	O	O
nation	O	O
?	O	O
After	O	O
seeing	O	O
this	O	O
[	O	O
article	O	O
]	O	O
(	O	O
https://arxiv.org/abs/0704.3116	O	O
)	O	O
,	O	O
I	O	O
am	O	O
convinced	O	O
of	O	O
the	O	O
method	O	O
you	O	O
give	O	O
in	O	O
your	O	O
answer	O	O
.	O	O
However	O	O
,	O	O
I	O	O
have	O	O
n	O	O
'	O	O
t	O	O
still	O	O
been	O	O
able	O	O
to	O	O
derive	O	O
the	O	O
VEV	O	O
shown	O	O
in	O	O
the	O	O
question	O	O
.	O	O
Could	O	O
you	O	O
give	O	O
details	O	O
?	O	O
Also	O	O
,	O	O
I	O	O
'	O	O
m	O	O
not	O	O
sure	O	O
wh

urinal	O	O
porn	O	O
https://t.co/hroXOjtyem	O	O
RT	O	O
@	O	O
omsblvr	O	O
:	O	O
for	O	O
myself	O	O
,	O	O
a	O	O
heart	O	O
.	O	O
https://t.co/l3Hk7pWE5g	O	O
@	O	O
skipsy	O	O
_	O	O
l	O	O
Right	O	O
?	O	O
!	O	O
I	O	O
went	O	O
to	O	O
take	O	O
a	O	O
shower	O	O
&	O	O
amp	O	O
;	O	O
head	O	O
to	O	O
bed	O	O
.	O	O
My	O	O
phone	O	O
was	O	O
blowing	O	O
up	O	O
.	O	O
I	O	O
was	O	O
like	O	O
.	O	O
.	O	O
wth	O	O
is	O	O
happening	O	O
?	O	O
Then	O	O
booya	O	O
!	O	O
Hit	O	O
that	O	O
quota	O	O
!	O	O
Waitlist	O	O
#	O	O
3	O	O
but	O	O
doubt	O	O
I	O	O
'	O	O
ll	O	O
get	O	O
in	O	O
the	O	O
class	O	O
.	O	O
Low	O	O
key	O	O
not	O	O
even	O	O
mad	O	O
'	O	O
'	O	O
Ladies	O	O
And	O	O
Gentlemen	O	O
,	O	O
We	O	O
Bring	O	O
To	O	O
You	O	O
The	O	O
Girl	O	O
With	O	O
The	O	O
Biggest	O	O
Vagina	O	O
In	O	O
The	O	O
World	O	O
https://t.co/6JYh3Id7BW	O	O
RT	O	O
@	O	O
heyimbee	O	O
:	O	O
@	O	O
StrauberryJam	O	O
SHE	O	O
IS	O	O
DEAD	O	O
RT	O	O
@	O	O
_	O	O
yungJ	O	O
3	O	O
:	O	O
I	O	O
just	O	O
need	O	O
my	O	O
diploma	O	O
https://t.co/59y6vdu

@	O	O
Teddys	B-person	O
_	O	O
takeover	O	O
@	O	O
irinagomez	B-person	O
60	O	O
@	O	O
Chicas	O	O
_	O	O
Web	O	O
@	O	O
VirtuAss	O	O
@	O	O
sexotx	O	O
@	O	O
tias	O	O
_	O	O
twiter	O	O
@	O	O
PornHardd	O	O
@	O	O
ophelia	O	O
231013	O	O
…	O	O
@	O	O
rmathis	O	O
15	O	O
Nicole	B-person	B-person
'	O	O
s	O	O
sorbet	I-product	O
gum	I-product	O
?	O	O
@	O	O
grigiosluts	O	O
@	O	O
ArtpopRemixed	O	O
you	O	O
keep	O	O
saying	O	O
"	O	O
if	O	O
"	O	O
it	O	O
didn	O	O
'	O	O
t	O	O
face	O	O
it	O	O
ffs	O	O
RT	O	O
@	O	O
broodingbrahmin	O	O
:	O	O
As	O	O
a	O	O
veteran	O	O
,	O	O
this	O	O
fills	O	O
me	O	O
with	O	O
rage	O	O
.	O	O
Contemptuous	O	O
disregard	O	O
for	O	O
those	O	O
who	O	O
fought	O	O
and	O	O
died	O	O
honorably	O	O
serving	O	O
this	O	O
…	O	O
RT	O	O
@	O	O
romanhipaula	O	O
:	O	O
https://t.co/MzXu2uqOMC	O	O
The	O	O
River	B-location	O
Rom	I-location	B-person
,	O	O
also	O	O
in	O	O
places	O	O
known	O	O
as	O	O
the	O	O
River	B-location	O
Beam	I-location	O
,	O	O
is	O	O
a	O	O
river	O	O
in	O	O
Essex	B-location	O
which	O	O

ever	O	O
think	O	O
you	O	O
got	O	O
it	O	O
like	O	O
that	O	O
with	O	O
me	O	O
cause	O	O
the	O	O
moment	O	O
you	O	O
do	O	O
I	O	O
'	O	O
ll	O	O
show	O	O
you	O	O
that	O	O
you	O	O
ain	O	O
'	O	O
t	O	O
have	O	O
it	O	O
like	O	O
you	O	O
think	O	O
you	O	O
do	O	O
…	O	O
@	O	O
WarGit	O	O
@	O	O
northumbriana	O	O
If	O	O
Ney	B-person	B-person
had	O	O
appeared	O	O
in	O	O
uniform	O	O
he	O	O
would	O	O
have	O	O
been	O	O
degraded	O	O
.	O	O
All	O	O
his	O	O
stars	O	O
and	O	O
medals	O	O
torn	O	O
off	O	O
into	O	O
the	O	O
mud	O	O
Inspiron	I-product	O
17	I-product	O
R	I-product	O
5721	I-product	O
Core	I-product	O
i	I-product	O
5	I-product	O
-	I-product	O
3317	I-product	O
U	I-product	O
1	I-product	O
.	I-product	O
7	I-product	O
GHz	I-product	O
6	I-product	O
Gb	I-product	O
750	I-product	O
Gb	I-product	O
DVDRW	I-product	O
17	I-product	O
.	I-product	O
3	I-product	O
"	I-product	O
Laptop	I-product	O
https://t.co/SvPm1Qm5dg	O	O
https://t.co/eG2GWQ5Jv7	O	O
Get	O	O
sexy	O	O
girls	O	O
in	O	O
the	O	O
palm	O	O
of	O	O
hand	O	O


her	O	O
eyes	O	O
in	O	O
all	O	O
their	O	O
pains	O	O
,	O	O
oh	O	O
madam	B-person	O
mina	I-person	O
.	O	O
https://t.co/BJSV3u2Bo1	O	O
RT	O	O
@	O	O
meoss	O	O
31	O	O
:	O	O
Ed	B-person	B-person
'	O	O
s	O	O
-	O	O
you	O	O
are	O	O
the	O	O
one	O	O
who	O	O
makes	O	O
me	O	O
happy	O	O
Always	O	O
YOU	O	O
I	O	O
swear	O	O
after	O	O
five	O	O
years	O	O
,	O	O
you	O	O
are	O	O
mine	O	O
#	O	O
https://t…	O	O
RT	O	O
@	O	O
MarioLopezExtra	B-person	O
:	O	O
Whoa	O	O
.	O	O
.	O	O
.	O	O
Just	O	O
saw	O	O
the	O	O
new	O	O
#	O	O
!	O	O
@	O	O
WEtv	B-corporation	O
#	O	O
JessicaRabbit	B-person	O
https://t.co/15FkpMp9gH	O	O
@	O	O
Willpower	O	O
_	O	O
8	O	O
Sorry	O	O
for	O	O
the	O	O
delay	O	O
but	O	O
we	O	O
'	O	O
ve	O	O
got	O	O
your	O	O
reply	O	O
#	O	O
https://t.co/6ax9VLjnZP	O	O
Coe	O	B-person
Sat	O	O
18	O	O
th	O	O
Mar	O	O
00	O	O
:	O	O
00	O	O
:	O	O
The	O	O
#	O	O
Weather	O	O
in	O	O
#	O	O
Glasgow	B-location	O
is	O	O
currently	O	O
Partly	O	O
cloudy	O	O
and	O	O
9	O	O
C	O	O
|	O	O
Max	O	O
:	O	O
11	O	O
C	O	O
Min	O	O
:	O	O
9	O	O
C	

https://t.co/pmwJadrya7	O	O
https://t.co/kkJ1EVpuNf	O	O
https://t.co/DU6uOgMqVO	O	O
i	O	O
can	O	O
smell	O	O
money	O	O
already	O	O
https://t.co/Owbgx5peCE	O	O
Now	O	O
Playing	O	O
:	O	O
-	O	O
Valeti	B-person	O
Bhabiyan	I-person	O
https://t.co/Rplegi07vb	O	O
RT	O	O
@	O	O
AltYelloNatPark	B-location	O
:	O	O
https://t.co/To1QcXH8VD	O	O
RT	O	O
@	O	O
Genius	O	O
:	O	O
and	O	O
producers	O	O
can	O	O
now	O	O
get	O	O
paid	O	O
for	O	O
their	O	O
mixes	I-product	O
🙏	O	O
https://t.co/ovR779rVVr	O	O
https://t.co/ViWUQUYRCv	O	O
#	O	O
#	O	O
by	O	O
#	O	O
Kaye	B-person	O
_	O	O
Menner	B-person	O
#	O	O
photography	O	O
quality	O	O
prints	O	O
cards	O	O
and	O	O
more	O	O
at	O	O
:	O	O
https://t.co/1gjXZd9Mg7	O	O
@	O	O
JacobRhines	B-person	O
that	O	O
or	O	O
shit	O	O
on	O	O
them	O	O
lol	O	O
RT	O	O
@	O	O
JacobWhitesides	B-person	O
:	O	O
i	O	O
would	O	O
as	O	O
well	O	O
https://t.co/1RcEAlxWm5	O	O
RT	O	O
@	O	O
marIboros	B-person	O
:	O	O
how	O	O
do	O	O
you	O	O
uninstall	O	O
school	O	O
Could	O	O
someone	O	O
explain	O	O
t

### MTA17 Eval set

In [358]:
#for i,x in enumerate(test_conll): 
#    print(x + '\t'+mta_tags_only[i])
#    test_conll[i] = x + '\t'+mta_tags_only[i]
print(test_conll)

['&\tO\tO', 'gt\tO\tO', ';\tO\tO', '*\tO\tO', 'The\tO\tO', 'soldier\tO\tO', 'was\tO\tO', 'killed\tO\tO', 'when\tO\tO', 'another\tO\tO', 'avalanche\tO\tO', 'hit\tO\tO', 'an\tO\tO', 'army\tO\tO', 'barracks\tO\tO', 'in\tO\tO', 'the\tO\tO', 'northern\tO\tO', 'area\tO\tO', 'of\tO\tO', 'Sonmarg\tB-location\tO', ',\tO\tO', 'said\tO\tO', 'a\tO\tO', 'military\tO\tO', 'spokesman\tO\tO', '.\tO\tO', '&\tO\tO', 'gt\tO\tO', ';\tO\tO', '*\tO\tO', 'Police\tO\tO', 'last\tO\tO', 'week\tO\tO', 'evacuated\tO\tO', '80\tO\tO', 'villagers\tO\tO', 'from\tO\tO', 'Waltengoo\tB-location\tO', 'Nar\tI-location\tO', 'where\tO\tO', 'dozens\tO\tO', 'were\tO\tO', 'killed\tO\tO', 'after\tO\tO', 'a\tO\tO', 'series\tO\tO', 'of\tO\tO', 'avalanches\tO\tO', 'hit\tO\tO', 'the\tO\tO', 'area\tO\tO', 'in\tO\tO', '2005\tO\tO', 'in\tO\tO', 'the\tO\tO', 'south\tO\tO', 'of\tO\tO', 'the\tO\tO', 'territory\tO\tO', '.\tO\tO', '&\tO\tO', 'gt\tO\tO', ';\tO\tO', '*\tO\tO', 'The\tO\tO', 'army\tO\tO', 'on\tO\tO', 'Thursday\tO\tO', 'recover

In [359]:
def wnut_eval(inp):
    # get tokens and entities
    lines = [line for line in inp]
    #print(lines)
    tokens = doc_to_toks(lines)
    entities = doc_to_entities(lines)
    # report results
    print("### ENTITY F1-SCORES ###")
    for line in fmt_results(tokens, entities, surface_form=False):
        print(line)
    print()
    print("### SURFACE FORM F1-SCORES ###")
    for line in fmt_results(tokens, entities, surface_form=True):
        print(line)

### Evaluating Results of MITIEWNUT17 

In [312]:
wnut_eval(eval_set)

Invalid tag sequence: Entity(words=('Smith',), sent_id=0, word_id_start=871, word_id_stop=872, tag='person') => Token(sent_id=0, word_id=872, word='Tower', bio='I', tag='location')
Invalid tag sequence: Entity(words=('Europe',), sent_id=0, word_id_start=1056, word_id_stop=1057, tag='location') => Token(sent_id=0, word_id=1057, word='Cheney', bio='I', tag='person')
Invalid tag sequence: Entity(words=('North',), sent_id=0, word_id_start=1987, word_id_stop=1988, tag='O') => Token(sent_id=0, word_id=1988, word='Korea', bio='I', tag='location')
Invalid tag sequence: Entity(words=('chocolate',), sent_id=0, word_id_start=6245, word_id_stop=6246, tag='O') => Token(sent_id=0, word_id=6246, word='frog', bio='I', tag='person')
Invalid tag sequence: Entity(words=('Minas',), sent_id=0, word_id_start=7888, word_id_stop=7889, tag='O') => Token(sent_id=0, word_id=7889, word='Tirith', bio='I', tag='person')
Invalid tag sequence: Entity(words=('kill',), sent_id=0, word_id_start=8246, word_id_stop=8247, 

### ENTITY F1-SCORES ###
processed 22672 tokens with 702 phrases; 
found: 306 phrases; correct: 183.

accuracy:  96.43%; 
precision:  59.80%; 
recall:  26.07%; 
FB1:  36.31

      corporation: 
precision: 100.00%; 
recall:   1.52%; 
FB1:   2.99  1

         location: 
precision:  66.67%; 
recall:  20.00%; 
FB1:  30.77  30

           person: 
precision:  58.46%; 
recall:  35.43%; 
FB1:  44.12  152

          product: 
precision:   0.00%; 
recall:   0.00%; 
FB1:   0.00  0


### SURFACE FORM F1-SCORES ###
processed 22672 tokens with 613 phrases; 
found: 248 phrases; correct: 138.

accuracy:  96.43%; 
precision:  55.65%; 
recall:  22.51%; 
FB1:  32.06

      corporation: 
precision: 100.00%; 
recall:   1.67%; 
FB1:   3.28  1

         location: 
precision:  62.50%; 
recall:  16.00%; 
FB1:  25.48  20

           person: 
precision:  54.42%; 
recall:  31.12%; 
FB1:  39.59  117

          product: 
precision:   0.00%; 
recall:   0.00%; 
FB1:   0.00  0



### Evaluating Results of MITIEWNUT17 

In [362]:
wnut_eval(test_conll)

### ENTITY F1-SCORES ###
processed 23394 tokens with 1079 phrases; 
found: 617 phrases; correct: 355.

accuracy:  94.18%; 
precision:  57.54%; 
recall:  32.90%; 
FB1:  41.86

      corporation: 
precision:  31.91%; 
recall:  22.73%; 
FB1:  26.55  15

    creative-work: 
precision:  36.67%; 
recall:   7.75%; 
FB1:  12.79  11

            group: 
precision:  41.79%; 
recall:  16.97%; 
FB1:  24.14  28

         location: 
precision:  56.92%; 
recall:  49.33%; 
FB1:  52.86  74

           person: 
precision:  70.72%; 
recall:  50.12%; 
FB1:  58.66  215

          product: 
precision:  30.77%; 
recall:   9.45%; 
FB1:  14.46  12


### SURFACE FORM F1-SCORES ###
processed 23394 tokens with 955 phrases; 
found: 531 phrases; correct: 299.

accuracy:  94.18%; 
precision:  56.31%; 
recall:  31.31%; 
FB1:  40.24

      corporation: 
precision:  36.11%; 
recall:  21.67%; 
FB1:  27.08  13

    creative-work: 
precision:  35.71%; 
recall:   7.35%; 
FB1:  12.20  10

            group: 
precision:  39.