In [31]:
import sys
import re
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
import nltk.data
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import pickle
from collections import Iterable

from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
import ner
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from sklearn import metrics
import random
from sklearn.model_selection import train_test_split

In [2]:
%load_ext autoreload
%autoreload 2
from unwiki import unwiki

In [4]:
with open('../wiki_definitions_improved.txt', 'r') as wiki_f:
    wiki = wiki_f.readlines()

In [5]:
# Get data and train the Sentence tokenizer
text = ''
for i in range(550):
    text += unwiki.loads(eval(wiki[i].split('-#-%-')[2]))

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

In [6]:
title, section, defin = wiki[450].split('-#-%-')
dclean = unwiki.loads(eval(defin))
print(title)
print(dclean)
tokenizer.tokenize(dclean)    

Predation  


At the most basic level, predators kill and eat other organisms. However, the concept of predation is broad, defined differently in different contexts, and includes a wide variety of feeding methods; and some relationships that result in the prey's death are not generally called predation. A parasitoid, such as an ichneumon wasp, lays its eggs in or on its host; the eggs hatch into larvae, which eat the host, and it inevitably dies. Zoologists generally call this a form of parasitism, though conventionally parasites are thought not to kill their hosts. A predator can be defined to differ from a parasitoid in two ways: it kills its prey immediately; and it has many prey, captured over its lifetime, where a parasitoid's larva has just one, or at least has its food supply provisioned for it on just one occasion. 



There are other difficult and borderline cases. Micropredators are small animals that, like predators, feed entirely on other organisms; they include fleas and m

['\n\nAt the most basic level, predators kill and eat other organisms.',
 "However, the concept of predation is broad, defined differently in different contexts, and includes a wide variety of feeding methods; and some relationships that result in the prey's death are not generally called predation.",
 'A parasitoid, such as an ichneumon wasp, lays its eggs in or on its host; the eggs hatch into larvae, which eat the host, and it inevitably dies.',
 'Zoologists generally call this a form of parasitism, though conventionally parasites are thought not to kill their hosts.',
 "A predator can be defined to differ from a parasitoid in two ways: it kills its prey immediately; and it has many prey, captured over its lifetime, where a parasitoid's larva has just one, or at least has its food supply provisioned for it on just one occasion.",
 'There are other difficult and borderline cases.',
 'Micropredators are small animals that, like predators, feed entirely on other organisms; they include

In [39]:
# Get the data and POS and NER tags for each definition (LONG TIME)
def_lst = []
for i in range(len(wiki)):
    try:
        title, section, defin_raw = wiki[i].split('-#-%-')
        defin_all = unwiki.loads(eval(defin_raw))
        for d in tokenizer.tokenize(defin_all):
            if title.lower().strip() in d.lower():
                pos_tokens = pos_tag(word_tokenize(d))
                def_ner = ner.bio_tag.bio_tagger(title.strip().split(), pos_tokens)
                other_ner = [((d[0],d[1]),d[2]) for d in def_ner]
                tmp_dict = {'title': title,
                           'section': section,
                           'defin': d,
                           'ner': other_ner}
                def_lst.append(tmp_dict)
    except ValueError:
        print('parsing error')

In [40]:
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
 
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
 
        # Transform the result from [((w1, t1), iob1), ...] 
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
 
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)       
        
random.shuffle(def_lst)
training_samples = [d['ner'] for d in def_lst[:int(len(def_lst) * 0.9)]]
test_samples = [d['ner'] for d in def_lst[int(len(def_lst) * 0.9):]]
 
print("#training samples = %s" % len(training_samples) )   # training samples = 55809
print("#test samples = %s" % len(test_samples))            # test samples = 6201
 

#training samples = 12709
#test samples = 1413


In [41]:
#train the NER Chunking Classifier (TAKES A LONG TIME)
%time chunker = NamedEntityChunker(random.sample(training_samples, len(training_samples)))

CPU times: user 2min 16s, sys: 1.39 s, total: 2min 18s
Wall time: 2min 19s


In [14]:
# An example of a user fed definition
print(chunker.parse(pos_tag(word_tokenize("We define a Banach space as a complete vector space."))))

(S
  We/PRP
  define/VBP
  a/DT
  (DFNDUM Banach/NNP space/NN)
  as/IN
  a/DT
  (DFNDUM complete/JJ vector/NN space/NN)
  ./.)


In [42]:
def prepare_for_metrics(int_range, chunker_fn, data_set = test_samples, print_output=False):
    '''
    `int_range` is an integer range
    NEEDS A TEST_SAMPLES VARIABLE CREATED WHEN SPLITTING THE 
    TRAINING AND TESTING DATA
    Returns two vectors ready to be used in the 
    metrics classification function
    '''
    if isinstance(int_range, int):
        int_range = [int_range]
    y_true = []
    y_pred = []
    for i in int_range:
        sample = data_set[i]
        sm = [s[0] for s in sample]
        y_true_tmp = [s[1] for s in sample]
        predicted = [v[2] for v in tree2conlltags(chunker_fn.parse(sm))]
        y_true += y_true_tmp
        y_pred += predicted
        if print_output:
            for k,s in enumerate(sm):
                print('{:15} {:>10}  {:>10}'.format(s[0], y_true_tmp[k], predicted[k]))
    return y_true, y_pred

In [29]:
OO = prepare_for_metrics(10584, chunker, data_set=training_samples, print_output=True)

It                       O           O
is                       O           O
convenient               O           O
to                       O           O
introduce                O           O
the                      O           O
Y-local                  O    B-DFNDUM
maps                     O    I-DFNDUM
F                        O           O
<                        O           O
sub                      O           O
>                        O           O
i                        O           O
<                        O           O
/sub                     O           O
>                        O           O
constructed              O           O
from                     O           O
the                      O           O
vertex                   O           O
functions                O           O
by                       O           O
_display_math_           O           O
The                      O           O
word                     O           O
w                        

In [43]:
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))

              precision    recall  f1-score   support

    B-DFNDUM       0.35      0.75      0.48      1341
    I-DFNDUM       0.32      0.80      0.46      1129
           O       0.99      0.92      0.95     45420

   micro avg       0.91      0.91      0.91     47890
   macro avg       0.55      0.82      0.63     47890
weighted avg       0.96      0.91      0.93     47890



In [24]:
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))

              precision    recall  f1-score   support

    B-DFNDUM       0.30      0.74      0.43      1329
    I-DFNDUM       0.29      0.76      0.42      1286
           O       0.99      0.90      0.94     46098

   micro avg       0.89      0.89      0.89     48713
   macro avg       0.53      0.80      0.60     48713
weighted avg       0.95      0.89      0.91     48713



In [34]:
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker9K)
print(metrics.classification_report(y_true, predicted))

              precision    recall  f1-score   support

    B-DFNDUM       0.30      0.73      0.43      1263
    I-DFNDUM       0.28      0.75      0.41      1233
           O       0.99      0.90      0.94     44197

   micro avg       0.89      0.89      0.89     46693
   macro avg       0.52      0.80      0.59     46693
weighted avg       0.95      0.89      0.91     46693



In [18]:
print(tokenizer._params.abbrev_types)

{'vibrations', 'hk', 'digits', 'pl', 'wings', 'e.g', 'etc', 'x+2', 'ginebra', 'u.s', 'cover', 'i.e', 'dr', 'a<sub>2</sub>', 'missions', 'juniper', 'applications', 'r.a', 'eng', 'contaminants', 'estimated', "'is", 'al'}


In [25]:
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
 
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('[START3]', '[START3]'),('[START2]', '[START2]'), ('[START1]', '[START1]')] +\
    list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]'), ('[END3]', '[END3]')]
    history = ['[START3]', '[START2]', '[START1]'] + list(history)
 
    # shift the index with 3, to accommodate the padding
    index += 3
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    prev3word, prev3pos = tokens[index - 3]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    next3word, next3pos = tokens[index + 3]
    previob = history[index - 1]
    prevpreviob = history[index - 2]
    prev3iob = history[index - 3]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all([True for c in word if c in string.ascii_lowercase])
 
    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase
 
    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
 
    nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = prevword[0] in string.ascii_uppercase
    
    is_math = lambda w:(w == '_inline_math_') or (w == '_display_math_')
    ismath = is_math(word)
    isprevmath = is_math(prevword)
    isprevprevmath = is_math(prevprevword)
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
                'pos': pos,
        'all-ascii': allascii,
 
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'next3word': next3word,
        'next3pos': next3pos,
        
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
 
        'prev3word': prev3word,
        'prev3pos': prev3pos,
        
        'prev-iob': previob,
        
        'prev-prev-iob': prevpreviob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
 
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
        
        'ismath': ismath,
        'isprevmath': isprevmath,
        'isprevprevmath': isprevprevmath,
    }
 