In [53]:
import sys
import re
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
import nltk.data
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import pickle
from collections import Iterable

from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
import ner
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from sklearn import metrics
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [84]:
%load_ext autoreload
%autoreload 2
from unwiki import unwiki

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
# The results for the search for definition (currently just Wikipedia)
with open('data/wiki_definitions_improved.txt', 'r') as wiki_f:
    wiki = wiki_f.readlines()

In [80]:
# Get data and train the Sentence tokenizer
# Uses a standard algorithm (Kiss-Strunk) for unsupervised sentence boundary detection
text = ''
for i in range(550):
    text += unwiki.loads(eval(wiki[i].split('-#-%-')[2]))

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())
print(tokenizer._params.abbrev_types)

{'u.n', 'mixture', 'sow', 'r.a', 'juniper', 'missions', 'eng', 'wings', 'jie', 'z-1', 'ca', 'u.s', 'j.w', 's^2', 'pl', 'hk', 'cf', 'az', 'e.g', 'al', 'ton', "'is", 'vibrations', 'neighbourhood', 'spacewalks', 'dr', 'a<sub>2</sub>', '2π', 'x+2', 'p.h.d', 'e.a', 'etc', 'jr', 'i.e', 'ex', 'ginebra', 'st'}


In [86]:
# The cleaning up of the wiki markup so that it looks like normal written english
title, section, defin = wiki[550].split('-#-%-')
dclean = unwiki.loads(eval(defin))
print(title)
print(dclean)
defin

Linear classifier  
thumb|right|In this case, the solid and empty dots can be correctly classified by any number of linear classifiers. H1 (blue) classifies them correctly, as does H2 (red). H2 could be considered "better" in the sense that it is also furthest from both groups.
H3 (green) fails to correctly classify the dots.

If the input feature vector to the classifier is a real vector _inline_math_, then the output score is

_display_math_

where _inline_math_ is a real vector of weights and f is a function that converts the dot product of the two vectors into the desired output. (In other words, _inline_math_ is a one-form or linear functional mapping _inline_math_ onto R.) The weight vector _inline_math_ is learned from a set of labeled training samples. Often f is a threshold function, which maps all values of _inline_math_ above a certain threshold to the first class and all other values to the second class; e.g., 

_display_math_

A more complex f might give the probability th

'  \'[[Image:Svm separating hyperplanes.png|thumb|right|In this case, the solid and empty dots can be correctly classified by any number of linear classifiers. H1 (blue) classifies them correctly, as does H2 (red). H2 could be considered "better" in the sense that it is also furthest from both groups.\\nH3 (green) fails to correctly classify the dots.]]\\n\\nIf the input feature vector to the classifier is a [[real number|real]] vector <math>\\\\vec x</math>, then the output score is\\n\\n:<math>y = f(\\\\vec{w}\\\\cdot\\\\vec{x}) = f\\\\left(\\\\sum_j w_j x_j\\\\right),</math>\\n\\nwhere <math>\\\\vec w</math> is a real vector of weights and \\\'\\\'f\\\'\\\' is a function that converts the [[dot product]] of the two vectors into the desired output. (In other words, <math>\\\\vec{w}</math> is a [[one-form]] or [[linear functional]] mapping <math>\\\\vec x</math> onto \\\'\\\'\\\'R\\\'\\\'\\\'.) The weight vector <math>\\\\vec w</math> is learned from a set of labeled training samples.

In [60]:
# Get the data and POS and NER tags for each definition (LONG TIME)
def_lst = []
for i in range(len(wiki)):
    try:
        title, section, defin_raw = wiki[i].split('-#-%-')
        defin_all = unwiki.loads(eval(defin_raw))
        for d in tokenizer.tokenize(defin_all):
            if title.lower().strip() in d.lower():
                pos_tokens = pos_tag(word_tokenize(d))
                def_ner = ner.bio_tag.bio_tagger(title.strip().split(), pos_tokens)
                other_ner = [((d[0],d[1]),d[2]) for d in def_ner]
                tmp_dict = {'title': title,
                           'section': section,
                           'defin': d,
                           'ner': other_ner}
                def_lst.append(tmp_dict)
    except ValueError:
        print('parsing error')

In [61]:
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
 
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
 
        # Transform the result from [((w1, t1), iob1), ...] 
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
 
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)       
        
random.shuffle(def_lst)
training_samples = [d['ner'] for d in def_lst[:int(len(def_lst) * 0.9)]]
test_samples = [d['ner'] for d in def_lst[int(len(def_lst) * 0.9):]]
 
print("#training samples = %s" % len(training_samples) )   # training samples = 55809
print("#test samples = %s" % len(test_samples))            # test samples = 6201
 

#training samples = 12709
#test samples = 1413


In [62]:
#train the NER Chunking Classifier (TAKES A LONG TIME)
%time chunker = NamedEntityChunker(random.sample(training_samples, len(training_samples)))

CPU times: user 1min 32s, sys: 117 ms, total: 1min 32s
Wall time: 1min 32s


In [63]:
# Evaluate the most common metrics on the test dataset
unpack = lambda l: [(tok, pos, ner) for ((tok, pos), ner) in l]
Tree_lst = [conlltags2tree(unpack(t)) for t in test_samples]
print(chunker.evaluate(Tree_lst))

ChunkParse score:
    IOB Accuracy:  89.7%%
    Precision:     29.1%%
    Recall:        71.5%%
    F-Measure:     41.3%%


In [65]:
# An example of a user fed definition
print(chunker.parse(pos_tag(word_tokenize("We define a Banach space as a complete vector space."))))

(S
  We/PRP
  define/VBP
  a/DT
  (DFNDUM Banach/NNP space/NN)
  as/IN
  a/DT
  (DFNDUM complete/JJ vector/NN space/NN)
  ./.)


In [66]:
def prepare_for_metrics(int_range, chunker_fn, data_set = test_samples, print_output=False):
    '''
    `int_range` is an integer range
    NEEDS A TEST_SAMPLES VARIABLE CREATED WHEN SPLITTING THE 
    TRAINING AND TESTING DATA
    Returns two vectors ready to be used in the 
    metrics classification function
    '''
    if isinstance(int_range, int):
        int_range = [int_range]
    y_true = []
    y_pred = []
    for i in int_range:
        sample = data_set[i]
        sm = [s[0] for s in sample]
        y_true_tmp = [s[1] for s in sample]
        predicted = [v[2] for v in tree2conlltags(chunker_fn.parse(sm))]
        y_true += y_true_tmp
        y_pred += predicted
        if print_output:
            for k,s in enumerate(sm):
                print('{:15} {:>10}  {:>10}'.format(s[0], y_true_tmp[k], predicted[k]))
    return y_true, y_pred

In [78]:
OO = prepare_for_metrics(9789, chunker, data_set=training_samples, print_output=True)

There                    O           O
is                       O           O
a                        O           O
universal                O    B-DFNDUM
Mennicke          B-DFNDUM    I-DFNDUM
symbol            I-DFNDUM    I-DFNDUM
with                     O           O
values                   O           O
in                       O           O
a                        O           O
group                    O           O
C                        O           O
<                        O           O
sub                      O           O
>                        O           O
q                        O           O
<                        O           O
/sub                     O           O
>                        O           O
such                     O           O
that                     O           O
any                      O           O
Mennicke          B-DFNDUM    B-DFNDUM
symbol            I-DFNDUM    I-DFNDUM
with                     O           O
values                   

In [79]:
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))

              precision    recall  f1-score   support

    B-DFNDUM       0.33      0.80      0.46      1296
    I-DFNDUM       0.26      0.83      0.39      1051
           O       0.99      0.90      0.94     46449

   micro avg       0.90      0.90      0.90     48796
   macro avg       0.52      0.84      0.60     48796
weighted avg       0.96      0.90      0.92     48796



{'vibrations', 'hk', 'digits', 'pl', 'wings', 'e.g', 'etc', 'x+2', 'ginebra', 'u.s', 'cover', 'i.e', 'dr', 'a<sub>2</sub>', 'missions', 'juniper', 'applications', 'r.a', 'eng', 'contaminants', 'estimated', "'is", 'al'}


In [9]:
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
 
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('[START3]', '[START3]'),('[START2]', '[START2]'), ('[START1]', '[START1]')] +\
    list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]'), ('[END3]', '[END3]')]
    history = ['[START3]', '[START2]', '[START1]'] + list(history)
 
    # shift the index with 3, to accommodate the padding
    index += 3
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    prev3word, prev3pos = tokens[index - 3]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    next3word, next3pos = tokens[index + 3]
    previob = history[index - 1]
    prevpreviob = history[index - 2]
    prev3iob = history[index - 3]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all([True for c in word if c in string.ascii_lowercase])
 
    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase
 
    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
 
    nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = prevword[0] in string.ascii_uppercase
    
    is_math = lambda w:(w == '_inline_math_') or (w == '_display_math_')
    ismath = is_math(word)
    isprevmath = is_math(prevword)
    isprevprevmath = is_math(prevprevword)
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
                'pos': pos,
        'all-ascii': allascii,
 
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'next3word': next3word,
        'next3pos': next3pos,
        
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
 
        'prev3word': prev3word,
        'prev3pos': prev3pos,
        
        'prev-iob': previob,
        
        'prev-prev-iob': prevpreviob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
 
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
        
        'ismath': ismath,
        'isprevmath': isprevmath,
        'isprevprevmath': isprevprevmath,
    }
 