In [1]:
import sys
import re
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk
import nltk.data
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import pickle
from collections import Iterable

from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
import ner
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from sklearn import metrics
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

  from collections import Iterable


In [2]:
%load_ext autoreload
%autoreload 2
from unwiki import unwiki
from ner.chunker import NamedEntityChunker

In [3]:
# The results for the search for definition (currently just Wikipedia).
# Each line is one record of the form  title -#-%- section -#-%- repr(raw wiki markup).
# The encoding is passed explicitly because the corpus contains non-ASCII text
# (e.g. Greek words), and the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the dump is UTF-8 encoded -- confirm against the file.
with open('/media/hd1/wikipedia/wiki_definitions_improved.txt', 'r', encoding='utf-8') as wiki_f:
    wiki = wiki_f.readlines()

In [4]:
# Get data and train the sentence tokenizer.
# Uses a standard algorithm (Kiss-Strunk) for unsupervised sentence boundary detection.

# Number of leading corpus records whose cleaned text is used to train Punkt.
N_PUNKT_TRAINING_DEFS = 550

# Slicing (instead of indexing range(550)) keeps this cell working on a corpus
# with fewer records, and str.join avoids rebuilding the string on every step.
# NOTE(review): eval() runs whatever expression is stored in the third field of
# each record. That is acceptable only while the corpus file is trusted; for
# untrusted data switch to ast.literal_eval on the stripped field.
text = ''.join(
    unwiki.loads(eval(record.split('-#-%-')[2]))
    for record in wiki[:N_PUNKT_TRAINING_DEFS]
)

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())
# Show the abbreviations the trainer learned (sanity check of the training text).
print(tokenizer._params.abbrev_types)

{'mixture', "'is", 'u.n', 'pl', 'spacewalks', 'sow', 'ca', 'ton', 'neighbourhood', 'p.h.d', 'jie', 'eng', 'missions', 's^2', 'j.w', 'r.a', 'ex', 'ginebra', 'jr', 'x+2', 'u.s', 'cf', 'az', 'juniper', 'hk', '2π', 'al', 'etc', 'wings', 'z-1', 'e.g', 'vibrations', 'dr', 'i.e'}


In [5]:
# Demonstrates the wiki-markup clean-up on a single record: the title and the
# cleaned text are printed, and the untouched raw field is shown as the cell
# value so both forms can be compared.
title, section, defin = wiki[850].split('-#-%-')
dclean = unwiki.loads(eval(defin))
print(title, dclean, sep='\n')
defin

Indicator function  
The indicator function of a subset A of a set X is a function

_display_math_

defined as

_display_math_

The Iverson bracket allows the equivalent notation, _inline_math_, to be used instead of _inline_math_.

The function _inline_math_ is sometimes denoted _inline_math_, _inline_math_, _inline_math_ or even just _inline_math_. (The Greek letter _inline_math_ appears because it is the initial letter of the Greek word χαρακτήρ, which is the ultimate origin of the word characteristic.)

The set of all indicator functions on _inline_math_ can be identified with _inline_math_, the power set of _inline_math_.  Consequently, both sets are sometimes denoted by _inline_math_. This is a special case (_inline_math_) of the notation _inline_math_ for the set of all functions _inline_math_.


'  "The indicator function of a subset \'\'A\'\' of a set \'\'X\'\' is a function\\n\\n:<math>\\\\mathbf{1}_A \\\\colon X \\\\to \\\\{ 0,1 \\\\} </math>\\n\\ndefined as\\n\\n:<math>\\\\mathbf{1}_A(x) :=\\n\\\\begin{cases}\\n1 &\\\\text{if } x \\\\in A, \\\\\\\\\\n0 &\\\\text{if } x \\\\notin A.\\n\\\\end{cases}\\n</math>\\n\\nThe [[Iverson bracket]] allows the equivalent notation, <math>[x\\\\in A]</math>, to be used instead of <math>\\\\mathbf{1}_A(x)</math>.\\n\\nThe function <math>\\\\mathbf{1}_A</math> is sometimes denoted <math>I_A</math>, <math>\\\\chi_A</math>, \'\'K<sub>A</sub>\'\' or even just <math>A</math>. (The [[Greek alphabet|Greek letter]] <math>\\\\chi</math> appears because it is the initial letter of the Greek word χαρακτήρ, which is the ultimate origin of the word \'\'characteristic\'\'.)\\n\\nThe set of all indicator functions on <math>X</math> can be identified with <math>\\\\mathcal{P}(X)</math>, the [[power set]] of <math>X</math>.  Consequently, both sets are so

In [6]:
# Get the data and POS and NER tags for each definition (LONG TIME).
# For every corpus record, each sentence that mentions the record title is
# POS-tagged and BIO-tagged, then stored as a dict in def_lst.
def_lst = []
for i, record in enumerate(wiki):
    try:
        # Raises ValueError when the record does not have exactly three fields.
        title, section, defin_raw = record.split('-#-%-')
        # NOTE(review): eval() runs whatever expression is stored in the record.
        # Acceptable only while the corpus file is trusted.
        defin_all = unwiki.loads(eval(defin_raw))
        # The lowered title does not change between sentences; compute it once.
        title_key = title.lower().strip()
        for sentence in tokenizer.tokenize(defin_all):
            if title_key not in sentence.lower():
                continue
            pos_tokens = pos_tag(word_tokenize(sentence))
            def_ner = ner.bio_tag.bio_tagger(title.strip().split(), pos_tokens)
            # Regroup (token, pos, tag) as ((token, pos), tag).
            other_ner = [((item[0], item[1]), item[2]) for item in def_ner]
            def_lst.append({'title': title,
                            'section': section,
                            'defin': sentence,
                            'ner': other_ner})
    except ValueError:
        # Best effort: skip the record, but say which one so it can be inspected.
        print('parsing error (record %d)' % i)

In [7]:
# Inspect the first collected record: title, section, the matching sentence
# ('defin') and its ((token, pos), tag) pairs ('ner').
def_lst[0]

{'title': 'Arithmetic mean  ',
 'section': '  Definition  ',
 'defin': 'The arithmetic mean (or mean or average), _inline_math_ (read _inline_math_ bar), is the mean of the _inline_math_ values _inline_math_.',
 'ner': [(('The', 'DT'), 'O'),
  (('arithmetic', 'JJ'), 'B-DFNDUM'),
  (('mean', 'NN'), 'I-DFNDUM'),
  (('(', '('), 'O'),
  (('or', 'CC'), 'O'),
  (('mean', 'VB'), 'O'),
  (('or', 'CC'), 'O'),
  (('average', 'VB'), 'O'),
  ((')', ')'), 'O'),
  ((',', ','), 'O'),
  (('_inline_math_', 'NNP'), 'O'),
  (('(', '('), 'O'),
  (('read', 'VB'), 'O'),
  (('_inline_math_', 'NNP'), 'O'),
  (('bar', 'NN'), 'O'),
  ((')', ')'), 'O'),
  ((',', ','), 'O'),
  (('is', 'VBZ'), 'O'),
  (('the', 'DT'), 'O'),
  (('mean', 'NN'), 'O'),
  (('of', 'IN'), 'O'),
  (('the', 'DT'), 'O'),
  (('_inline_math_', 'NN'), 'O'),
  (('values', 'VBZ'), 'O'),
  (('_inline_math_', 'NNP'), 'O'),
  (('.', '.'), 'O')]}

In [10]:
# Shuffle the records and split them into training and test samples.
# A dedicated, seeded generator makes the split (and every score computed from
# it) identical on each Restart & Run All, without touching the global random
# state. def_lst is still shuffled in place, so re-running only this cell
# reshuffles an already shuffled list.
SPLIT_SEED = 0
TRAIN_FRACTION = 0.9

random.Random(SPLIT_SEED).shuffle(def_lst)
split_at = int(len(def_lst) * TRAIN_FRACTION)  # first index of the test part
training_samples = [d['ner'] for d in def_lst[:split_at]]
test_samples = [d['ner'] for d in def_lst[split_at:]]

print("#training samples = %s" % len(training_samples))
print("#test samples = %s" % len(test_samples))

#training samples = 12602
#test samples = 1401


In [11]:
# Train the NER chunking classifier on the training split.
# %time reports the cost (about 15 s wall time in the run recorded below).
%time chunker = NamedEntityChunker(training_samples)

CPU times: user 15.3 s, sys: 151 ms, total: 15.4 s
Wall time: 15.4 s


In [12]:
# Score the trained chunker on the held-out samples with the standard
# chunking metrics (IOB accuracy, precision, recall, F-measure).
def unpack(sample):
    """Flatten ((token, pos), tag) pairs into (token, pos, tag) triples."""
    return [(word, pos_label, iob) for ((word, pos_label), iob) in sample]


Tree_lst = list(map(conlltags2tree, map(unpack, test_samples)))
print(chunker.evaluate(Tree_lst))

ChunkParse score:
    IOB Accuracy:  91.3%%
    Precision:     32.1%%
    Recall:        68.3%%
    F-Measure:     43.7%%


### Other Scores

Training on a fraction of the dataset and evaluating on the remainder:
* With 80% of the dataset

* 60% of the data

```
ChunkParse score:
    IOB Accuracy:  91.0%%
    Precision:     30.7%%
    Recall:        63.9%%
    F-Measure:     41.5%%
```

```
ChunkParse score:
    IOB Accuracy:  90.6%%
    Precision:     32.4%%
    Recall:        68.7%%
    F-Measure:     44.0%%
```

* 90% of the data

```
ChunkParse score:
    IOB Accuracy:  91.2%%
    Precision:     32.0%%
    Recall:        68.0%%
    F-Measure:     43.5%%
```
    


In [13]:
# An example of a user-fed definition: the sentence is tokenized, POS-tagged
# and passed to the trained chunker, and the resulting chunk tree is printed.
print(chunker.parse(pos_tag(word_tokenize("We define a Banach space as a complete vector space."))))

(S
  We/PRP
  define/VBP
  a/DT
  (DFNDUM Banach/NNP space/NN)
  as/IN
  a/DT
  complete/JJ
  vector/NN
  space/NN
  ./.)


In [14]:
def prepare_for_metrics(int_range, chunker_fn, data_set=None, print_output=False):
    '''
    Collect the true and the predicted IOB tags for some samples of `data_set`.

    `int_range` is a single integer index or an iterable of integer indices
    into `data_set`.
    `chunker_fn` is the trained chunker; its `parse` method is called with the
    list of (token, pos) pairs of each selected sample.
    `data_set` is a sequence of samples, each a list of ((token, pos), tag)
    pairs. When omitted, the global `test_samples` (created when splitting the
    training and testing data) is looked up at call time, so the function
    always sees the current split and can be defined before the split exists.
    `print_output` prints one row per token: token, true tag, predicted tag.

    Returns two flat lists (y_true, y_pred) ready to be used in the
    metrics classification function.
    '''
    if data_set is None:
        # Resolved here rather than in the signature: a default argument is
        # evaluated once, at definition time, and would go stale on a re-split.
        data_set = test_samples
    if isinstance(int_range, int):
        int_range = [int_range]
    y_true = []
    y_pred = []
    for idx in int_range:
        sample = data_set[idx]
        tagged_tokens = [entry[0] for entry in sample]
        gold_tags = [entry[1] for entry in sample]
        # tree2conlltags yields (token, pos, tag) triples; keep only the tag.
        predicted = [triple[2] for triple in tree2conlltags(chunker_fn.parse(tagged_tokens))]
        y_true.extend(gold_tags)
        y_pred.extend(predicted)
        if print_output:
            for tagged, gold, guess in zip(tagged_tokens, gold_tags, predicted):
                print('{:15} {:>10}  {:>10}'.format(tagged[0], gold, guess))
    return y_true, y_pred

In [15]:
# Print token / true tag / predicted tag for test sample 119 to eyeball one
# prediction; the returned (y_true, y_pred) pair is kept in OO.
OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True)

Herbs                    O    B-DFNDUM
came                     O    I-DFNDUM
to                       O           O
be                       O           O
considered               O           O
in                       O           O
3                        O           O
groups                   O           O
,                        O           O
namely                   O           O
pot                      O           O
herbs                    O           O
(                        O           O
e.g                      O           O
.                        O           O
onions                   O           O
)                        O           O
,                        O           O
sweet                    O           O
herbs                    O           O
(                        O           O
e.g                      O           O
.                        O           O
thyme                    O           O
)                        O           O
and                      

In [16]:
# Gather true and predicted tags for every test sample and print the
# per-tag precision / recall / f1 report.
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))

              precision    recall  f1-score   support

    B-DFNDUM       0.36      0.76      0.49      1336
    I-DFNDUM       0.33      0.80      0.47      1096
           O       0.99      0.92      0.95     43983

    accuracy                           0.91     46415
   macro avg       0.56      0.83      0.64     46415
weighted avg       0.96      0.91      0.93     46415

