In [1]:
import os
# Switch to the parent directory so the `src` package imported by the following cells
# can be found. The guard keeps the cell idempotent: a plain relative chdir would climb
# one more level every time the cell is re-run.
# NOTE(review): assumes the project root is the directory that contains `src/` - confirm.
if not os.path.isdir('src'):
    os.chdir('../')

In [2]:
from src.name_parser import NameParser

In [3]:
import re
import pandas as pd
import numpy as np

from src.utils.make_dataset import get_XMLtree, exctract_corpus, extract_dataset
import probablepeople as pp

In [4]:
# Location of the labelled test file.
# NOTE(review): absolute path - only resolves where the data is mounted under /workdir.
test_data_path = '/workdir/data/person_labeled_test.xml'
# Project helper; presumably parses the XML file into a tree object - confirm in src.utils.make_dataset.
tree = get_XMLtree(test_data_path)
# `dataset` and `target` are index-aligned: evaluate_parser joins each dataset[i] into a
# string and compares the predicted labels against target[i].
dataset, target = extract_dataset(tree)

In [5]:
def evaluate_parser(dataset, target, p):
    """Print every sequence the parser mislabels, followed by the overall accuracy.

    Each token sequence in ``dataset`` is joined with single spaces and passed
    to ``p.tag``. The predicted label sequence is compared as a whole with the
    matching entry of ``target``; any difference (including a difference in
    length) counts as one mistake for that sequence.

    Args:
        dataset: Sized sequence of token sequences (each an iterable of str).
        target: Sequence of label lists, index-aligned with ``dataset``.
        p: Object or module exposing ``tag(str)``. The result may be either an
            iterable of ``(label, text)`` pairs or a tuple whose first element
            is a ``label -> text`` mapping.

    Returns:
        None. All results are written to stdout. For an empty ``dataset`` the
        accuracy is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    total = len(dataset)
    errors = 0
    for i, dp in enumerate(dataset):
        string = ' '.join(dp)
        tagged = p.tag(string)
        if isinstance(tagged, tuple):
            # Mapping-style result: flatten it into the same (label, text) pair
            # list that the other kind of parser returns directly.
            tagged = list(tagged[0].items())
        predictions = [pair[0] for pair in tagged]
        if predictions != target[i]:
            errors += 1
            print('True: ')
            print(list(zip(target[i], dp)))
            print('Predicted: ')
            print(tagged)

    # Guard the division: an empty dataset has no defined accuracy.
    accuracy = (total - errors) / total if total else float('nan')
    print(f"ACCURACY: {accuracy}, {errors} mistakes out of {total}")

In [6]:
# Score the project's NameParser (imported from src.name_parser) on the labelled test set;
# mislabelled sequences and the final accuracy are printed.
evaluate_parser(dataset, target, NameParser())

True: 
[('Surname', 'HOOPER'), ('SuffixGenerational', 'SR,'), ('GivenName', 'BARRON')]
Predicted: 
[('GivenName', 'HOOPER'), ('SuffixGenerational', 'SR,'), ('GivenName', 'BARRON')]
True: 
[('Surname', 'RAMIREZ,'), ('GivenName', 'PATRICIA'), ('MiddleInitial', 'M')]
Predicted: 
[('Surname', 'RAMIREZ,'), ('GivenName', 'PATRICIA'), ('LastInitial', 'M')]
True: 
[('Surname', 'RODRIGUEZ'), ('SuffixGenerational', 'JR,'), ('GivenName', 'FRANCISCO')]
Predicted: 
[('GivenName', 'RODRIGUEZ'), ('SuffixGenerational', 'JR,'), ('GivenName', 'FRANCISCO')]
ACCURACY: 0.9347826086956522, 3 mistakes out of 46


In [7]:
# Same evaluation with probablepeople: the module itself is passed, since evaluate_parser
# only needs a `.tag` attribute.
# Its tag() result groups tokens under one label (e.g. 'MC NICHOLAS' as a single 'Surname'),
# so multi-token fields never equal the per-token target labels and are printed as mistakes.
evaluate_parser(dataset, target, pp)

True: 
[('GivenName', 'Shaquille'), ('Nickname', '"Big'), ('Nickname', 'Aristotle"'), ('Surname', "O'Neal")]
Predicted: 
[('GivenName', 'Shaquille'), ('Nickname', '"Big Aristotle"'), ('Surname', "O'Neal")]
True: 
[('Surname', 'MC'), ('Surname', 'NICHOLAS,'), ('GivenName', 'DANIEL'), ('MiddleInitial', 'D')]
Predicted: 
[('Surname', 'MC NICHOLAS'), ('GivenName', 'DANIEL'), ('MiddleInitial', 'D')]
True: 
[('Surname', 'O'), ('Surname', 'BOYLE,'), ('GivenName', 'ROBERT'), ('MiddleInitial', 'L')]
Predicted: 
[('Surname', 'O BOYLE'), ('GivenName', 'ROBERT'), ('MiddleInitial', 'L')]
True: 
[('GivenName', 'Ben'), ('Nickname', '"Big'), ('Nickname', 'Ben"'), ('Surname', 'Wallace')]
Predicted: 
[('GivenName', 'Ben'), ('Nickname', '"Big Ben"'), ('Surname', 'Wallace')]
True: 
[('Surname', 'MC'), ('Surname', 'GLYNN,'), ('GivenName', 'ANDREW'), ('MiddleInitial', 'J')]
Predicted: 
[('Surname', 'MC GLYNN'), ('GivenName', 'ANDREW'), ('MiddleInitial', 'J')]
True: 
[('GivenName', 'Allen'), ('Nickname', '"T

In [8]:
# Hand-picked name strings used for a qualitative look at parser output.
SOME_RESEARCH = [
    # one two-word name: both word orders, with and without a comma
    'Heorhii Vdovychenko',
    'Vdovychenko Heorhii',
    'Vdovychenko, Heorhii',
    'Heorhii, Vdovychenko',
    # three-part name, plain vs. with the middle part in double quotes
    'John Bonzo Bonham',
    'John "Bonzo" Bonham',
    # single-token name
    'Slash',
    # further two-word names
    'Cheng Long',
    'Li Lianjie',
    'Ma Yun',
    'Rajesh Khanna',
    'Khanna Rajesh',
]

In [9]:
# Qualitative comparison: tag every sample name with both parsers and print the
# two results one under the other.
# The NameParser is built once and reused for all names instead of being
# re-created on every iteration.
name_parser = NameParser()
for name in SOME_RESEARCH:
    print(f'Initial string: {name}')
    print(name_parser.tag(name))
    # pp.tag returns a tuple; element 0 is the label -> text mapping.
    print(pp.tag(name)[0])
    print('')

Initial string: Heorhii Vdovychenko
[('GivenName', 'Heorhii'), ('Surname', 'Vdovychenko')]
OrderedDict([('CorporationName', 'Heorhii Vdovychenko')])

Initial string: Vdovychenko Heorhii
[('Surname', 'Vdovychenko'), ('Surname', 'Heorhii')]
OrderedDict([('CorporationName', 'Vdovychenko Heorhii')])

Initial string: Vdovychenko, Heorhii
[('Surname', 'Vdovychenko,'), ('GivenName', 'Heorhii')]
OrderedDict([('CorporationName', 'Vdovychenko, Heorhii')])

Initial string: Heorhii, Vdovychenko
[('Surname', 'Heorhii,'), ('GivenName', 'Vdovychenko')]
OrderedDict([('CorporationName', 'Heorhii, Vdovychenko')])

Initial string: John Bonzo Bonham
[('GivenName', 'John'), ('MiddleName', 'Bonzo'), ('Surname', 'Bonham')]
OrderedDict([('GivenName', 'John'), ('MiddleName', 'Bonzo'), ('Surname', 'Bonham')])

Initial string: John "Bonzo" Bonham
[('GivenName', 'John'), ('Nickname', '"Bonzo"'), ('Surname', 'Bonham')]
OrderedDict([('GivenName', 'John'), ('Nickname', '"Bonzo"'), ('Surname', 'Bonham')])

Initial st