In [1]:
import pandas as pd
import os

# Run from the parent directory so that the `utils` package imported below and
# the relative `data/...` paths used by later cells resolve.
# Guarded so the cell is idempotent: an unconditional os.chdir('..') climbs one
# more directory every time the cell is re-run, which breaks the imports and
# file paths on the second execution.
# NOTE(review): assumes the working directory the notebook starts in does not
# itself contain a `utils` directory - confirm against the repo layout.
if not os.path.isdir('utils'):
    os.chdir('..')
from utils.metrics import *
from utils.matching import *

In [2]:
# Read both inputs in one pass: the DNB table used as the match source,
# and the author table whose rows are to be matched against it.
df_source, df_target = (
    pd.read_csv(csv_path)
    for csv_path in ('data/source/dnb.csv', 'data/target/authors.csv')
)

In [3]:
# Preview the first five rows of the source table.
df_source.head(n=5)

Unnamed: 0,name,link,page_number,text,dates,birth_year,death_year,floruit_years,last,first,middle
0,"Abbadie, Jacques",https://en.wikisource.org/wiki/Dictionary_of_N...,1,"ABBADIE , JACQUES (or JAMES), D.D.(1654?–1727...",,,,,Abbadie,Jacques,
1,"Abbot, Charles (d.1817)",https://en.wikisource.org/wiki/Dictionary_of_N...,3,"ABBOT , CHARLES ( d. 1817) botanist, sometime...",d.1817,,1817.0,,Abbot,Charles,
2,"Abbot, Charles (1757-1829)",https://en.wikisource.org/wiki/Dictionary_of_N...,3,"ABBOT , CHARLES, first Baron Colchester (1757...",1757-1829,1757.0,1829.0,1757-1829,Abbot,Charles,
3,"Abbot, George (1562-1633)",https://en.wikisource.org/wiki/Dictionary_of_N...,5,"ABBOT , GEORGE (1562–1633), archbishop of Can...",1562-1633,1562.0,1633.0,1562-1633,Abbot,George,
4,"Abbot, George (1603-1648)",https://en.wikisource.org/wiki/Dictionary_of_N...,20,"ABBOT , GEORGE (1603–1648), religious writer,...",1603-1648,1603.0,1648.0,1603-1648,Abbot,George,


In [4]:
# Preview the first five rows of the target (authors) table.
df_target.head(n=5)

Unnamed: 0,author,title,year,full_name,corrected_name,corrected_name.1,titles,credentials,first,middle,last,formatted_name
0,"Francis Fox, M. Inst. C. E.","The boring of the Simplon Tunnel, and the dist...",1905,Francis Fox,"['Francis', 'Fox']",Francis Fox,,,Francis,,Fox,"Fox, Francis"
1,"C. V. Boys, F. R. S.","The boring of the Simplon Tunnel, and the dist...",1905,C. V. Boys,"['C.', 'V.', 'Boys']",C. V. Boys,,,C.,V.,Boys,"Boys, C. V."
2,"Professor C. Niven, F. R. S.",On a method of finding the conductivity for heat.,1905,Professor C. Niven,"['C.', 'Niven']",C. Niven,['professor'],,C.,,Niven,"Niven, C."
3,"Richard C. Maclaurin, M. A., LL. D.",Theory of the reflection of light near the pol...,1905,Richard C. Maclaurin,"['Richard', 'C.', 'Maclaurin']",Richard C. Maclaurin,,,Richard,C.,Maclaurin,"Maclaurin, Richard C."
4,"Professor J. Larmor, Sec. R. S.",Theory of the reflection of light near the pol...,1905,Professor J. Larmor,"['J.', 'Larmor']",J. Larmor,['professor'],,J.,,Larmor,"Larmor, J."


In [5]:
# Keep only target rows whose `year` is 1885 or earlier.
# Rationale (per the original note): the DNB has no biographies of people who
# died after 1885, so an author still publishing after that year cannot be in it.
# The converse does not hold - someone who published before 1885 may have died
# much later and be absent from the DNB - so downstream matching must allow for
# a "Could not be resolved" category.
published_by_1885 = df_target['year'].le(1885)
df_target = df_target.loc[published_by_1885].reset_index(drop=True)

In [6]:
# Display the filtered target frame (the index was reset after the year filter).
df_target

Unnamed: 0,author,title,year,full_name,corrected_name,corrected_name.1,titles,credentials,first,middle,last,formatted_name
0,Lieut. J. H. Hennessey.,XIV. On the atmospheric lines of the Solar spe...,1870,Lieut. J. H. Hennessey.,['J. H. Hennessey.'],J. H. Hennessey,['lieut'],,,,J. H. Hennessey,J. H. Hennessey
1,President.,XIV. On the atmospheric lines of the Solar spe...,1870,President.,[''],,['president'],,,,,
2,"Earl of Rosse, F. R. S.",XV. On the radiation of heat from the moon.\#x...,1870,Earl of Rosse,"['Earl', 'of', 'Rosse']",Earl of Rosse,,,,,Earl of Rosse,Earl of Rosse
3,"W. H. L. Russell, F. R. S.",XVI. On linear differential equations. \#x2014...,1870,W. H. L. Russell,"['W.', 'H.', 'L.', 'Russell']",W. H. L. Russell,,,W.,H. L.,Russell,"Russell, W. H. L."
4,A. Le Sueur.,XVII. Observations with the great melbourne te...,1870,A. Le Sueur.,"['A.', 'Le', 'Sueur.']",A. Le Sueur,,,A.,Le,Sueur,"Sueur, A. Le"
...,...,...,...,...,...,...,...,...,...,...,...,...
13644,W. E. Parry,Observations to determine the amount of Atmosp...,1826,W. E. Parry,"['W.', 'E.', 'Parry']",W. E. Parry,,,W.,E.,Parry,"Parry, W. E."
13645,Henry Foster,Observations to determine the amount of Atmosp...,1826,Henry Foster,"['Henry', 'Foster']",Henry Foster,,,Henry,,Foster,"Foster, Henry"
13646,J. C. Ross,Observations to determine the amount of Atmosp...,1826,J. C. Ross,"['J.', 'C.', 'Ross']",J. C. Ross,,,J.,C.,Ross,"Ross, J. C."
13647,William Thomson,The Bakerian Lecture.\#x2014;On the Electro-dy...,1856,William Thomson,"['William', 'Thomson']",William Thomson,,,William,,Thomson,"Thomson, William"


In [7]:
# Per-feature weights handed to `match`. Keys follow a <name part>_<metric>
# pattern plus a standalone "year" entry.
# NOTE(review): presumably lev = Levenshtein, jac = Jaccard, init = initials -
# confirm against utils.matching. The weights are not normalised (they sum to 1.9).
weights = {
    # first name
    'first_lev': 0.2,
    'first_jac': 0.2,
    'first_init': 0.2,
    # middle name
    'middle_lev': 0.1,
    'middle_jac': 0.1,
    'middle_init': 0.1,
    # last name
    'last_lev': 0.4,
    'last_jac': 0.2,
    'last_init': 0.1,
    # year
    'year': 0.3,
}

# Try the matcher on a single target row (positional row 10000) before
# running it over the whole frame.
matches = match(
    df_target.iloc[10000],
    df_source,
    weights=weights,
)

In [8]:
from pandarallel import pandarallel

# Start pandarallel with six worker processes and a progress bar; this is what
# makes `DataFrame.parallel_apply` available for the full matching run.
pandarallel.initialize(
    nb_workers=6,
    progress_bar=True,
)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
# Run `match` against the source table for every target row (in parallel) and
# store each row's result in a new 'prospective' column.
df_target['prospective'] = df_target.parallel_apply(
    lambda author_row: match(author_row, df_source, weights=weights),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2275), Label(value='0 / 2275'))), …

In [10]:
# Persist the target frame, now including the 'prospective' column, as a pickle
# in the current working directory.
# `pickle` is not referenced in this cell (to_pickle is a DataFrame method);
# the import is kept as in the original.
import pickle

df_target.to_pickle(path="authors_with_prospective_canonical_names.pkl")
