In [1]:
import json
import re
import collections
import numpy as np
import multiprocessing

import warnings
warnings.filterwarnings('ignore')

import wikipedia
import pandas as pd

wikipedia.set_lang('en')

from fdh_gallica import Search

N_PROC = 8

In [2]:
# Load the table that holds the `named_subject` column used throughout this notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
df = pd.read_pickle('named_subject.pkl')

In [3]:
df.named_subject.unique().shape

(1937,)

### Wikipedia Search

In [4]:
# Work on the first 500 distinct subject names only.
all_names = df.named_subject.unique()[:500].tolist()

In [5]:
#all_names

In [6]:
# Create a process pool with N_PROC workers.
# NOTE(review): in the part of the notebook visible here, the pool is referenced
# only from commented-out `pool.map` calls and is never closed or joined —
# confirm whether it is still needed.
pool = multiprocessing.Pool(processes=N_PROC)
# Disabled parallel search; `hacked_search` is not defined in the visible cells.
# wiki_searchs = dict(zip(all_names, pool.map(hacked_search, all_names)))
# with open('wiki_searchs.json', 'w') as fp:
#     json.dump(wiki_searchs, fp)
# print(len(wiki_searchs))

In [52]:
# NOTE(review): interactive help dump; its execution count (In [52]) is out of
# order relative to the surrounding cells.
help(wikipedia.wikipedia)

Help on module wikipedia.wikipedia in wikipedia:

NAME
    wikipedia.wikipedia

CLASSES
    builtins.object
        WikipediaPage
    
    class WikipediaPage(builtins.object)
     |  WikipediaPage(title=None, pageid=None, redirect=True, preload=False, original_title='')
     |  
     |  Contains data from a Wikipedia page.
     |  Uses property methods to filter data from the raw HTML.
     |  
     |  Methods defined here:
     |  
     |  __eq__(self, other)
     |      Return self==value.
     |  
     |  __init__(self, title=None, pageid=None, redirect=True, preload=False, original_title='')
     |      Initialize self.  See help(type(self)) for accurate signature.
     |  
     |  __repr__(self)
     |      Return repr(self).
     |  
     |  html(self)
     |      Get full page HTML.
     |      
     |  
     |  section(self, section_title)
     |      Get the plain text content of a section from `self.sections`.
     |      Returns None if `section_title` isn't found, otherwis

In [7]:
def hacked_page(x):
    """Best-effort lookup of the Wikipedia page titled ``x``.

    The lookup is done with ``auto_suggest=False``.

    Args:
        x: Page title to look up.

    Returns:
        Whatever ``wikipedia.page`` returns, or the empty string ``''`` when
        the lookup raises an ordinary exception.
    """
    try:
        return wikipedia.page(x, auto_suggest=False)
    except Exception:
        # Deliberately best-effort, but `Exception` (not a bare `except`) so
        # that KeyboardInterrupt / SystemExit still stop a long-running loop.
        return ''

In [8]:
def hacked_content(x):
    """Best-effort fetch of the ``content`` attribute of the page found for ``x``.

    Unlike ``hacked_page``, this calls ``wikipedia.page`` with its default
    arguments (``auto_suggest`` is not overridden here).

    Args:
        x: Page title or query passed to ``wikipedia.page``.

    Returns:
        The page's ``content``, or the empty string ``''`` when the lookup
        raises an ordinary exception.
    """
    try:
        return wikipedia.page(x).content
    except Exception:
        # Best-effort on purpose; `Exception` keeps Ctrl-C (KeyboardInterrupt)
        # and SystemExit working, which a bare `except` would swallow.
        return ''
    

#### First try searching page summaries

In [9]:
# copied from extract-wiki-info.ipynb

def hacked_summary(x):
    """Best-effort two-sentence Wikipedia summary for ``x``.

    Args:
        x: Title or query passed to ``wikipedia.summary``.

    Returns:
        The result of ``wikipedia.summary(x, sentences=2)``, or the empty
        string ``''`` when the call raises an ordinary exception.
    """
    try:
        return wikipedia.summary(x, sentences=2)
    except Exception:
        # Best-effort on purpose; `Exception` (instead of a bare `except`)
        # lets KeyboardInterrupt / SystemExit propagate.
        return ''

In [10]:
#wiki_summaries = dict(zip(all_names, pool.map(hacked_summary, all_names)))
# Sequential lookup: map every name in all_names to its (possibly empty) summary.
wiki_summaries = {subject: hacked_summary(subject) for subject in all_names}

Save the name → Wikipedia summary mapping as JSON.

In [11]:
# Persist the name -> summary mapping so the lookups need not be repeated.
with open('wiki_summaries.json', 'w') as summaries_file:
    summaries_file.write(json.dumps(wiki_summaries))

Build a DataFrame of the named subjects and their summaries.

In [12]:
wiki_frame = pd.DataFrame(list(wiki_summaries.items()), columns=['name','summary'])

In [13]:
wiki_frame[wiki_frame.summary==''].shape

(172, 2)

In [14]:
wiki_frame.shape

(500, 2)

### To avoid false positives, check only exact matches

In [15]:
df.named_subject.unique().shape

(1937,)

In [16]:
# Split every distinct subject string on commas. The number of resulting parts
# is used below to select single-part names (names1) and two-part names (names2).
names = df.named_subject.drop_duplicates().str.split(',')

In [17]:
def wiki_exact(name):
    """Fetch the Wikipedia summary for the exact title ``name``.

    The lookup is done with ``auto_suggest=False``.

    Args:
        name: Exact page title to look up.

    Returns:
        The summary text on success; the sentinel string ``'disambiguation'``
        when ``wikipedia.DisambiguationError`` is raised; the empty string
        ``''`` for any other ordinary exception.
    """
    try:
        return wikipedia.summary(name, auto_suggest=False)
    except wikipedia.DisambiguationError:
        return 'disambiguation'
    except Exception:
        # Best-effort fallback; `Exception` rather than a bare `except` so
        # that KeyboardInterrupt / SystemExit are not swallowed.
        return ''

In [18]:
tempname = 'Leopold II'

In [19]:
wiki_exact(tempname)

'disambiguation'

In [20]:
# first try with simple names

In [21]:
# Subjects whose comma-split has exactly one part, re-joined and trimmed.
names1 = names[names.map(len) == 1].map(lambda parts: ' '.join(parts).strip())

In [22]:
# Subjects whose comma-split has exactly two parts; the parts are swapped
# before joining, then the result is trimmed.
names2 = names[names.map(len) == 2].map(lambda parts: ' '.join(reversed(parts)).strip())

In [23]:
# Exact-title lookup for every single-part name.
wiki_summaries1 = {subject: wiki_exact(subject) for subject in names1}

In [24]:
wiki_1 = pd.DataFrame(list(wiki_summaries1.items()), columns=['name','summary'])

In [25]:
wiki_1['page_exist'] = wiki_1.summary != ''

In [26]:
wiki_1['disambiguation'] = wiki_1.summary == 'disambiguation'

In [27]:
wiki_1['is_given_name'] = wiki_1.summary.apply(lambda x: 'given name' in x)

In [28]:
wiki_1

Unnamed: 0,name,summary,page_exist,disambiguation,is_given_name
0,Nâser ed-Din,,False,False,False
1,Bellot,disambiguation,True,True,False
2,Lugné-Poë,,False,False,False
3,Joséphine,Josephine is a female given name. It is the En...,True,False,True
4,Cassive,,False,False,False
...,...,...,...,...,...
119,Crambade,,False,False,False
120,Herbert,disambiguation,True,True,False
121,Morlet,Morlet is a commune in the Saône-et-Loire depa...,True,False,False
122,Femmes,,False,False,False


In [29]:
# Keep single-part names that have a page, are not disambiguation pages and
# are not "given name" articles. The three conditions are combined into one
# mask: chaining `[...]` filters would apply masks built on the full frame to
# already-filtered frames, relying on pandas silently reindexing them.
wiki_one = wiki_1[wiki_1.page_exist & ~wiki_1.disambiguation & ~wiki_1.is_given_name]

In [30]:
wiki_one

Unnamed: 0,name,summary,page_exist,disambiguation,is_given_name
5,Nadar,Gaspard-Félix Tournachon (6 April 1820 – 20 Ma...,True,False,False
7,Champfleury,Jules François Felix Fleury-Husson (17 Septemb...,True,False,False
8,Commerson,Philibert Commerson (French: [filibɛʁ kɔmɛʁsɔ̃...,True,False,False
14,Daubray,"Michel René Thibaut, known by his stage-name D...",True,False,False
17,Réjane,Gabrielle Charlotte Reju (5 June 1856 – 14 Jun...,True,False,False
20,Montrouge,Montrouge (French pronunciation: ​[mɔ̃ʁuʒ]) is...,True,False,False
22,Mélesville,"Baron Anne-Honoré-Joseph Duveyrier, surnamed M...",True,False,False
26,Pitre-Chevalier,"Pierre-Michel-François Chevalier, known as Pit...",True,False,False
30,Lockroy,"Joseph-Philippe Simon, called Lockroy (Februar...",True,False,False
31,Séverine,"Caroline Rémy de Guebhard (April 27, 1855 – Ap...",True,False,False


In [31]:
# Exact-title lookup for every two-part name.
wiki_summaries2 = {subject: wiki_exact(subject) for subject in names2}

In [32]:
wiki_2 = pd.DataFrame(list(wiki_summaries2.items()), columns=['name','summary'])

In [33]:
wiki_2

Unnamed: 0,name,summary
0,Mathilde Auguez,Pauline Mathilde Lucie Auguez de Montalant (28...
1,Aristide Bruant,Aristide Bruant (French: [aʁistid bʁyɑ̃]; 6 Ma...
2,Albert Brasseur,
3,Francine Decroza,
4,Marcelle Lender,Marcelle Lender (1862 – 27 September 1926) was...
...,...,...
1792,Louis Arsène Delaunay,"Louis-Arsène Delaunay (1826–1903), French acto..."
1793,Edouard Hamburger,
1794,François Lamy,disambiguation
1795,Eugène Delessert,


In [34]:
wiki_2['page_exist'] = wiki_2.summary != ''
wiki_2['disambiguation'] = wiki_2.summary == 'disambiguation'

In [35]:
# Keep two-part names that have a page and are not disambiguation pages.
# A single combined mask avoids chained filtering with masks that no longer
# match the index of the intermediate frame.
wiki_two = wiki_2[wiki_2.page_exist & ~wiki_2.disambiguation]

In [36]:
wiki_two

In [37]:
# Stack the accepted single-part and two-part matches into one frame.
# NOTE(review): wiki_one and wiki_two keep the row labels of wiki_1 and wiki_2,
# so labels may repeat after this concat; wiki_two has no `is_given_name`
# column, so its rows are missing a value there.
wiki_simple = pd.concat([wiki_one, wiki_two])

In [38]:
wiki_simple.shape

(830, 5)

In [39]:
# Collect every run of four consecutive digits in each summary (candidate years).
# The pattern is a raw string: '\d' in a plain string literal is an invalid escape sequence.
wiki_simple['four_digit'] = wiki_simple.summary.apply(lambda text: re.findall(r'\d{4}', text))

In [40]:
def check_year(years, start=1800, end=1900):
    """Tell whether any of ``years`` falls strictly between ``start`` and ``end``.

    Args:
        years: Iterable of values convertible with ``int()`` (e.g. the
            four-digit strings produced by ``re.findall``).
        start: Exclusive lower bound (default 1800).
        end: Exclusive upper bound (default 1900).

    Returns:
        True if at least one value ``y`` satisfies ``start < int(y) < end``,
        otherwise False (including for an empty iterable).
    """
    # Both bounds are exclusive, matching the original comparison.
    return any(start < int(year) < end for year in years)

In [41]:
wiki_simple.four_digit.apply(check_year)

5        True
7        True
8       False
14       True
17       True
        ...  
1784    False
1787     True
1791     True
1792     True
1796     True
Name: four_digit, Length: 830, dtype: bool

In [42]:
wiki_simple[wiki_simple.four_digit.apply(check_year)]

Unnamed: 0,disambiguation,is_given_name,name,page_exist,summary,four_digit
5,False,False,Nadar,True,Gaspard-Félix Tournachon (6 April 1820 – 20 Ma...,"[1820, 1910, 1858]"
7,False,False,Champfleury,True,Jules François Felix Fleury-Husson (17 Septemb...,"[1821, 1889, 1843, 1848, 1850, 1851, 1856, 185..."
14,False,False,Daubray,True,"Michel René Thibaut, known by his stage-name D...","[1837, 1892]"
17,False,False,Réjane,True,Gabrielle Charlotte Reju (5 June 1856 – 14 Jun...,"[1856, 1920]"
22,False,False,Mélesville,True,"Baron Anne-Honoré-Joseph Duveyrier, surnamed M...","[1787, 1865]"
...,...,...,...,...,...,...
1781,False,,Jules-Théophile Boucher,True,Jules-Théophile Boucher (15 September 1847 – 2...,"[1847, 1924, 1866]"
1787,False,,Henry Roujon,True,"Henry Roujon (1 September 1853, Paris – 1 June...","[1853, 1914, 1894, 1903, 1911]"
1791,False,,Stonewall Jackson,True,"Thomas Jonathan ""Stonewall"" Jackson (January 2...","[1824, 1863, 1861, 1863, 1846, 1848, 1847, 185..."
1792,False,,Louis Arsène Delaunay,True,"Louis-Arsène Delaunay (1826–1903), French acto...","[1826, 1903, 1845, 1848, 1850, 1877, 1887, 1883]"


In [43]:
# Persist the exact-match results for the single-part names.
with open('wiki_1_match_summaries.json', 'w') as summaries_file:
    summaries_file.write(json.dumps(wiki_summaries1))

In [77]:
# handling disambiguation

In [None]:

def hack_disamb(disamb_names):
    """Print, for each ambiguous name, a year check for every disambiguation option.

    For every entry of ``disamb_names`` a separator line and the name are
    printed. If the exact-title lookup raises
    ``wikipedia.DisambiguationError``, each option listed on the error is
    looked up in turn and the result of ``check_year`` on the four-digit
    numbers found in its summary is printed. Options that are themselves
    ambiguous or have no page are skipped.

    Args:
        disamb_names: Iterable of titles to look up.

    Returns:
        None; results are only printed.
    """
    for tempname in disamb_names:
        print("========")
        print(tempname)
        try:
            # Only the DisambiguationError matters here; the summary itself is unused.
            wikipedia.summary(tempname, auto_suggest=False)
        except wikipedia.DisambiguationError as e:
            for option in e.options:
                try:
                    option_summary = wikipedia.summary(option)
                except (wikipedia.DisambiguationError, wikipedia.PageError):
                    # Skip options that are ambiguous again or do not resolve to a
                    # page, instead of aborting the whole loop.
                    continue
                print(check_year(re.findall(r'\d{4}', option_summary)))
                
                
# Two-part names whose exact lookup hit a disambiguation page, as a plain list.
wiki_disamb = wiki_2.loc[wiki_2.disambiguation]
disamb_names = wiki_disamb['name'].tolist()

# However, it seems better to handle the disambiguation cases manually: there are only 12 disambiguations among the two-word names.