Goal: 
    - Extract unique names from subject and title and search in wikipedia information about them
    
Next Steps:
    - Improve the search using year (extract year from records and match with wikipedia description date) for the items with more than one reference attached in wikipedia

In [1]:
import json
import re
import collections
import numpy as np
import multiprocessing

import wikipedia
import pandas as pd

wikipedia.set_lang('en')

from fdh_gallica import Search

N_PROC = 8



In [2]:
# search = Search(all_fields="atelier nadar", dc_type="image", dc_creator='atelier nadar')
# search.execute()
# search.total_records
# with open('raw_records.json', 'w') as fp:
#     json.dump(search.records, fp)
with open('raw_records.json', 'r') as fp:
    raw_records = json.load(fp)

In [3]:
##### Copied from Cleaning data notebook #####
def find_names(name):
    return re.findall('([\S]* [\S]*|[\S]*|[\S]*\, [\S]*[ \S*]+) \([\d]{2}', name)

def find_names_undated(name):
    # mo
    return re.findall('([\S]*|[\S]*\, [\S]*[ \S*]+) -- Portraits', name)

def find_names_title(name):
    return [x.strip() for x in re.findall('([\S\s ^(\:)]*) \: ', name) if len(x.strip()) > 0]

def extract_from_brackets(name):
    return re.findall('\[([\w \-\,\.]*)\]', name)

def extract_caricature(name):
    return name.split('Caricature de')[-1].split('pour')[0][:-2]

In [17]:
def get_names(data):
    
    # in subject
    subject_lists = [[r['dc:subject']] if isinstance(r['dc:subject'], str) 
                     else r['dc:subject'] 
                     for r in raw_records]
    
    names_subject_with_date = [find_names(item) 
                               for sub in subject_lists
                               for item in sub if find_names(item)]  
    
    flat_names_subject_with_date = [item 
                                    for sub in names_subject_with_date 
                                    for item in sub]
    
    names_subject_without_date = [find_names_undated(item) 
                                  for sub in subject_lists
                                  for item in sub if find_names_undated(item)]
    flat_names_subject_without_date = [item.split('(')[0][:-2]
                                            for sub in names_subject_without_date
                                            for item in sub]
    
    # in title
    title_lists = [[r['dc:title']] if isinstance(r['dc:title'], str) 
                     else r['dc:title'] 
                     for r in raw_records]
    
    names_title = [find_names_title(item) 
                   for sub in title_lists
                   for item in sub if find_names_title(item)] 
    
    flat_names_title = [item.replace('[', '').replace(']', '')
                        for sub in names_title
                        for item in sub]
    
    all_names = flat_names_subject_with_date + flat_names_subject_without_date + flat_names_title
    
    all_names = [extract_caricature(x) for x in all_names]
    
    unique_names = [x for x in list(set(all_names)) if x!='']
    
    return unique_names

In [22]:
all_names = get_names(raw_records)
print(len(all_names))
all_names[987:1000]

12644


['M. Fugère et Landr',
 'Savenay. Variétés. La rev',
 'Onésime Recl',
 'Couralet. Folies Bergère. "Les Poules de Grévi',
 'M. Selli',
 'Arnaud, Alb',
 'Mlle Guitty. Gaîté. Bicyclistes en voya',
 'Coche',
 'Tardieu, Ambroi',
 'Baugniet, Charl',
 'Me Carnot. Mère du président de la Républiq',
 'Gildès, Anth',
 'Chavannes, Pier']

In [23]:
wikipedia.search("Plantade, Charles-François (1787-1870)")

['Charles-François Plantade', '1787 in music']

In [47]:
def hacked_search(x):
    return wikipedia.search(x) # just pure functions can be pickled and distributed

pool = multiprocessing.Pool(processes=N_PROC)
wiki_searchs = dict(zip(all_names, pool.map(hacked_search, all_names)))
len(wiki_searchs)

12644



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [48]:
collections.Counter([len(x) for x in wiki_searchs.values()])

Counter({0: 7525,
         2: 359,
         1: 648,
         10: 3236,
         3: 225,
         6: 95,
         5: 145,
         7: 84,
         4: 182,
         8: 97,
         9: 48})

In [49]:
with_0_wiki = [k for k,v in wiki_searchs.items() if len(v)==0]
with_0_wiki[3:6]

['Saint-Georges, Alexandre-Pierre-Thomas-Amab',
 'Weill. Folies Dramatiques. La fauvet',
 'Sanlaville. Odéon. Le mariage de Figa']

In [None]:
def hacked_summary(x):
    try:
        s = wikipedia.summary(x, sentences=2)
        return s
    except:
        return ''

wiki_summaries = dict(zip(all_names, pool.map(hacked_summary, all_names)))
len(wiki_summaries)