In [3]:
from fdh_gallica import Periodical, Search, Document
from fdh_gallica.parallel_process import iiif_urls_for_documents
from tqdm.autonotebook import tqdm
import json
import pandas as pd
from pandas.io.json import json_normalize
import itertools
from itertools import chain
from itertools import compress
import re
import wikipedia
from bs4 import BeautifulSoup
import requests
from SPARQLWrapper import SPARQLWrapper, XML



In [4]:
%matplotlib inline

get all images with relevant query

In [5]:
re_execute = False
if re_execute:
    obj = Search(all_fields='atelier Nadar', dc_type='image', dc_creator='Atelier Nadar')
    obj.execute()
    print(f'number of fetched documents: {len(obj.documents)}')
    print(f'total records {obj.total_records}')
    print('retry fetching ...')
    obj.retry()
    print(f'number of fetched documents: {len(obj.documents)}')

    df = json_normalize(json.loads(json.dumps(obj.records)))
    print('save documents to local')
    # list handling becomes a pain of we save as csv
    df.to_pickle('../data/raw_df.pkl')

In [6]:
df = pd.read_pickle('../data/raw_df.pkl')

parse identifier for easier access:

In [7]:
image_dataframe = pd.DataFrame(df['dc:identifier']\
                               .map(lambda x: x[0] if type(x) == list else x))\
                               .rename(columns={'dc:identifier':'identifier'})

df['id'] = image_dataframe

### useless dataframes

remove them for ease of use, next ones contain all the same content

### removing them

In [8]:
del df['dc:type']
del df['dc:language']
del df['dc:format']
del df['dc:identifier']
del df['dc:rights']
del df['dc:publisher']

# getting information about subjects -> use as tags

In [9]:
#some objects are stored as list, others arent, use if else to handle
df['subject_is_list'] = df['dc:subject'].map(lambda x: type(x) == list)

In [10]:
df['len_list'] = df[df.subject_is_list]['dc:subject'].map(len)

length is non uniform ... we will have to unnest

In [11]:
df.len_list.value_counts()

2.0    12834
3.0      748
4.0      239
5.0      233
6.0       10
7.0        9
Name: len_list, dtype: int64

In [12]:
df[df.subject_is_list]['dc:subject'].map(lambda x: x).value_counts().head()

[Bernhardt, Sarah (1844-1923) -- Portraits, Portraits du théâtre -- 19e siècle]          266
[Réjane (1856-1920) -- Portraits, Portraits du théâtre -- 19e siècle]                    118
[Théo, Louise (1854-1922) -- Portraits, Portraits du théâtre -- 19e siècle]              109
[Simon-Girard, Juliette (1859-1959) -- Portraits, Portraits du théâtre -- 19e siècle]     84
[Lantelme, Marie -- Portraits, Portraits du théâtre -- 19e siècle]                        83
Name: dc:subject, dtype: int64

In [13]:
df2 = df[['id','dc:subject','dc:title','subject_is_list','len_list']]

In [14]:
subject1 = df2['dc:subject'][~df2.subject_is_list].unique().tolist()

In [15]:
subject1

['Portraits du théâtre -- 19e siècle',
 'Portraits de studio -- 19e siècle',
 'Portraits collectifs -- 19e siècle',
 'Scènes théâtrales -- 19e siècle',
 'Portraits -- 19e siècle',
 'Portraits équestres -- 19e siècle',
 'Expositions de photographie',
 'Tableaux vivants et mises en scène -- 19e siècle',
 'Portraits de plein air -- 19e siècle',
 'Portraits après décès -- 19e siècle',
 'Portraits de studio',
 "Vues d'intérieur -- 19e siècle",
 'Luco, François (18..-1882) -- Portraits',
 'Paysages -- 19e siècle',
 'Cham (1818-1879) -- Oeuvres -- Dessin',
 'Carvalho, Léon (1825-1897) -- Tombes',
 'Sand, George (1804-1876) -- Statues',
 'Caricatures et dessins humoristiques -- 19e siècle',
 'Figuet, Gabrielle (1862-1889) -- Portraits',
 'Delmas, Jean-François (1861-1933) -- Portraits']

In [16]:
subjects_to_exclude = list(set(subject1) - set(['Luco, François (18..-1882) -- Portraits',
                                                'Figuet, Gabrielle (1862-1889) -- Portraits',
                                                'Cham (1818-1879) -- Oeuvres -- Dessin',
                                                'Delmas, Jean-François (1861-1933) -- Portraits',
                                                'Carvalho, Léon (1825-1897) -- Tombes',
                                                'Sand, George (1804-1876) -- Statues']))

In [17]:
df2['len_list'][~df2.len_list.isna()].groupby(df2.len_list).size()

len_list
2.0    12834
3.0      748
4.0      239
5.0      233
6.0       10
7.0        9
Name: len_list, dtype: int64

In [18]:
def subject_filter(x):
    if type(x) == str:
        if x in subjects_to_exclude:
            result = ''
        else:
            result = [x]
    elif type(x) == list:
        temp = []
        for s in x:
            if s not in subjects_to_exclude:
                #x.remove(s)
                temp.append(s)
        result = temp
    else:
        result = type(x)
    return result

In [19]:
df2['subjects'] = df2['dc:subject'].apply(subject_filter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
df2['number_of_subjects'] = df2['subjects'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
df2.groupby(df2.number_of_subjects).size()

number_of_subjects
0     9599
1    12881
2      853
3      189
4        9
5        3
6        1
dtype: int64

In [22]:
df2 = df2.drop(['dc:subject', 'subject_is_list','len_list'],axis=1)

In [23]:
df2 = df2[df2.number_of_subjects != 0]

In [24]:
#df2

In [25]:
def title_filter(x):
    if type(x) == list:
        x = ''.join(x)
    
    if type(x) == str:
        try:
            result = x.split(':')[0].strip().strip('[]')
        except:
            result = x
    else:
        result = x
        
    return result
        

In [26]:
df2['title'] = df2['dc:title'].apply(title_filter)

In [27]:
#df2

In [28]:
def name_filter(x):
    return [s.split('--')[0].strip() for s in x]

In [29]:
_prenames = df2.subjects.apply(name_filter)    

In [30]:
#_prenames

In [31]:
def is_name(x):
    names = []
    for s in x:
        if '(' not in s and ',' not in s:
            pass
        else:
            names.append(s.split('--')[0].strip())
    return names

def not_name(x):
    words = []
    for s in x:
        if '(' not in s and ',' not in s:
            words.append(s)
        else:
            pass
    return words

In [32]:
not_names = _prenames.apply(not_name).tolist()
is_names = _prenames.apply(is_name).tolist()

In [33]:
is_names = list(set(chain.from_iterable(is_names)))
not_names = list(set(chain.from_iterable(not_names)))

In [34]:
names = []
for n in is_names:
    name_ = n.split('(')[0].strip().lower().split(',')
    names.append(name_)
        
names = list(set(chain.from_iterable(names)))
names = list(set(list(map(lambda x:x.strip(),names))))

In [35]:
mask_tags = list(map((lambda x: ',' not in x), not_names))
subject_tags = list(compress(not_names, mask_tags))

In [36]:
#subject_tags

In [37]:
mask_names = list(map((lambda x: ',' in x), subject_tags))

subject_names = list(compress(subject_tags, mask_names))
#subject_names

In [38]:
def exclude_title_name(x):
    temp = []
    for t in x:
        t = re.sub('[\[\] (".,")]', '', t)

        if t.lower() in names:
            pass
        else:
            temp.append(t.lower())
    return temp

In [39]:
df2['title_broken'] = df2.title.apply(lambda x: re.findall('[\S]+',x))

In [40]:
df2.title_broken

7              [Me, Auguez]
8              [Me, Auguez]
9              [M., Bruant]
10             [Me, Auguez]
13       [M., A., Brasseur]
                ...        
23528        [M., Hignette]
23530             [Brémont]
23531               [Caron]
23533           [Croizette]
23534              [Capoul]
Name: title_broken, Length: 13936, dtype: object

In [41]:
tag_df = pd.DataFrame(df2.id)
tag_df['title'] = df2['title_broken'].apply(exclude_title_name)
tag_df['subject'] = df2.subjects.apply(name_filter).apply(not_name)

In [42]:
tag_df

Unnamed: 0,id,title,subject
7,https://gallica.bnf.fr/ark:/12148/btv1b53168872x,[me],[]
8,https://gallica.bnf.fr/ark:/12148/btv1b531688701,[me],[]
9,https://gallica.bnf.fr/ark:/12148/btv1b53171770j,[m],[]
10,https://gallica.bnf.fr/ark:/12148/btv1b53168873c,[me],[]
13,https://gallica.bnf.fr/ark:/12148/btv1b531651494,[m],[]
...,...,...,...
23528,https://gallica.bnf.fr/ark:/12148/btv1b531203810,"[m, hignette]",[Ombre portée]
23530,https://gallica.bnf.fr/ark:/12148/btv1b53123957r,[],[]
23531,https://gallica.bnf.fr/ark:/12148/btv1b53124037v,[],[]
23533,https://gallica.bnf.fr/ark:/12148/btv1b53124131j,[],[]


In [43]:
def filter_tags(x):
    tag_4 = ['cure', 'chef', 'lord', 'abbé']
    
    if type(x) != str:
        return False
    
    if x.find(',') != -1:
        return False
    
    if len(x) < 3:
        return False
    
    elif len(x) == 3:
        if x == 'roi':
            return True
        else:
            return False
    elif len(x) == 4:
        if x in tag_4:
            return True
        else:
            return False        
    else:
        return True

In [44]:
tags_title_ = tag_df.title.apply(pd.Series).merge(tag_df, right_index=True, left_index=True) \
        .drop(['title','subject'],axis=1).melt(id_vars=['id'], value_name='tags').drop('variable', axis=1)

In [45]:
tags_title_ = tags_title_[tags_title_.tags.apply(filter_tags)]

In [46]:
tags_title_

Unnamed: 0,id,tags
10,https://gallica.bnf.fr/ark:/12148/btv1b53162160w,d'arc
19,https://gallica.bnf.fr/ark:/12148/btv1b53162161b,d'arc
26,https://gallica.bnf.fr/ark:/12148/btv1b530922119,schah
33,https://gallica.bnf.fr/ark:/12148/btv1b530653663,député
34,https://gallica.bnf.fr/ark:/12148/btv1b530923448,schah
...,...,...
206335,https://gallica.bnf.fr/ark:/12148/btv1b531595399,majesté
207456,https://gallica.bnf.fr/ark:/12148/btv1b530921667,forme
220271,https://gallica.bnf.fr/ark:/12148/btv1b531595399,suite
262079,https://gallica.bnf.fr/ark:/12148/btv1b531595399,naser


In [47]:
def get_taglist(df, n=10):
    
    return df[df['tags'].map(df['tags'].value_counts()) > n]

In [48]:
taglist_title = list(get_taglist(tags_title_).tags.unique())

In [49]:
#taglist_title

In [50]:
tags_subject_ = tag_df.subject.apply(pd.Series).merge(tag_df, right_index=True, left_index=True) \
        .drop(['title','subject'],axis=1).melt(id_vars=['id'], value_name='tags').drop('variable', axis=1)



In [51]:
tags_subject_ = tags_subject_[tags_subject_.tags.apply(filter_tags)]

In [52]:
taglist_subject = list(get_taglist(tags_subject_, n=1).tags.unique())
#taglist_subject

In [53]:
#some manual work

taglist_title_final = ['député', 'ambassade', 'ministre', 'bordas',
       'compositeur', 'général', 'chinois', 'journaliste',
       'opéra', 'sculpteur', 'vaudeviliste', 'peintre',
       'colonel', 'auteur', 'historien', 'comédien', 'amiral',
       'baretta', 'prince', 'chimiste', 'avocat',
       'piccolo', 'comédie', 'romancier', 'abbé', 
       'écrivain', 'capitaine', 'navigation', 'vaudeville', 
       'châtelet', 'explorateur', 'aéronaute',
       'critique', 'cantatrice', 'princesse', 'poète', 'violoniste',
       'docteur',  'journal', 'marquis',
       'dessinateur', 'musicien', 'chanteur',
       'publiciste', 'chef', 'couturier', 'lanthelme', 'famille',
       'opéra-comique', 'écuyère', 'actrice',
       'napoléon', 'directeur', 'ecrivain', 'suite', 'anglaise',
       'palais-royal', 'folies', 'cirque', 'ecuyère',
        'théâtre-français', 'gymnase', 'frère',
      'roi', 'folies-dramatiques',
       'mousquetaires', 'bouffes-parisiens', 'professeur', 'groupe',
       'littérateur', 'président', 'maison',
       
       'lyrique', 'cluny', 'reichemberg', 'folies-dramatiques', 'frères',
       
       'chevallier', 'danseuse',  
       'décorateur', 'ambassadeur', 
       'politique', 'saint', 'république',
        'droit',
        'marquise', 
       'sénateur',
       'artiste', 'cloches',
       'majesté', 'japonaise', 'dramatique',
       'comique', 'française', 'aérienne', 'conservatoire', 'bourgeois',
       'italien', 'royal', "l'institut", 
       "d'orchestre", 'comédie-française', 
       'revue', 'bergère', "d'hiver", 'potter', 'dramatiques', 
       'suédoise', "l'académie", 'opéra-comique',
       'orphée',
       
       'folies-dramatiques',
       'vaudevill', 'odéon', 'phèdre', 'assassin', 'décoré',
       'saint-martin', 'trouvère', 'vénus', "l'arlésienne",
       'assommoir', 
       'cantinière', "s'amuse", 'amour', 'opéra', 'serment', 'rouge',
       
       'mascotte', 'gymnase',
       'châtelet', 
       
       'africaine',
        'juanita', 'perse', 'musique',
       'nouveau', 'lettres', 'russe', 'breuil', 'lantelme', 'hanovre',
       'pyrénées', 'parisienne', 'ventre',

       'chevalier', 'séville', 'américaine',  "l'impératrice",
       'commune', 'cigale', 'saturnales', 'cendrillonnette', 'tzigane',
       
       'camarade', 'marchande', 'bicyclistes',
       'sans-gêne', 'cousin-cousine', "d'avignon", 'pilules', 'fétiche',
       'cliquette', 
       'patard', 'patart', 'joyeusetés', 'fantaisies-parisiennes',
       'dramatiques', 'carreau',  'suzette', 'enfers', 'école',
       
       
        'trèfle', 

        'tambour-major', 'couronne',
        'diable',
       'pyrennées', 'christ', 'hussard', 'galles', "l'année", 'chèvres',
       'seigneur', 'clairette']

In [54]:
tag_df['tags_title'] = tag_df.title.apply(lambda x: list(set(x).intersection(set(taglist_title_final)))) 
tag_df['tags_subject'] = tag_df.subject.apply(lambda x: list(set(x).intersection(set(taglist_subject)))) 

In [55]:
tag_df['tags'] = tag_df.tags_title + tag_df.tags_subject

In [56]:
tag_df['len'] = tag_df.tags.apply(lambda x: len(x))

In [57]:
tag_df[tag_df.len >0]

Unnamed: 0,id,title,subject,tags_title,tags_subject,tags,len
14,https://gallica.bnf.fr/ark:/12148/btv1b53168871g,"[me, opéra-comique]",[],[opéra-comique],[],[opéra-comique],1
38,https://gallica.bnf.fr/ark:/12148/btv1b53171027z,"[me, variétés, les, variétés, de, l'année]",[],[l'année],[],[l'année],1
39,https://gallica.bnf.fr/ark:/12148/btv1b53171028d,"[me, variétés, les, variétés, de, l'année]",[],[l'année],[],[l'année],1
45,https://gallica.bnf.fr/ark:/12148/btv1b53165012c,"[me, folies-dramatiques, juanita]",[],"[juanita, folies-dramatiques]",[],"[juanita, folies-dramatiques]",2
50,https://gallica.bnf.fr/ark:/12148/btv1b53165010g,"[m, folies-dramatiques, juanita]",[],"[juanita, folies-dramatiques]",[],"[juanita, folies-dramatiques]",2
...,...,...,...,...,...,...,...
23516,https://gallica.bnf.fr/ark:/12148/btv1b530506270,"[de, journaliste]",[],[journaliste],[],[journaliste],1
23517,https://gallica.bnf.fr/ark:/12148/btv1b53065782n,"[homme, de, lettres]",[],[lettres],[],[lettres],1
23519,https://gallica.bnf.fr/ark:/12148/btv1b53066278c,"[directeur, des, bouffes]",[],[directeur],[],[directeur],1
23523,https://gallica.bnf.fr/ark:/12148/btv1b53118792t,[docteur],[],[docteur],[],[docteur],1


In [58]:
tag_id_df = pd.DataFrame(tag_df[['id','tags']], columns=['id','tags'])

In [59]:
tag_id_df.to_pickle('data/tag-id.pkl')

# Fetch wikipedia article

In [60]:
wiki_df = pd.DataFrame(df2[['id','subjects']])

In [61]:
wiki_df['names'] = wiki_df.subjects.apply(is_name)

In [62]:
wiki_df['names'][wiki_df.names.apply(lambda x:len(x)>2)]

69       [Georges, Édouard (1829-1903), Armand, Victori...
119      [Chevreul, Eugène (1786-1889), Chevreul, Henri...
120      [Chevreul, Eugène (1786-1889), Chevreul, Henri...
121      [Chevreul, Eugène (1786-1889), Nadar (1820-191...
135      [Chevreul, Eugène (1786-1889), Chevreul, Henri...
                               ...                        
23392    [Hadamar, Zélie (1849-1902 ; actrice), Lambert...
23445    [Hamburger, Edouard (18..-1886), Baron, Vincen...
23450    [Lavallière, Ève (1866-1929), Dupuis, Joseph (...
23468    [Mary-Albert, Madame (1855-....), Duhamel, Bia...
23469    [Mary-Albert, Madame (1855-....), Duhamel, Bia...
Name: names, Length: 65, dtype: object

In [63]:
wiki_explode = wiki_df.explode('names')
wiki_names = wiki_explode.names.unique()
wiki_names = wiki_df.explode('names').unique()

In [64]:
def hacked_summary(x):
        
    try:
        x = re.sub('[,(;\-)]', ' ', x)
        keywords = re.findall('[\S]+', x.lower())
        keywords = list(compress(keywords, list(map(lambda x: re.findall('[\d]+[\.\?]+', x.lower()) == [], keywords))))


        p = wikipedia.page(x)

        url = p.url
        s = p.summary
        weight = len(p.images)

        mask = list(map(lambda x: s.lower().find(x) != -1 ,keywords))
        se = list(compress(keywords, mask))

        if len(se) == 0:
            s = ''
            weight = -1
            url = -1
                    
        return [s, list(keywords), list(se), weight, url]
        
    except:
        return ['', [], [],-1,'']

In [67]:
wiki_names = wiki_df.explode('names').unique()

wiki_reset = False

if wiki_reset:
    wikipedia.set_lang('en')
    wiki_en = pd.DataFrame(wiki_names, columns=['name'])
    wiki_en['result'] = wiki_en.name.apply(hacked_summary)
    wiki_en[['summary','keyword', 'keyword_found','weight','url']] = pd.DataFrame(wiki_en.result.values.tolist(), index= wiki_en.index)
    
    wikipedia.set_lang('fr')
    wiki_fr = pd.DataFrame(wiki_names, columns=['name'])
    wiki_fr['result_fr'] = wiki_fr.name.apply(hacked_summary)
    wiki_fr[['summary_fr','keyword_fr', 'keyword_found_fr','weight_fr', 'url_fr']] = pd.DataFrame(wiki_fr.result_fr.values.tolist(), index= wiki_fr.index)

# takes 2 hours



  lis = BeautifulSoup(html).find_all('li')


In [89]:
#wiki.to_pickle('data/wiki_raw.pkl')


if wiki_reset = False:

    wiki = pd.read_pickle('data/wiki_raw.pkl')
    wiki = wiki[(wiki.weight != -1) | (wiki.weight_fr != -1)]

    wiki_en = wiki[['name','keyword_found','weight','url']]
    wiki_fr = wiki[['name','keyword_found_fr','weight_fr','url_fr']]

In [101]:
wiki_exclude_en = wiki_en['name'][wiki_en.keyword_found.apply(lambda x: len(x) == 1)].tolist()
wiki_exclude_en.remove('Beraud, Jeanne (18..-1...)')

wiki_en = wiki_en[wiki_en.weight != -1]
wiki_en = wiki_en[wiki_en.name.apply(lambda x: x not in wiki_exclude_en)]
wiki_en

Unnamed: 0,name,keyword_found,weight,url
0,"Auguez, Mathilde (1868-1955)","[auguez, mathilde, 1868, 1955]",6,https://en.wikipedia.org/wiki/Mathilde_Auguez
1,"Bruant, Aristide (1851-1925)","[bruant, aristide, 1851, 1925]",3,https://en.wikipedia.org/wiki/Aristide_Bruant
4,"Lender, Marcelle (1862-1926)","[lender, marcelle, 1862, 1926]",7,https://en.wikipedia.org/wiki/Marcelle_Lender
5,"Bernhardt, Sarah (1844-1923)","[bernhardt, sarah, 1844, 1923]",45,https://en.wikipedia.org/wiki/Sarah_Bernhardt
11,"Gille, Philippe (1831-1901)","[gille, philippe, 1831, 1901]",3,https://en.wikipedia.org/wiki/Philippe_Gille
...,...,...,...,...
1939,"Schrader, Franz (1844-1924)","[schrader, franz, 1844, 1924]",4,https://en.wikipedia.org/wiki/Franz_Schrader
1940,"Wolff, Pierre (1865-1944)","[wolff, pierre, 1865, 1944]",4,https://en.wikipedia.org/wiki/Pierre_Wolff
1943,"Kaemmerer, Frederik Hendrik (1839-1902)","[kaemmerer, frederik, hendrik, 1839, 1902]",9,https://en.wikipedia.org/wiki/Frederik_Hendrik...
1953,"Roujon, Henry (1853-1914)","[roujon, henry, 1853, 1914]",6,https://en.wikipedia.org/wiki/Henry_Roujon


In [None]:
wiki_exclude_fr = wiki_fr['name'][wiki_fr.keyword_found_fr.apply(lambda x: len(x) == 1)].tolist()
wiki_exclude_fr.remove('Lockroy (1803-1891)')
#wiki_exclude_fr

In [None]:
wiki_fr = wiki_fr[wiki_fr.weight_fr != -1]
wiki_fr = wiki_fr[wiki_fr.name.apply(lambda x: x not in wiki_exclude_fr)]
#wiki_fr

In [115]:
wiki_fr

Unnamed: 0,name,keyword_found_fr,weight_fr,url_fr
0,"Auguez, Mathilde (1868-1955)","[auguez, mathilde, 1868, 1955]",7,https://fr.wikipedia.org/wiki/Mathilde_Auguez
1,"Bruant, Aristide (1851-1925)","[bruant, aristide, 1851, 1925]",16,https://fr.wikipedia.org/wiki/Aristide_Bruant
2,"Brasseur, Albert (1862-1932)","[brasseur, albert, 1862, 1932]",13,https://fr.wikipedia.org/wiki/Albert_Brasseur
5,"Bernhardt, Sarah (1844-1923)","[bernhardt, sarah, 1844, 1923]",43,https://fr.wikipedia.org/wiki/Sarah_Bernhardt
9,"Gobin, Charles Constant (1843-1907)","[gobin, charles, constant, 1843, 1907]",12,https://fr.wikipedia.org/wiki/Charles_Constant...
...,...,...,...,...
1943,"Kaemmerer, Frederik Hendrik (1839-1902)","[kaemmerer, frederik, hendrik, 1839, 1902]",5,https://fr.wikipedia.org/wiki/Frederik_Hendrik...
1953,"Roujon, Henry (1853-1914)","[roujon, henry, 1853, 1914]",8,https://fr.wikipedia.org/wiki/Henry_Roujon
1955,Maison Doucet (Paris),"[doucet, paris]",24,https://fr.wikipedia.org/wiki/Jacques_Doucet_(...
1959,"Jackson, Stonewall (1824-1863)","[jackson, stonewall, 1824, 1863]",13,https://fr.wikipedia.org/wiki/Thomas_Jonathan_...


In [120]:
lockroy = wikipedia.page('Lockroy', auto_suggest=False)
wiki_fr.loc[wiki_fr['name'] == 'Lockroy (1803-1891)', ['weight_fr', 'url_fr']] = [len(lockroy.images), lockroy.url]
wiki_fr[wiki_fr.name == 'Lockroy (1803-1891)']

Unnamed: 0,name,keyword_found_fr,weight_fr,url_fr
365,Lockroy (1803-1891),[lockroy],6,https://fr.wikipedia.org/wiki/Lockroy


In [None]:
wiki_en.to_pickle('data/wiki_en_summaries.pkl')
wiki_fr.to_pickle('data/wiki_fr_summaries.pkl')

# Fetch BnF description

In [95]:
def reorder_names(x):
    names_order = {1:[0],2:[1,0], 3:[2,0,1], 4:[3,0,1,2], 5:[4,0,1,2,3]}
    s = x.split(',')
    m = names_order[len(s)]
    return ' '.join([y.strip() for x,y in sorted(zip(m,s))])
    

In [678]:
names_df = wiki_df.explode('names')[['names']].dropna().drop_duplicates()

In [680]:
names_df = names_df.rename(columns={'names':'title'})

In [681]:
names_df['name'] = names_df.title.apply(lambda x: x.split('(')[0].strip())

In [682]:
names_df['ordered_name'] = names_df.name.apply(reorder_names)

In [683]:
names_df

Unnamed: 0,title,name,ordered_name
7,"Auguez, Mathilde (1868-1955)","Auguez, Mathilde",Mathilde Auguez
9,"Bruant, Aristide (1851-1925)","Bruant, Aristide",Aristide Bruant
13,"Brasseur, Albert (1862-1932)","Brasseur, Albert",Albert Brasseur
19,"Decroza, Francine (1868-19..)","Decroza, Francine",Francine Decroza
30,"Lender, Marcelle (1862-1926)","Lender, Marcelle",Marcelle Lender
...,...,...,...
23236,"Jackson, Stonewall (1824-1863)","Jackson, Stonewall",Stonewall Jackson
23241,"Delaunay, Louis Arsène (1826-1903)","Delaunay, Louis Arsène",Louis Arsène Delaunay
23445,"Hamburger, Edouard (18..-1886)","Hamburger, Edouard",Edouard Hamburger
23445,"Lamy, François (18..-1903)","Lamy, François",François Lamy


In [685]:
names_df['sup'] =  names_df.title.apply(lambda x: str(re.findall('\(\S+\)',x)).strip("['']"))

In [539]:
names_df[names_df.name == 'Nicolas II']
names_df
#names_df['year'] =  names_df.title.apply(lambda x: str(re.findall('\(1[789].+-1[89\.].+',x)).strip("['']"))

Unnamed: 0,title,name,ordered_name,year,sup
7,"Auguez, Mathilde (1868-1955)","Auguez, Mathilde",Mathilde Auguez,(1868-1955),(1868-1955)
9,"Bruant, Aristide (1851-1925)","Bruant, Aristide",Aristide Bruant,(1851-1925),(1851-1925)
13,"Brasseur, Albert (1862-1932)","Brasseur, Albert",Albert Brasseur,(1862-1932),(1862-1932)
19,"Decroza, Francine (1868-19..)","Decroza, Francine",Francine Decroza,(1868-19..),(1868-19..)
30,"Lender, Marcelle (1862-1926)","Lender, Marcelle",Marcelle Lender,(1862-1926),(1862-1926)
...,...,...,...,...,...
23236,"Jackson, Stonewall (1824-1863)","Jackson, Stonewall",Stonewall Jackson,(1824-1863),(1824-1863)
23241,"Delaunay, Louis Arsène (1826-1903)","Delaunay, Louis Arsène",Louis Arsène Delaunay,(1826-1903),(1826-1903)
23445,"Hamburger, Edouard (18..-1886)","Hamburger, Edouard",Edouard Hamburger,(18..-1886),(18..-1886)
23445,"Lamy, François (18..-1903)","Lamy, François",François Lamy,(18..-1903),(18..-1903)


In [98]:
def sparql_str() :
    pass

In [366]:
def sparql_uri(x, option='name'):
    if option == 'name':
        sparql_str.__doc__ = 'PREFIX foaf: <http://xmlns.com/foaf/0.1/>    SELECT ?pers     WHERE {    ?pers foaf:name "%s".    }    LIMIT 200' % (x)
    elif option == 'familyName':        
        sparql_str.__doc__ = 'PREFIX foaf: <http://xmlns.com/foaf/0.1/>    SELECT ?pers     WHERE {    ?pers foaf:familyName "%s".    }    LIMIT 200' % (x)        
    elif option == 'givenName':
        pass
        
    sparql = SPARQLWrapper("http://data.bnf.fr/sparql")
    sparql.setQuery(sparql_str.__doc__)
    sparql.setReturnFormat(XML)
    results = sparql.query().convert()
    xml_ = results.toxml()
    return list(map(lambda x: x[5:-6], re.findall("<uri>\S*</uri>", xml_)))

In [297]:
names_df

Unnamed: 0,title,name,ordered_name,year
7,"Auguez, Mathilde (1868-1955)","Auguez, Mathilde",Mathilde Auguez,(1868-1955)
9,"Bruant, Aristide (1851-1925)","Bruant, Aristide",Aristide Bruant,(1851-1925)
13,"Brasseur, Albert (1862-1932)","Brasseur, Albert",Albert Brasseur,(1862-1932)
19,"Decroza, Francine (1868-19..)","Decroza, Francine",Francine Decroza,(1868-19..)
30,"Lender, Marcelle (1862-1926)","Lender, Marcelle",Marcelle Lender,(1862-1926)
...,...,...,...,...
23236,"Jackson, Stonewall (1824-1863)","Jackson, Stonewall",Stonewall Jackson,(1824-1863)
23241,"Delaunay, Louis Arsène (1826-1903)","Delaunay, Louis Arsène",Louis Arsène Delaunay,(1826-1903)
23445,"Hamburger, Edouard (18..-1886)","Hamburger, Edouard",Edouard Hamburger,(18..-1886)
23445,"Lamy, François (18..-1903)","Lamy, François",François Lamy,(18..-1903)


In [409]:
bnf_namelist = names_df.ordered_name#[0:60]
bnf_name_error = []
bnf_not_found = []
bnf_link = {}

In [410]:
bnf_reset = False

if bnf_reset:
    for i in tqdm(bnf_namelist):
        l = i.split()
        try:
            s = sparql_uri(i)
            
            if s == []:
                #try by familyname
                s = sparql_uri(l[0], option='familyName')
                
                if s == []:
                    s = sparql_uri(l[-1], option='familyName')
                    
                    if s == []:
                        s = sparql_uri(i, option='familyName')
                        
                        if s == []:
                            bnf_not_found3.append(i)
                        else:
                            pass
                    else:
                        pass
                else:
                    pass
            else:
                pass
                      
            bnf_link[i] = [s]
        except:
            bnf_name_error.append(i)
            
    with open('bnf_link.json', 'w') as fp:
        json.dump(bnf_link, fp)
# takes about 20 minutes

HBox(children=(IntProgress(value=0, max=1963), HTML(value='')))




In [738]:
with open('data/bnf_name_error.json', 'w') as fp:
    json.dump(bnf_name_error, fp)
with open('data/bnf_not_found.json', 'w') as fp:
    json.dump(bnf_not_found, fp)

In [107]:
with open('bnf_link.json','r') as fp:
    bnf_link =  json.load(fp)

In [413]:
bnf_df_ = pd.DataFrame.from_dict(bnf_link, orient='index', columns=['links'])

In [389]:
def get_bnf_description(links):
    result = {}

    for link in links:
        try:
            page = requests.get(link)
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                bnf_text = soup.find("div", {"itemprop": "description"}).get_text()
            except:
                bnf_text = ''
            
            bnf_title = soup.find("title").get_text()
            result[bnf_title] = bnf_text

        except:
            pass
            
        
    return result

In [414]:
bnf_df_['bnf'] = bnf_df_.links.apply(get_bnf_description)
bnf_df_ = bnf_df_.reset_index().rename(columns={'index':'ordered_name'})

In [736]:
bnf_df_.to_pickle('data/bnf_text_raw.pkl')

In [710]:
bnf_df_merge = pd.merge(bnf_df_, names_df[['ordered_name','sup']], on='ordered_name')

In [711]:
bnf_df_merge

Unnamed: 0,ordered_name,links,bnf,sup
0,Mathilde Auguez,[http://data.bnf.fr/ark:/12148/cb149853704#about],{'Mathilde Auguez (1868-1955)': 'Chanteuse lyr...,(1868-1955)
1,Aristide Bruant,[http://data.bnf.fr/ark:/12148/cb11894238r#about],{'Aristide Bruant (1851-1925)': 'Chansonnier e...,(1851-1925)
2,Albert Brasseur,[http://data.bnf.fr/ark:/12148/cb10728832j#about],{'Albert Brasseur (1862-1932)': 'Acteur. - Dir...,(1862-1932)
3,Francine Decroza,[http://data.bnf.fr/ark:/12148/cb169379381#about],{'Francine Decroza (1868-19..)': 'Artiste lyri...,(1868-19..)
4,Marcelle Lender,[http://data.bnf.fr/ark:/12148/cb14653634g#about],{'Marcelle Lender (1862-1926)': 'Actrice'},(1862-1926)
...,...,...,...,...
1983,Stonewall Jackson,[http://data.bnf.fr/ark:/12148/cb138955263#abo...,{'Stonewall Jackson': 'Vocaliste. - Guitariste...,(1824-1863)
1984,Louis Arsène Delaunay,[http://data.bnf.fr/ark:/12148/cb14654126q#about],{'Louis Arsène Delaunay (1826-1903)': 'Acteur ...,(1826-1903)
1985,Edouard Hamburger,[http://data.bnf.fr/ark:/12148/cb15094430q#about],{'Edouard Hamburger (18..-1886)': 'Acteur de t...,(18..-1886)
1986,François Lamy,[http://data.bnf.fr/ark:/12148/cb135452617#abo...,{'François Lamy': 'Attaché parlementaire. - Dé...,(18..-1903)


In [712]:
import numpy as np

def handle_year(years):
    try:
        byear = years[0]
        dyear = years[1]

        try:
            byear =  int(byear)
        except:
            byear =  1855

        try:
            dyear =  int(dyear)
        except:
            dyear =  1910
        return  min(1910, dyear) - max(1855, byear)

    except:
        return 0
    
        
        

def bnf_match(keys):
    activity = range(1855, 1910)
    
    
#    years = list(map(lambda x: re.findall('1[.89].+-1[.89].+',x),keys))
    years = list(map(lambda x: str(re.findall('1[89].+-1[89].+',x)).strip("['()']"), keys))

    years_list = list(map(lambda x: handle_year(x.split('-')),years))
    
    
    
#    keys_new = list(compress(keys, map(lambda x: x != '',years)))
    
        
    try :
        return keys[np.argmax(years_list)]
    except:
        return np.nan


In [713]:
bnf_df_merge['names'] = bnf_df_merge.bnf.apply(lambda x: list(x.keys()))

In [714]:
bnf_df_merge['title'] = bnf_df_merge.apply(lambda x:str(x.ordered_name + ' ' + x.sup).strip(), axis=1)

In [715]:
bnf_df_merge['key_title'] = bnf_df_merge.apply(lambda x: list(compress(x.names, list(map(lambda y: x.title in y, x.names)))), axis=1)
bnf_df_merge['key_name'] = bnf_df_merge.apply(lambda x: list(compress(x.names, list(map(lambda y: x.ordered_name in y, x.names)))), axis=1)
bnf_df_merge['key_sup'] = bnf_df_merge.apply(lambda x: list(compress(x.names, list(map(lambda y: x.sup in y, x.names)))), axis=1)

In [716]:
bnf_df_merge['key'] = bnf_df_merge.apply(lambda x: list(set.intersection(set(x.key_title), set(x.key_name), set(x.key_sup))), axis=1)

In [717]:
bnf_df_merge.key.apply(lambda x: len(x)).value_counts(ascending=False)

1      1867
0        73
2        11
3         5
5         4
175       3
131       2
6         2
7         2
9         2
10        2
12        1
8         1
11        1
187       1
176       1
16        1
23        1
30        1
42        1
67        1
76        1
88        1
90        1
95        1
13        1
Name: key, dtype: int64

In [718]:
bnf_df_merge['key_match'] = bnf_df_merge.key.apply(bnf_match)#.apply(lambda x: len(x)).value_counts(ascending=False)

In [719]:
bnf_df_merge['description'] = bnf_df_merge.dropna(axis=0).apply(lambda x: x.bnf[x.key_match],axis=1)

In [737]:
bnf_df_merge[['ordered_name','description']].to_pickle('data/bnf_description.pkl')#.description.dropna(axis=0)