In [1]:
from fdh_gallica import Periodical, Search, Document
from fdh_gallica.parallel_process import iiif_urls_for_documents

from tqdm.autonotebook import tqdm

import json
import pandas as pd
from pandas.io.json import json_normalize


from itertools import chain
from itertools import compress
import re




In [2]:
%matplotlib inline

get all images with relevant query

In [3]:
re_execute = False
if re_execute:
    obj = Search(all_fields='atelier Nadar', dc_type='image', dc_creator='Atelier Nadar')
    obj.execute()
    print(f'number of fetched documents: {len(obj.documents)}')
    print(f'total records {obj.total_records}')
    print('retry fetching ...')
    obj.retry()
    print(f'number of fetched documents: {len(obj.documents)}')

    df = json_normalize(json.loads(json.dumps(obj.records)))
    print('save documents to local')
    # list handling becomes a pain of we save as csv
    df.to_pickle('raw_df.pkl')

In [4]:
df = pd.read_pickle('raw_df.pkl')

parse identifier for easier access:

In [5]:
image_dataframe = pd.DataFrame(df['dc:identifier']\
                               .map(lambda x: x[0] if type(x) == list else x))\
                               .rename(columns={'dc:identifier':'identifier'})

df['id'] = image_dataframe

### useless dataframes

remove them for ease of use, next ones contain all the same content

### removing them

In [6]:
del df['dc:type']
del df['dc:language']
del df['dc:format']
del df['dc:identifier']
del df['dc:rights']
del df['dc:publisher']

# getting information about subjects

In [7]:
#some objects are stored as list, others arent, use if else to handle
df['subject_is_list'] = df['dc:subject'].map(lambda x: type(x) == list)

In [8]:
df['len_list'] = df[df.subject_is_list]['dc:subject'].map(len)

length is non uniform ... we will have to unnest

In [9]:
df.len_list.value_counts()

2.0    12834
3.0      748
4.0      239
5.0      233
6.0       10
7.0        9
Name: len_list, dtype: int64

In [10]:
df[df.subject_is_list]['dc:subject'].map(lambda x: x).value_counts().head()

[Bernhardt, Sarah (1844-1923) -- Portraits, Portraits du théâtre -- 19e siècle]          266
[Réjane (1856-1920) -- Portraits, Portraits du théâtre -- 19e siècle]                    118
[Théo, Louise (1854-1922) -- Portraits, Portraits du théâtre -- 19e siècle]              109
[Simon-Girard, Juliette (1859-1959) -- Portraits, Portraits du théâtre -- 19e siècle]     84
[Lantelme, Marie -- Portraits, Portraits du théâtre -- 19e siècle]                        83
Name: dc:subject, dtype: int64

In [11]:
df2 = df[['id','dc:subject','dc:title','subject_is_list','len_list']]

In [12]:
subject1 = df2['dc:subject'][~df2.subject_is_list].unique().tolist()

In [13]:
subject1

['Portraits du théâtre -- 19e siècle',
 'Portraits de studio -- 19e siècle',
 'Portraits collectifs -- 19e siècle',
 'Scènes théâtrales -- 19e siècle',
 'Portraits -- 19e siècle',
 'Portraits équestres -- 19e siècle',
 'Expositions de photographie',
 'Tableaux vivants et mises en scène -- 19e siècle',
 'Portraits de plein air -- 19e siècle',
 'Portraits après décès -- 19e siècle',
 'Portraits de studio',
 "Vues d'intérieur -- 19e siècle",
 'Luco, François (18..-1882) -- Portraits',
 'Paysages -- 19e siècle',
 'Cham (1818-1879) -- Oeuvres -- Dessin',
 'Carvalho, Léon (1825-1897) -- Tombes',
 'Sand, George (1804-1876) -- Statues',
 'Caricatures et dessins humoristiques -- 19e siècle',
 'Figuet, Gabrielle (1862-1889) -- Portraits',
 'Delmas, Jean-François (1861-1933) -- Portraits']

In [14]:
subjects_to_exclude = list(set(subject1) - set(['Luco, François (18..-1882) -- Portraits',
                                                'Figuet, Gabrielle (1862-1889) -- Portraits',
                                                'Cham (1818-1879) -- Oeuvres -- Dessin',
                                                'Delmas, Jean-François (1861-1933) -- Portraits',
                                                'Carvalho, Léon (1825-1897) -- Tombes',
                                                'Sand, George (1804-1876) -- Statues']))

In [15]:
df2['len_list'][~df2.len_list.isna()].groupby(df2.len_list).size()

len_list
2.0    12834
3.0      748
4.0      239
5.0      233
6.0       10
7.0        9
Name: len_list, dtype: int64

In [271]:
#df2['dc:subject'][df2.len_list == 7].tolist()

In [17]:
def subject_filter(x):
    if type(x) == str:
        if x in subjects_to_exclude:
            result = ''
        else:
            result = [x]
    elif type(x) == list:
        temp = []
        for s in x:
            if s not in subjects_to_exclude:
                #x.remove(s)
                temp.append(s)
        result = temp
    else:
        result = type(x)
    return result

In [18]:
df2['subjects'] = df2['dc:subject'].apply(subject_filter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
#df2['subjects'][df2.len_list== 7].tolist()

In [20]:
df2['number_of_subjects'] = df2['subjects'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
df2.groupby(df2.number_of_subjects).size()

number_of_subjects
0     9599
1    12881
2      853
3      189
4        9
5        3
6        1
dtype: int64

In [22]:
df2 = df2.drop(['dc:subject', 'subject_is_list','len_list'],axis=1)

In [23]:
df2 = df2[df2.number_of_subjects != 0]

In [24]:
df2

Unnamed: 0,id,dc:title,subjects,number_of_subjects
7,https://gallica.bnf.fr/ark:/12148/btv1b53168872x,"[Me Auguez] : [photographie, tirage de démonst...","[Auguez, Mathilde (1868-1955) -- Portraits]",1
8,https://gallica.bnf.fr/ark:/12148/btv1b531688701,"[Me Auguez] : [photographie, tirage de démonst...","[Auguez, Mathilde (1868-1955) -- Portraits]",1
9,https://gallica.bnf.fr/ark:/12148/btv1b53171770j,"M. Bruant : [photographie, tirage de démonstra...","[Bruant, Aristide (1851-1925) -- Portraits]",1
10,https://gallica.bnf.fr/ark:/12148/btv1b53168873c,"Me Auguez : [photographie, tirage de démonstra...","[Auguez, Mathilde (1868-1955) -- Portraits]",1
13,https://gallica.bnf.fr/ark:/12148/btv1b531651494,"M. A. Brasseur : [photographie, tirage de démo...","[Brasseur, Albert (1862-1932) -- Portraits]",1
...,...,...,...,...
23528,https://gallica.bnf.fr/ark:/12148/btv1b531203810,"M. Hignette : [photographie, tirage de démonst...",[Ombre portée],1
23530,https://gallica.bnf.fr/ark:/12148/btv1b53123957r,"[Brémont] : [photographie, tirage de démonstra...","[Brémont, Léon (1852-1939) -- Portraits]",1
23531,https://gallica.bnf.fr/ark:/12148/btv1b53124037v,"[Caron] : [photographie, tirage de démonstrati...","[Caron, Cécile (1852-1923) -- Portraits]",1
23533,https://gallica.bnf.fr/ark:/12148/btv1b53124131j,"Croizette : [photographie, tirage de démonstra...","[Croizette, Sophie (1847-1901) -- Portraits]",1


In [25]:
def title_filter(x):
    if type(x) == list:
        x = ''.join(x)
    
    if type(x) == str:
        try:
            result = x.split(':')[0].strip().strip('[]')
        except:
            result = x
    else:
        result = x
        
    return result
        

In [26]:
df2['title'] = df2['dc:title'].apply(title_filter)
#df2 = df2[df2.number_of_subjects!=0]

In [27]:
df2

Unnamed: 0,id,dc:title,subjects,number_of_subjects,title
7,https://gallica.bnf.fr/ark:/12148/btv1b53168872x,"[Me Auguez] : [photographie, tirage de démonst...","[Auguez, Mathilde (1868-1955) -- Portraits]",1,Me Auguez
8,https://gallica.bnf.fr/ark:/12148/btv1b531688701,"[Me Auguez] : [photographie, tirage de démonst...","[Auguez, Mathilde (1868-1955) -- Portraits]",1,Me Auguez
9,https://gallica.bnf.fr/ark:/12148/btv1b53171770j,"M. Bruant : [photographie, tirage de démonstra...","[Bruant, Aristide (1851-1925) -- Portraits]",1,M. Bruant
10,https://gallica.bnf.fr/ark:/12148/btv1b53168873c,"Me Auguez : [photographie, tirage de démonstra...","[Auguez, Mathilde (1868-1955) -- Portraits]",1,Me Auguez
13,https://gallica.bnf.fr/ark:/12148/btv1b531651494,"M. A. Brasseur : [photographie, tirage de démo...","[Brasseur, Albert (1862-1932) -- Portraits]",1,M. A. Brasseur
...,...,...,...,...,...
23528,https://gallica.bnf.fr/ark:/12148/btv1b531203810,"M. Hignette : [photographie, tirage de démonst...",[Ombre portée],1,M. Hignette
23530,https://gallica.bnf.fr/ark:/12148/btv1b53123957r,"[Brémont] : [photographie, tirage de démonstra...","[Brémont, Léon (1852-1939) -- Portraits]",1,Brémont
23531,https://gallica.bnf.fr/ark:/12148/btv1b53124037v,"[Caron] : [photographie, tirage de démonstrati...","[Caron, Cécile (1852-1923) -- Portraits]",1,Caron
23533,https://gallica.bnf.fr/ark:/12148/btv1b53124131j,"Croizette : [photographie, tirage de démonstra...","[Croizette, Sophie (1847-1901) -- Portraits]",1,Croizette


In [28]:
def name_filter(x):
    return [s.split('--')[0].strip() for s in x]

In [29]:
_prenames = df2.subjects.apply(name_filter)    

In [30]:
_prenames

7         [Auguez, Mathilde (1868-1955)]
8         [Auguez, Mathilde (1868-1955)]
9         [Bruant, Aristide (1851-1925)]
10        [Auguez, Mathilde (1868-1955)]
13        [Brasseur, Albert (1862-1932)]
                      ...               
23528                     [Ombre portée]
23530        [Brémont, Léon (1852-1939)]
23531        [Caron, Cécile (1852-1923)]
23533    [Croizette, Sophie (1847-1901)]
23534       [Capoul, Victor (1839-1924)]
Name: subjects, Length: 13936, dtype: object

In [31]:
def is_name(x):
    names = []
    for s in x:
        if '(' not in s and ',' not in s:
            pass
        else:
            names.append(s.split('--')[0].strip())
    return names

def not_name(x):
    words = []
    for s in x:
        if '(' not in s and ',' not in s:
            words.append(s)
        else:
            pass
    return words

In [32]:
not_names = _prenames.apply(not_name).tolist()
is_names = _prenames.apply(is_name).tolist()

In [33]:
is_names = list(set(chain.from_iterable(is_names)))
not_names = list(set(chain.from_iterable(not_names)))

In [269]:
#not_names[0:30]

In [270]:
#is_names[0:30]

In [36]:
names = []
for n in is_names:
    name_ = n.split('(')[0].strip().lower().split(',')
    names.append(name_)
        
names = list(set(chain.from_iterable(names)))
names = list(set(list(map(lambda x:x.strip(),names))))

In [37]:
mask_tags = list(map((lambda x: ',' not in x), not_names))
subject_tags = list(compress(not_names, mask_tags))

In [268]:
#subject_tags

In [39]:
mask_names = list(map((lambda x: ',' in x), subject_tags))

subject_names = list(compress(subject_tags, mask_names))
#subject_names

In [40]:
def exclude_title_name(x):
    temp = []
    for t in x:
        t = re.sub('[\[\] (".,")]', '', t)

        if t.lower() in names:
            pass
        else:
            temp.append(t.lower())
    return temp

In [41]:
df2['title_broken'] = df2.title.apply(lambda x: re.findall('[\S]+',x))

In [42]:
df2.title_broken

7              [Me, Auguez]
8              [Me, Auguez]
9              [M., Bruant]
10             [Me, Auguez]
13       [M., A., Brasseur]
                ...        
23528        [M., Hignette]
23530             [Brémont]
23531               [Caron]
23533           [Croizette]
23534              [Capoul]
Name: title_broken, Length: 13936, dtype: object

In [43]:
tag_df = pd.DataFrame(df2.id)
#tag_df['subjects'] = df2['subjects']
tag_df['title'] = df2['title_broken'].apply(exclude_title_name)
tag_df['subject'] = df2.subjects.apply(name_filter).apply(not_name)

In [44]:
tag_df

Unnamed: 0,id,title,subject
7,https://gallica.bnf.fr/ark:/12148/btv1b53168872x,[me],[]
8,https://gallica.bnf.fr/ark:/12148/btv1b531688701,[me],[]
9,https://gallica.bnf.fr/ark:/12148/btv1b53171770j,[m],[]
10,https://gallica.bnf.fr/ark:/12148/btv1b53168873c,[me],[]
13,https://gallica.bnf.fr/ark:/12148/btv1b531651494,[m],[]
...,...,...,...
23528,https://gallica.bnf.fr/ark:/12148/btv1b531203810,"[m, hignette]",[Ombre portée]
23530,https://gallica.bnf.fr/ark:/12148/btv1b53123957r,[],[]
23531,https://gallica.bnf.fr/ark:/12148/btv1b53124037v,[],[]
23533,https://gallica.bnf.fr/ark:/12148/btv1b53124131j,[],[]


In [45]:
def filter_tags(x):
    tag_4 = ['cure', 'chef', 'lord', 'abbé']
    
    if type(x) != str:
        return False
    
    if x.find(',') != -1:
        return False
    
    if len(x) < 3:
        return False
    
    elif len(x) == 3:
        if x == 'roi':
            return True
        else:
            return False
    elif len(x) == 4:
        if x in tag_4:
            return True
        else:
            return False        
    else:
        return True

In [46]:
tags_title_ = tag_df.title.apply(pd.Series).merge(tag_df, right_index=True, left_index=True) \
        .drop(['title','subject'],axis=1).melt(id_vars=['id'], value_name='tags').drop('variable', axis=1)

In [47]:
tags_title_ = tags_title_[tags_title_.tags.apply(filter_tags)]

In [48]:
tags_title_

Unnamed: 0,id,tags
10,https://gallica.bnf.fr/ark:/12148/btv1b53162160w,d'arc
19,https://gallica.bnf.fr/ark:/12148/btv1b53162161b,d'arc
26,https://gallica.bnf.fr/ark:/12148/btv1b530922119,schah
33,https://gallica.bnf.fr/ark:/12148/btv1b530653663,député
34,https://gallica.bnf.fr/ark:/12148/btv1b530923448,schah
...,...,...
206335,https://gallica.bnf.fr/ark:/12148/btv1b531595399,majesté
207456,https://gallica.bnf.fr/ark:/12148/btv1b530921667,forme
220271,https://gallica.bnf.fr/ark:/12148/btv1b531595399,suite
262079,https://gallica.bnf.fr/ark:/12148/btv1b531595399,naser


In [49]:
def get_taglist(df, n=10):
    
    return df[df['tags'].map(df['tags'].value_counts()) > n]

In [50]:
taglist_title = list(get_taglist(tags_title_).tags.unique())

In [51]:
#taglist_title

In [52]:
tags_subject_ = tag_df.subject.apply(pd.Series).merge(tag_df, right_index=True, left_index=True) \
        .drop(['title','subject'],axis=1).melt(id_vars=['id'], value_name='tags').drop('variable', axis=1)



In [53]:
tags_subject_ = tags_subject_[tags_subject_.tags.apply(filter_tags)]

In [54]:
taglist_subject = list(get_taglist(tags_subject_, n=1).tags.unique())
#taglist_subject

In [55]:
#some manual work

taglist_title_final = ['député', 'ambassade', 'ministre', 'bordas',
       'compositeur', 'général', 'chinois', 'journaliste',
       'opéra', 'sculpteur', 'vaudeviliste', 'peintre',
       'colonel', 'auteur', 'historien', 'comédien', 'amiral',
       'baretta', 'prince', 'chimiste', 'avocat',
       'piccolo', 'comédie', 'romancier', 'abbé', 
       'écrivain', 'capitaine', 'navigation', 'vaudeville', 
       'châtelet', 'explorateur', 'aéronaute',
       'critique', 'cantatrice', 'princesse', 'poète', 'violoniste',
       'docteur',  'journal', 'marquis',
       'dessinateur', 'musicien', 'chanteur',
       'publiciste', 'chef', 'couturier', 'lanthelme', 'famille',
       'opéra-comique', 'écuyère', 'actrice',
       'napoléon', 'directeur', 'ecrivain', 'suite', 'anglaise',
       'palais-royal', 'folies', 'cirque', 'ecuyère',
        'théâtre-français', 'gymnase', 'frère',
      'roi', 'folies-dramatiques',
       'mousquetaires', 'bouffes-parisiens', 'professeur', 'groupe',
       'littérateur', 'président', 'maison',
       
       'lyrique', 'cluny', 'reichemberg', 'folies-dramatiques', 'frères',
       
       'chevallier', 'danseuse',  
       'décorateur', 'ambassadeur', 
       'politique', 'saint', 'république',
        'droit',
        'marquise', 
       'sénateur',
       'artiste', 'cloches',
       'majesté', 'japonaise', 'dramatique',
       'comique', 'française', 'aérienne', 'conservatoire', 'bourgeois',
       'italien', 'royal', "l'institut", 
       "d'orchestre", 'comédie-française', 
       'revue', 'bergère', "d'hiver", 'potter', 'dramatiques', 
       'suédoise', "l'académie", 'opéra-comique',
       'orphée',
       
       'folies-dramatiques',
       'vaudevill', 'odéon', 'phèdre', 'assassin', 'décoré',
       'saint-martin', 'trouvère', 'vénus', "l'arlésienne",
       'assommoir', 
       'cantinière', "s'amuse", 'amour', 'opéra', 'serment', 'rouge',
       
       'mascotte', 'gymnase',
       'châtelet', 
       
       'africaine',
        'juanita', 'perse', 'musique',
       'nouveau', 'lettres', 'russe', 'breuil', 'lantelme', 'hanovre',
       'pyrénées', 'parisienne', 'ventre',

       'chevalier', 'séville', 'américaine',  "l'impératrice",
       'commune', 'cigale', 'saturnales', 'cendrillonnette', 'tzigane',
       
       'camarade', 'marchande', 'bicyclistes',
       'sans-gêne', 'cousin-cousine', "d'avignon", 'pilules', 'fétiche',
       'cliquette', 
       'patard', 'patart', 'joyeusetés', 'fantaisies-parisiennes',
       'dramatiques', 'carreau',  'suzette', 'enfers', 'école',
       
       
        'trèfle', 

        'tambour-major', 'couronne',
        'diable',
       'pyrennées', 'christ', 'hussard', 'galles', "l'année", 'chèvres',
       'seigneur', 'clairette']

In [56]:
tag_df['tags_title'] = tag_df.title.apply(lambda x: list(set(x).intersection(set(taglist_title_final)))) 
tag_df['tags_subject'] = tag_df.subject.apply(lambda x: list(set(x).intersection(set(taglist_subject)))) 

In [57]:
tag_df['tags'] = tag_df.tags_title + tag_df.tags_subject

In [58]:
tag_df['len'] = tag_df.tags.apply(lambda x: len(x))

In [59]:
tag_df[tag_df.len >0]

Unnamed: 0,id,title,subject,tags_title,tags_subject,tags,len
14,https://gallica.bnf.fr/ark:/12148/btv1b53168871g,"[me, opéra-comique]",[],[opéra-comique],[],[opéra-comique],1
38,https://gallica.bnf.fr/ark:/12148/btv1b53171027z,"[me, variétés, les, variétés, de, l'année]",[],[l'année],[],[l'année],1
39,https://gallica.bnf.fr/ark:/12148/btv1b53171028d,"[me, variétés, les, variétés, de, l'année]",[],[l'année],[],[l'année],1
45,https://gallica.bnf.fr/ark:/12148/btv1b53165012c,"[me, folies-dramatiques, juanita]",[],"[juanita, folies-dramatiques]",[],"[juanita, folies-dramatiques]",2
50,https://gallica.bnf.fr/ark:/12148/btv1b53165010g,"[m, folies-dramatiques, juanita]",[],"[juanita, folies-dramatiques]",[],"[juanita, folies-dramatiques]",2
...,...,...,...,...,...,...,...
23516,https://gallica.bnf.fr/ark:/12148/btv1b530506270,"[de, journaliste]",[],[journaliste],[],[journaliste],1
23517,https://gallica.bnf.fr/ark:/12148/btv1b53065782n,"[homme, de, lettres]",[],[lettres],[],[lettres],1
23519,https://gallica.bnf.fr/ark:/12148/btv1b53066278c,"[directeur, des, bouffes]",[],[directeur],[],[directeur],1
23523,https://gallica.bnf.fr/ark:/12148/btv1b53118792t,[docteur],[],[docteur],[],[docteur],1


In [60]:
tag_id_df = pd.DataFrame(tag_df[['id','tags']], columns=['id','tags'])

In [61]:
tag_id_df.to_pickle('tag-id.pkl')

In [64]:
wiki_df = pd.DataFrame(df2[['id','subjects']])

In [65]:
wiki_df['names'] = wiki_df.subjects.apply(is_name)

In [66]:
wiki_df['names'][wiki_df.names.apply(lambda x:len(x)>2)]

69       [Georges, Édouard (1829-1903), Armand, Victori...
119      [Chevreul, Eugène (1786-1889), Chevreul, Henri...
120      [Chevreul, Eugène (1786-1889), Chevreul, Henri...
121      [Chevreul, Eugène (1786-1889), Nadar (1820-191...
135      [Chevreul, Eugène (1786-1889), Chevreul, Henri...
                               ...                        
23392    [Hadamar, Zélie (1849-1902 ; actrice), Lambert...
23445    [Hamburger, Edouard (18..-1886), Baron, Vincen...
23450    [Lavallière, Ève (1866-1929), Dupuis, Joseph (...
23468    [Mary-Albert, Madame (1855-....), Duhamel, Bia...
23469    [Mary-Albert, Madame (1855-....), Duhamel, Bia...
Name: names, Length: 65, dtype: object

In [67]:
re.findall('[\S]+','Auguez, Mathilde (1868-1955)')

['Auguez,', 'Mathilde', '(1868-1955)']

In [68]:
wiki_explode = wiki_df.explode('names')

In [69]:
wiki_explode.groupby(wiki_explode.names).size()

names
Abbatucci, Séverin (1821-1888)               5
Abbott, Emma (1850-1891)                    50
Abbéma, Louise (1858-1927)                  12
Abney, William de Wiveleslie (1843-1920)     3
Abott, Bessie (1878-1919)                    1
                                            ..
Énault, Louis (1824-1900)                    1
Éon, Charles de Beaumont d' (1728-1810)      1
Étex, Antoine (1808-1888)                    2
Étiévant, Henri (1870-1953)                  1
Č̌ebyšev, Pafnutij Lʹvovič (1821-1894)       3
Length: 1963, dtype: int64

In [70]:
wiki_names = wiki_explode.names.unique()

In [71]:
import wikipedia



In [72]:
def hacked_summary(x):
    try:
        x = re.sub('[,(\-)]', ' ', x)
        keywords = re.findall('[\S]+', x.lower())

        s = wikipedia.summary(x, sentences=2)
        mask = list(map(lambda x: s.lower().find(x) != -1 ,keywords))
        se = list(compress(keywords, mask))
        
        #se = set(keywords).intersection(set(s.lower().split()))
        return (s, list(se))
    except:
        return ''

In [75]:
wiki_reset = False

if wiki_reset:
    wikipedia.set_lang('en')
    wiki = pd.DataFrame(wiki_names, columns=['name'])

    wiki['result'] = wiki.name.apply(hacked_summary)

    wiki[['summary','keyword']] = pd.DataFrame(wiki.result.values.tolist(), index= wiki.index)
    
    wikipedia.set_lang('fr')
    wiki['result_fr'] = wiki.name.apply(hacked_summary)
    wiki[['summary_fr','keyword_fr']] = pd.DataFrame(wiki.result_fr.values.tolist(), index= wiki.index)
    wiki.to_pickle('wiki_summaries.pkl')
    #   wiki[['name','summary','summary_fr']][(~wiki.summary.isnull() & wiki.keyword.apply(lambda x: x!=[]))| \
 #                              (~wiki.summary_fr.isnull() & wiki.keyword_fr.apply(lambda x:x!=[]))].to_pickle('wiki_summaries.pkl')

In [77]:
wiki = pd.read_pickle('wiki_summaries.pkl')


In [None]:
wiki_notnull = wiki[~wiki.summary.isnull()]
wiki_mismatch = wiki_notnull[['name','summary']][wiki_notnull.keyword.apply(lambda x: len(x)== 0)]
wiki_dict = wiki_mismatch.to_dict(orient='records')

In [81]:
wiki_explode.names.unique()

array(['Auguez, Mathilde (1868-1955)', 'Bruant, Aristide (1851-1925)',
       'Brasseur, Albert (1862-1932)', ...,
       'Hamburger, Edouard (18..-1886)', 'Lamy, François (18..-1903)',
       'Delessert, Eugène (1819-1877)'], dtype=object)

In [144]:
wiki_names = wiki_explode.names.dropna().apply(lambda x: x.split('(')[0].strip())

In [203]:
wiki_names[wiki_names.apply(lambda x: len(x.split(',')) == 3)].unique()

array(['Gélabert, Conchita, Marie', 'Potel, Pierre, Armand',
       'Salinas, Maria, Paola', 'Barbey, Isidore, Théodore',
       'Fourtier, Hyacinthe, Pierre', 'Mesmaecker, Pierre, Joseph de',
       'Thuillier-Leloir, Louise, Victoire', 'Debay, Venulie, Elise',
       "Allonville, Armand, vicomte d'", 'Moïna-Clément, Anne, Victorine'],
      dtype=object)

In [197]:
l = 'Isnard, Jean, Baptiste, Marie, Félix'.split(',')
m = [4,0,1,2,3]

In [200]:
' '.join([y.strip() for x,y in sorted(zip(m,l))])

'Jean Baptiste Marie Félix Isnard'

In [207]:
def reorder_names(x):
    names_order = {1:[0],2:[1,0], 3:[2,0,1], 4:[3,0,1,2], 5:[4,0,1,2,3]}
    s = x.split(',')
    m = names_order[len(s)]
    return ' '.join([y.strip() for x,y in sorted(zip(m,s))])
    

In [138]:
def convert_utf(x):
    s = str(x.encode())
    s = re.sub("'",'',s.split("b'")[-1])
    s = s.strip()
    s = s.replace('\\x','%')
    return s

In [151]:
from SPARQLWrapper import SPARQLWrapper, XML

In [172]:
def sparql_str() :
    pass

In [184]:
def sparql_uri(x):
    sparql_str.__doc__ = 'PREFIX foaf: <http://xmlns.com/foaf/0.1/>    SELECT ?pers     WHERE {    ?pers foaf:name "%s".    }    LIMIT 100' % (x)
    sparql = SPARQLWrapper("http://data.bnf.fr/sparql")
    sparql.setQuery(sparql_str.__doc__)
    sparql.setReturnFormat(XML)
    results = sparql.query().convert()
    xml_ = results.toxml()
    return re.findall("<uri>\S*</uri>", xml_)

In [260]:
wiki_namelist = wiki_names.apply(reorder_names).unique()
bnf_name_error = []
bnf_not_found = []
bnf_link = {}

In [251]:
bnf_name_error

[]

In [261]:
for i in tqdm(wiki_namelist):
    try:
        s = sparql_uri(i)
        if s == []:
            bnf_not_found.append(i)
        else:
            bnf_link[i] = s
    except:
        bnf_name_error.append(i)
# takes about 20 minutes

HBox(children=(IntProgress(value=0, max=1950), HTML(value='')))




In [262]:
bnf_name_error

['Herald de Pages']

In [264]:
bnf_not_found.__len__()

155

In [267]:
bnf_link.keys().__len__()

In [273]:
bnf_link.json('bnf_link.json')

AttributeError: 'dict' object has no attribute 'tjson'