### **Birot mapping**
*Paul Duchesne. 2020-08-17.*

Notebook to map the contributed data to the defined ontology.

In [1]:
import json
import numpy
import pandas
import pathlib
import rdflib

In [2]:
# build the namespace and an empty graph, load the spreadsheet, then fold both "author" columns into one.

birot = rdflib.Namespace('https://birot.github.io/')
graph = rdflib.Graph()

data_path = pathlib.Path.cwd().parents[0] / 'data'
data = pandas.read_excel(data_path / 'birot-data.xlsx')

# one "book" IRI per spreadsheet row, minted before any row is multiplied below.
data['Book'] = [birot[f'book/{row_number:07d}'] for row_number in range(len(data))]

# a cell holding several "/"-separated languages becomes one row per language.
data = data.assign(Language=data['Language'].str.split('/')).explode('Language')

# rows built from the "Author.1" column come first, followed by the rows built from "Author".
stacked_columns = ['Book', 'Title', 'Author', 'Editor', 'Language', 'Month/Year']
from_author_1 = data.drop(columns='Author').rename(columns={'Author.1': 'Author'})[stacked_columns]
from_author = data[stacked_columns]
data = pandas.concat([from_author_1, from_author])

print(len(data))
data.head()

2280


Unnamed: 0,Book,Title,Author,Editor,Language,Month/Year
0,https://birot.github.io/book/0000000,Besessen,,,German,August 1997-August 1998
1,https://birot.github.io/book/0000001,Kane & Abel,,,English,August 1997-August 1998
2,https://birot.github.io/book/0000002,Die Sonntagsfrau,Franco Lucentini,,German,August 1997-August 1998
3,https://birot.github.io/book/0000003,Paris - Ein Fest fürs Leben,,,German,August 1997-August 1998
4,https://birot.github.io/book/0000004,Kitchen,,,German,August 1997-August 1998


In [3]:
# mint "language" IRIs, write those entities to the graph and replace in the table.

# one row per distinct language label; the minted identifier is kept in an "IRI"
# column because it identifies a language, not a book.
language_data = data[['Language']].drop_duplicates()
language_data['IRI'] = [birot[f'language/{str(x).zfill(7)}'] for x in range(len(language_data))]

for x in language_data.to_dict('records'):
    graph.add((x['IRI'], rdflib.RDF.type, birot['ontology/Language']))
    graph.add((x['IRI'], rdflib.RDFS.label, rdflib.Literal(x['Language'])))

# plain label -> IRI lookup, used to swap each label in the main table for its IRI.
language_conversion = dict(zip(language_data['Language'], language_data['IRI']))
data = data.replace({'Language': language_conversion})

print(len(data))
data.head()

2280


Unnamed: 0,Book,Title,Author,Editor,Language,Month/Year
0,https://birot.github.io/book/0000000,Besessen,,,https://birot.github.io/language/0000000,August 1997-August 1998
1,https://birot.github.io/book/0000001,Kane & Abel,,,https://birot.github.io/language/0000001,August 1997-August 1998
2,https://birot.github.io/book/0000002,Die Sonntagsfrau,Franco Lucentini,,https://birot.github.io/language/0000000,August 1997-August 1998
3,https://birot.github.io/book/0000003,Paris - Ein Fest fürs Leben,,,https://birot.github.io/language/0000000,August 1997-August 1998
4,https://birot.github.io/book/0000004,Kitchen,,,https://birot.github.io/language/0000000,August 1997-August 1998


In [4]:
# same process for "author" and "editor" entities, both subclasses of "person".

def has_profession(row, prof):
    """Report whether the person in `row` occurs in the `prof` column of `data`.

    `row` is a record with a 'Person' entry; `prof` is a column name of the
    notebook-level `data` table ('Author' or 'Editor' below). The result is
    always a real bool, so the flag columns built from it are boolean instead
    of a True/None mix.
    """
    return bool(data[prof].isin([row['Person']]).any())

# every distinct, non-missing name found in either role column.
person_data = pandas.concat([
    data[['Author']].rename(columns={'Author': 'Person'}),
    data[['Editor']].rename(columns={'Editor': 'Person'})])
person_data = person_data.dropna(subset=['Person']).drop_duplicates()

# flag each person with the role(s) they hold; must run before `data` is rewritten
# below, while the role columns still contain names rather than IRIs.
for x in ['Author', 'Editor']:
    person_data[x] = person_data.apply(has_profession, prof=x, axis=1)
person_data['IRI'] = [birot[f'person/{str(x).zfill(7)}'] for x in range(len(person_data))]

for x in person_data.to_dict('records'):
    if x['Author']:
        graph.add((x['IRI'], rdflib.RDF.type, birot['ontology/Author']))
    if x['Editor']:
        graph.add((x['IRI'], rdflib.RDF.type, birot['ontology/Editor']))
    graph.add((x['IRI'], rdflib.RDFS.label, rdflib.Literal(x['Person'])))

# name -> IRI lookup, applied to both role columns of the main table.
person_data = person_data[['Person', 'IRI']].drop_duplicates()
person_conversion = dict(zip(person_data['Person'], person_data['IRI']))
data = data.replace({'Author': person_conversion})
data = data.replace({'Editor': person_conversion})

print(len(data))
data.head()

2280


Unnamed: 0,Book,Title,Author,Editor,Language,Month/Year
0,https://birot.github.io/book/0000000,Besessen,,,https://birot.github.io/language/0000000,August 1997-August 1998
1,https://birot.github.io/book/0000001,Kane & Abel,,,https://birot.github.io/language/0000001,August 1997-August 1998
2,https://birot.github.io/book/0000002,Die Sonntagsfrau,https://birot.github.io/person/0000000,,https://birot.github.io/language/0000000,August 1997-August 1998
3,https://birot.github.io/book/0000003,Paris - Ein Fest fürs Leben,,,https://birot.github.io/language/0000000,August 1997-August 1998
4,https://birot.github.io/book/0000004,Kitchen,,,https://birot.github.io/language/0000000,August 1997-August 1998


In [5]:
# IRIs for reading events, including (approximate) start and end dates.

# one event per distinct (book, period) pair.
event_data = data[['Book', 'Month/Year']].drop_duplicates()
event_data = event_data.assign(
    Event=[birot[f'event/{number:07d}'] for number in range(len(event_data))])

# the period text is split on "-": the first piece feeds the start date, the second the end date.
date_properties = [
    (0, birot['ontology/hasStartDate']),
    (1, birot['ontology/hasEndDate'])]

for record in event_data.to_dict('records'):
    event = record['Event']
    graph.add((event, rdflib.RDF.type, birot['ontology/Event']))
    period = str(record['Month/Year']).split('-')
    for position, date_property in date_properties:
        moment = pandas.to_datetime(period[position])
        graph.add((event, date_property, rdflib.Literal(moment.strftime('%Y-%m-%d'))))

# attach each row's event IRI, then retire the raw period column it was derived from.
data = pandas.merge(data, event_data[['Book', 'Event']], on='Book', how='left')
data = data.drop(columns='Month/Year')

print(len(data))
data.head()

2280


Unnamed: 0,Book,Title,Author,Editor,Language,Event
0,https://birot.github.io/book/0000000,Besessen,,,https://birot.github.io/language/0000000,https://birot.github.io/event/0000000
1,https://birot.github.io/book/0000001,Kane & Abel,,,https://birot.github.io/language/0000001,https://birot.github.io/event/0000001
2,https://birot.github.io/book/0000002,Die Sonntagsfrau,https://birot.github.io/person/0000000,,https://birot.github.io/language/0000000,https://birot.github.io/event/0000002
3,https://birot.github.io/book/0000003,Paris - Ein Fest fürs Leben,,,https://birot.github.io/language/0000000,https://birot.github.io/event/0000003
4,https://birot.github.io/book/0000004,Kitchen,,,https://birot.github.io/language/0000000,https://birot.github.io/event/0000004


In [6]:
# write "book" entities to graph, including incorporating external Wikidata identifiers.

# lookup keyed by the book IRI as a string; the handle gets its own name so it is
# not shadowed by the parsed content.
with open(data_path / 'wikidata-links.json', encoding='utf-8') as wikidata_file:
    wikidata_data = json.load(wikidata_file)

for x in data.to_dict('records'):
    graph.add((x['Book'], rdflib.RDF.type, birot['ontology/Book']))
    graph.add((x['Book'], rdflib.RDFS.label, rdflib.Literal(x['Title'])))
    graph.add((x['Book'], birot['ontology/hasLanguage'], x['Language']))
    graph.add((x['Event'], birot['ontology/hasBook'], x['Book']))
    # missing values are detected with pandas.notna rather than `is not numpy.nan`:
    # a NaN read back from the table is not guaranteed to be the numpy.nan object
    # itself, and an identity check would then pass it on as a triple subject.
    if pandas.notna(x['Author']):
        graph.add((x['Author'], birot['ontology/isAuthorOf'], x['Book']))
    if pandas.notna(x['Editor']):
        graph.add((x['Editor'], birot['ontology/isEditorOf'], x['Book']))
    book_key = str(x['Book'])
    if book_key in wikidata_data:
        qid = wikidata_data[book_key]
        graph.add((x['Book'], birot['ontology/hasWikidataId'], rdflib.Literal(qid)))

graph.serialize(destination=str(data_path / 'birot-data.ttl'), format="turtle")
print(len(graph), 'triples.')

10971 triples.
