### **Birot querying**
*Paul Duchesne. 2020-08-17.*

Notebook of graph queries.

In [1]:
import altair
import numpy
import pandas
import pathlib
import rdflib

In [2]:
# Locate the Turtle data file in the `data` directory next to the working directory's parent.
data = pathlib.Path.cwd().parents[0] / 'data' / 'birot-data.ttl'

# Parse inside a `with` block so the file handle is closed as soon as parsing is done
# (the handle was previously opened inline and never closed).
with open(str(data), 'r') as data_file:
    graph = rdflib.Graph().parse(file=data_file, format='ttl')

# Select every subject typed as both Editor and Author, together with its label.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    SELECT ?s ?label
    WHERE { 
        ?s a <https://birot.github.io/ontology/Editor>.
        ?s a <https://birot.github.io/ontology/Author>.
        ?s rdfs:label ?label
        }
    """

# Keep only the label of each result row, as a plain string.
results = [str(r['label']) for r in graph.query(q)]
print(len(results), results)

2 ['Klaus-Dieter Felsmann', 'Angela Hausner']


In [3]:
# Load only the data file (no ontology) into a fresh graph.
data = pathlib.Path.cwd().parents[0] / 'data' / 'birot-data.ttl'

# Parse inside a `with` block so the file handle is closed as soon as parsing is done
# (the handle was previously opened inline and never closed).
with open(str(data), 'r') as data_file:
    graph = rdflib.Graph().parse(file=data_file, format='ttl')

# Select subjects whose type is Person or reaches Person through rdfs:subClassOf,
# and which have both an isEditorOf and an isAuthorOf statement.
# NOTE(review): `rdf:` is not declared in the query text; the recorded run shows the
# query was accepted, presumably via the graph's default namespace bindings - confirm.
# NOTE(review): over the data file alone the recorded output is empty; presumably the
# rdfs:subClassOf statements live in the ontology file, which is not loaded here - confirm.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    SELECT DISTINCT ?s ?label
    WHERE { 
        ?s rdf:type/rdfs:subClassOf* <https://birot.github.io/ontology/Person>.
        ?s <https://birot.github.io/ontology/isEditorOf> ?a.
        ?s <https://birot.github.io/ontology/isAuthorOf> ?b.        
        ?s rdfs:label ?label
        }
    """

# Keep only the label of each result row, as a plain string.
results = [str(r['label']) for r in graph.query(q)]
print(len(results), results)

0 []


In [4]:
def create_graph():
    """Build a single graph from the ontology file followed by the data file.

    Both Turtle files are looked up relative to the parent of the current
    working directory (`ontology/birot-ontology.ttl` and `data/birot-data.ttl`).
    Each file is parsed into its own temporary graph whose triples are then
    added to the result.

    Returns:
        rdflib.Graph: a new graph containing the triples of both files.
    """
    base = pathlib.Path.cwd().parents[0]
    sources = [
        base / 'ontology' / 'birot-ontology.ttl',
        base / 'data' / 'birot-data.ttl',
    ]

    g = rdflib.Graph()
    for source in sources:
        # `with` closes each handle after parsing; they were previously left open.
        with open(str(source), 'r') as handle:
            g += rdflib.Graph().parse(file=handle, format='ttl')
    return g
      
# People (directly or via rdfs:subClassOf) that have both an isEditorOf and an
# isAuthorOf statement, with their labels.
# NOTE(review): `rdf:` is not declared in the query text; the recorded run shows the
# query was accepted, presumably via the graph's default namespace bindings - confirm.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    SELECT DISTINCT ?s ?label
    WHERE { 
        ?s rdf:type/rdfs:subClassOf* <https://birot.github.io/ontology/Person>.
        ?s <https://birot.github.io/ontology/isEditorOf> ?a.
        ?s <https://birot.github.io/ontology/isAuthorOf> ?b.        
        ?s rdfs:label ?label
        }
    """

# Query the combined ontology + data graph and keep each row's label as a string.
rows = create_graph().query(q)
results = [str(row['label']) for row in rows]
print(len(results), results)

2 ['Angela Hausner', 'Klaus-Dieter Felsmann']


In [5]:
# Books joined to the start date of their reading event and the label of their language.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    SELECT DISTINCT ?book ?date ?label 
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?date.        
        ?book <https://birot.github.io/ontology/hasLanguage> ?lang.
        ?lang rdfs:label ?label
        }
    """

# Name the three result columns and add a constant `Books` counter for the charts to sum.
dataframe = (
    pandas.DataFrame(create_graph().query(q))
    .set_axis(['Entity', 'Reading Date', 'Language'], axis=1)
    .assign(Books=1)
)

# Sizing shared by both displayed layers; each layer is given an empty title.
layout = dict(width=1000, height=300, title='')

# Line chart: summed book count per reading date, one line per language.
chart = (
    altair.Chart(dataframe, title='Books by language over time.')
    .mark_line()
    .encode(x='Reading Date:T', y='sum(Books)', color='Language')
)
display(altair.layer(chart).properties(**layout))

# Stacked bars per reading year, normalised so each bar shows language shares.
share_of_books = altair.Y('sum(Books)', stack="normalize", axis=altair.Axis(format='%'), title='Percentage')
chart = (
    altair.Chart(dataframe, title='Books by language over time (normalised).')
    .mark_bar()
    .encode(x='year(Reading Date):T', y=share_of_books, color='Language')
)
display(altair.layer(chart).properties(**layout))

In [6]:
# Books with a Wikidata id, their reading start date and, when the remote
# Wikidata endpoint has one, the value of wdt:P577 (bound to ?publication).
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?publication
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {?wiki_book wdt:P577 ?publication}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

# Name the three result columns.
dataframe = pandas.DataFrame(create_graph().query(q)).set_axis(
    ['Book', 'Reading Date', 'Publication Date'], axis=1
)

# Mean publication date per reading date, drawn as a line ...
line = (
    altair.Chart(dataframe, title='Books by publication date over time.')
    .mark_line()
    .encode(x='Reading Date:T', y='mean(Publication Date):T')
)
# ... over an error band of the publication dates.
band = (
    altair.Chart(dataframe)
    .mark_errorband()
    .encode(x='Reading Date:T', y=altair.Y('Publication Date:T'))
)
display(altair.layer(band + line).properties(width=1000, height=300, title=''))

In [7]:
# Books with a Wikidata id, their reading start date and, when the remote
# Wikidata endpoint has them, the English labels of the book's wdt:P136 values.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?genre_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
            ?wiki_book wdt:P136 ?genre. ?genre rdfs:label ?genre_label filter (lang(?genre_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

dataframe = pandas.DataFrame(create_graph().query(q))
dataframe.columns = ['Book', 'Reading Date', 'Genre']
dataframe['Books'] = 1

# Drop rows without a genre. The explicit `.copy()` makes the filtered frame
# independent, so the assignment below cannot trigger SettingWithCopyWarning.
dataframe = dataframe.loc[~dataframe.Genre.isin([numpy.nan, None])].copy()

# Fold every genre that occurs three times or fewer into a single 'Other' bucket,
# using one count per genre instead of re-scanning the frame for each genre.
genre_counts = dataframe['Genre'].value_counts()
rare_genres = genre_counts[genre_counts <= 3].index
dataframe.loc[dataframe['Genre'].isin(rare_genres), 'Genre'] = 'Other'

# Stacked bars per reading year, normalised so each bar shows genre shares.
chart = altair.Chart(dataframe, title='Books by genre over time (normalised).').mark_bar().encode(
    x='year(Reading Date):T', y=altair.Y('sum(Books)', stack="normalize", axis=altair.Axis(format='%'), title='Percentage'), color='Genre')
display(altair.layer(chart).properties(width=1000, height=300, title=''))

In [8]:
# Books with a Wikidata id, their reading start date and, when the remote
# Wikidata endpoint has one, the value of wdt:P1104 (bound to ?pages).
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?pages
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {?wiki_book wdt:P1104 ?pages}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

# Name the three result columns and cast the page counts to float so they can be summed.
dataframe = (
    pandas.DataFrame(create_graph().query(q))
    .set_axis(['Book', 'Reading Date', 'Pages'], axis=1)
    .astype({'Pages': 'float'})
)

# Bars of total pages per reading year.
chart = (
    altair.Chart(dataframe, title='Pages over time.')
    .mark_bar()
    .encode(x='year(Reading Date):T', y='sum(Pages)')
)
display(altair.layer(chart).properties(width=1000, height=300, title=''))

In [9]:
# Books with a Wikidata id, their reading start date and, when the remote Wikidata
# endpoint has them, the English label of wdt:P21 for each wdt:P50 value of the book.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?gender_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
            ?wiki_book wdt:P50 ?wiki_author. ?wiki_author wdt:P21 ?gender.
            ?gender rdfs:label ?gender_label filter (lang(?gender_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

# Name the three result columns and add a constant `Books` counter for the chart to sum.
dataframe = (
    pandas.DataFrame(create_graph().query(q))
    .set_axis(['Book', 'Reading Date', 'Author Gender'], axis=1)
    .assign(Books=1)
)

# Stacked bars per reading year, normalised so each bar shows author-gender shares.
share_of_books = altair.Y('sum(Books)', stack="normalize", axis=altair.Axis(format='%'), title='Percentage')
gender_colour = altair.Color('Author Gender', scale=altair.Scale(scheme='set2'))
chart = (
    altair.Chart(dataframe, title='Books by author gender over time (normalised).')
    .mark_bar()
    .encode(x='year(Reading Date):T', y=share_of_books, color=gender_colour)
)
display(altair.layer(chart).properties(width=1000, height=300, title=''))

In [10]:
# Query for the author age/gender scatter plot.
# For every Book with a Wikidata id and a reading event start date, the OPTIONAL
# block asks the remote Wikidata endpoint for: wdt:P577 of the book (?publication),
# its wdt:P50 value (?wiki_author), that author's wdt:P569 (?dob) and wdt:P21
# (?gender), plus the English labels of the gender, the book and the author.
# Because the remote block is OPTIONAL, books it does not match still produce a
# row with those variables unbound.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?publication ?dob ?gender_label ?book_label ?author_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
             ?wiki_book wdt:P577 ?publication. ?wiki_book wdt:P50 ?wiki_author.       
             ?wiki_author wdt:P569 ?dob. ?wiki_author wdt:P21 ?gender.
             ?gender rdfs:label ?gender_label filter (lang(?gender_label) = "en").
             ?wiki_book rdfs:label ?book_label filter (lang(?book_label) = "en").   
             ?wiki_author rdfs:label ?author_label filter (lang(?author_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

def _leading_year(value):
    """Return the signed year at the start of a date-like value's string form."""
    text = str(value)
    # A leading '-' marks a negative year; slicing the first four characters
    # blindly would swallow the sign and drop the last digit of the year.
    if text.startswith('-'):
        return -int(text[1:5])
    return int(text[:4])


def calculate_age(row):
    """Return the author's age in whole years at the time of publication.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'Publication Date' and 'Author Birth'; each value's string form
            must start with a four-digit year, optionally preceded by '-'.

    Returns:
        int: publication year minus birth year, or the string 'not valid'
        when either value is missing (None or NaN).
    """
    publication = row['Publication Date']
    birth = row['Author Birth']
    # `pandas.isna` covers both None and NaN; a NaN previously reached `int('nan')`
    # and raised ValueError.
    if pandas.isna(publication) or pandas.isna(birth):
        return 'not valid'
    return _leading_year(publication) - _leading_year(birth)

# Name the seven result columns of the query.
dataframe = pandas.DataFrame(create_graph().query(q)).set_axis(
    ['Book', 'Reading Date', 'Publication Date', 'Author Birth', 'Author Gender', 'Title', 'Author'],
    axis=1,
)

# Compute the author's age per row, then drop rows flagged with the 'not valid' marker.
dataframe['Author Age'] = dataframe.apply(calculate_age, axis=1)
has_age = ~dataframe['Author Age'].isin(['not valid'])
dataframe = dataframe.loc[has_age]

# Interactive scatter: one point per row, author age against reading year,
# coloured by author gender, with title/author/age shown in the tooltip.
gender_colour = altair.Color('Author Gender', scale=altair.Scale(scheme='set2'))
chart = (
    altair.Chart(dataframe, title='Books by author age/gender over time.')
    .mark_point(size=40)
    .encode(
        x='year(Reading Date):T',
        y='Author Age',
        color=gender_colour,
        tooltip=['Title', 'Author', 'Author Age'],
    )
    .interactive()
)
display(altair.layer(chart).properties(width=1000, height=300, title=''))