### **Birot querying**
*Paul Duchesne. 2020-08-31.*

Notebook of graph queries.

In [1]:
import altair
import numpy
import pandas
import pathlib
import rdflib
import unidecode

In [2]:
def create_graph():

    ''' Assemble ontology and mapped data into single graph.

    Reads `ontology/birot-ontology.ttl` and `data/birot-data.ttl`, both resolved
    against the parent of the current working directory, and merges their triples.

    Returns:
        rdflib.Graph containing the triples parsed from both Turtle files.
    '''

    root = pathlib.Path.cwd().parents[0]
    sources = (
        root / 'ontology' / 'birot-ontology.ttl',
        root / 'data' / 'birot-data.ttl',
    )

    g = rdflib.Graph()
    for source in sources:
        # The context manager guarantees the handle is closed once parsing is done.
        # Turtle documents are UTF-8 by specification, so do not rely on the platform default.
        with open(str(source), 'r', encoding='utf-8') as turtle:
            g += rdflib.Graph().parse(file=turtle, format='ttl')
    return g

def translation_detect(row):

    ''' Label a row as read in its original language or in translation.

    Both the 'Original Language' and 'Language' values are stringified and
    transliterated with unidecode before they are compared for equality.
    '''

    original = unidecode.unidecode(str(row['Original Language']))
    read_in = unidecode.unidecode(str(row['Language']))
    return 'Original Language' if original == read_in else 'Translation'

def extract_age(row):

    ''' Determine author age by subtracting birth date from publication date.

    Each date is turned into a fractional year (day-of-year / 365 plus the calendar
    year) and the two are subtracted, so the result is an approximation in years.

    Args:
        row: mapping or pandas row exposing 'Author Birth' and 'Publication Date'.

    Returns:
        Float age in years, or None when either value is falsy or cannot be
        parsed as a date.
    '''

    def fractional_year(value):
        # errors='coerce' yields NaT for unparseable input instead of raising.
        moment = pandas.to_datetime(value, errors='coerce')
        if pandas.isna(moment):
            return None
        return moment.dayofyear / 365 + moment.year

    if not row['Author Birth'] or not row['Publication Date']:
        return None

    try:
        birth_year = fractional_year(row['Author Birth'])
        book_year = fractional_year(row['Publication Date'])
    except (TypeError, ValueError, OverflowError, AttributeError):
        # Best effort: a value pandas cannot interpret as a single date gives no age.
        return None

    if birth_year is None or book_year is None:
        return None
    return book_year - birth_year

In [3]:
# Books read over time: one result row per distinct (book, reading start date) pair.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

reading_events = create_graph().query(q)
dataframe = pandas.DataFrame(reading_events)
dataframe.columns = ['Entity', 'Reading Year']
# Constant weight so the chart can aggregate with sum(Books).
dataframe['Books'] = 1

altair.themes.enable('fivethirtyeight')
books_axis = altair.Y('sum(Books)', stack="center", title='Books Read')
altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T',
    y=books_axis,
).properties(width=800, height=300)

In [4]:
# Publication date against reading date; the publication value is fetched from the
# Wikidata SPARQL service (wdt:P577) for the book's Wikidata id when one is available.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?publication 
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {?wiki_book wdt:P577 ?publication.}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

publication_rows = create_graph().query(q)
dataframe = pandas.DataFrame(publication_rows)
dataframe.columns = ['Entity', 'Reading Year', 'Publication Date']

# Error band first, mean line second: the layer below draws them in that order.
band = altair.Chart(dataframe).mark_errorband().encode(
    x='Reading Year:T',
    y='Publication Date:T',
)
line = altair.Chart(dataframe, title='').mark_line().encode(
    x='Reading Year:T',
    y='mean(Publication Date):T',
)
layered = altair.layer(band, line).properties(width=800, height=300, title='')
display(layered)

In [5]:
# Books read over time, split by the English label of the book's wdt:P495 value,
# which is looked up through the Wikidata SPARQL service.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?country_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
            ?wiki_book wdt:P495 ?country. ?country rdfs:label ?country_label filter (lang(?country_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

dataframe = pandas.DataFrame(create_graph().query(q))
dataframe.columns = ['Entity', 'Reading Year', 'Country Of Origin']
dataframe['Books'] = 1

# Drop rows without a country. The explicit copy detaches the filtered frame so the
# in-place relabelling below does not raise SettingWithCopyWarning.
dataframe = dataframe.loc[~dataframe['Country Of Origin'].isin([numpy.nan, None])].copy()

# Fold every country that has ten rows or fewer into a single 'Other' group.
for country in dataframe['Country Of Origin'].unique():
    rows = dataframe['Country Of Origin'].isin([country])
    if rows.sum() <= 10:
        dataframe.loc[rows, 'Country Of Origin'] = 'Other'

altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T', y=altair.Y('sum(Books)', stack="center", axis=None),
    color=altair.Color('Country Of Origin', scale=altair.Scale(scheme='set2'))).properties(width=800, height=300)

In [6]:
# Books read over time, split by the English label of the book's wdt:P136 value,
# which is looked up through the Wikidata SPARQL service.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?genre_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
            ?wiki_book wdt:P136 ?genre. ?genre rdfs:label ?genre_label filter (lang(?genre_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

genre_rows = create_graph().query(q)
dataframe = pandas.DataFrame(genre_rows)
dataframe.columns = ['Entity', 'Reading Year', 'Genre']
dataframe['Books'] = 1

# Relabel every genre that has seven rows or fewer as 'Other'.
for genre in dataframe['Genre'].unique():
    rows = dataframe['Genre'].isin([genre])
    if len(dataframe.loc[rows]) <= 7:
        dataframe.loc[rows, 'Genre'] = 'Other'

# Strip the labels, then discard missing genres, the 'Other' group and the listed broad genres.
dataframe['Genre'] = dataframe['Genre'].str.strip()
excluded = [numpy.nan, None, 'Other', 'essay', 'novella', 'novel']
dataframe = dataframe.loc[~dataframe['Genre'].isin(excluded)]

stream = altair.Y('sum(Books)', stack="center", axis=None)
palette = altair.Color('Genre', scale=altair.Scale(scheme='tableau20'))
altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T',
    y=stream,
    color=palette,
).properties(width=800, height=300)

In [7]:
# Books read over time, split by the label of the language recorded in the local graph.
# The English label of the book's wdt:P407 value is fetched from Wikidata alongside it
# and stored as 'Original Language'.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?origlang_label ?readlang_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
            ?wiki_book wdt:P407 ?origlang. ?origlang rdfs:label ?origlang_label filter (lang(?origlang_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        ?book <https://birot.github.io/ontology/hasLanguage> ?readlang.
        ?readlang rdfs:label ?readlang_label
        }
    """

language_rows = create_graph().query(q)
dataframe = pandas.DataFrame(language_rows)
dataframe.columns = ['Entity', 'Reading Year', 'Original Language', 'Language']
dataframe['Books'] = 1

stream = altair.Y('sum(Books)', stack="center", axis=None)
palette = altair.Color('Language', scale=altair.Scale(scheme='set2'))
altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T',
    y=stream,
    color=palette,
).properties(width=800, height=300)

In [8]:
# Drop rows without an original language BEFORE bucketing rare values. Otherwise a
# group of seven or fewer missing values would be relabelled 'Other' and survive the
# filter. The copy keeps the in-place relabelling below free of SettingWithCopyWarning.
dataframe = dataframe.loc[~dataframe['Original Language'].isin([numpy.nan, None])].copy()

# Fold every original language that has seven rows or fewer into 'Other'.
for language in dataframe['Original Language'].unique():
    rows = dataframe['Original Language'].isin([language])
    if rows.sum() <= 7:
        dataframe.loc[rows, 'Original Language'] = 'Other'
dataframe['Original Language'] = dataframe['Original Language'].str.strip()

altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T', y=altair.Y('sum(Books)', stack="center", axis=None),
    color=altair.Color('Original Language', scale=altair.Scale(scheme='tableau20'))).properties(width=800, height=300)

In [9]:
# Tag each row as 'Original Language' or 'Translation'; assign() returns a new frame,
# so the frame produced by the previous cell is not modified.
dataframe = dataframe.assign(Translation=dataframe.apply(translation_detect, axis=1))

stream = altair.Y('sum(Books)', stack="center", axis=None)
palette = altair.Color('Translation', scale=altair.Scale(scheme='tableau10'))
altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T',
    y=stream,
    color=palette,
).properties(width=800, height=300)

In [10]:
# Books read over time, split by author gender. Publication date (wdt:P577), author
# (wdt:P50), the author's wdt:P569 and wdt:P21 values and the English labels all come
# from one OPTIONAL Wikidata block, so a book missing any of them yields no values there.
q = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?book ?reading ?publication ?dob ?gender_label ?book_label ?author_label
    WHERE { 
        ?book a <https://birot.github.io/ontology/Book>. 
        ?book <https://birot.github.io/ontology/hasWikidataId> ?wiki.
        BIND (IRI(CONCAT(str(wd:),?wiki)) as ?wiki_book).
        OPTIONAL {service <https://query.wikidata.org/sparql> {
             ?wiki_book wdt:P577 ?publication. ?wiki_book wdt:P50 ?wiki_author.       
             ?wiki_author wdt:P569 ?dob. ?wiki_author wdt:P21 ?gender.
             ?gender rdfs:label ?gender_label filter (lang(?gender_label) = "en").
             ?wiki_book rdfs:label ?book_label filter (lang(?book_label) = "en").   
             ?wiki_author rdfs:label ?author_label filter (lang(?author_label) = "en").}}
        ?event <https://birot.github.io/ontology/hasBook> ?book.
        ?event <https://birot.github.io/ontology/hasStartDate> ?reading. 
        }
    """

dataframe = pandas.DataFrame(create_graph().query(q))
dataframe.columns = ['Entity', 'Reading Year', 'Publication Date', 'Author Birth', 'Author Gender', 'Title', 'Author']
dataframe['Books'] = 1

# Keep only rows with a known gender. The copy detaches the filtered frame so the
# column assignment below does not raise SettingWithCopyWarning.
dataframe = dataframe.loc[~dataframe['Author Gender'].isin([numpy.nan, None])].copy()
dataframe['Author Gender'] = dataframe['Author Gender'].str.title()
altair.Chart(dataframe, title='').mark_area().encode(
    x='Reading Year:T', y=altair.Y('sum(Books)', stack="center", axis=None),
    color=altair.Color('Author Gender', scale=altair.Scale(scheme='set2'))).properties(width=800, height=300)

In [11]:
# The incoming frame is a filtered selection from the previous cell; copy it before
# adding a column so pandas does not raise SettingWithCopyWarning.
dataframe = dataframe.copy()
dataframe['Author Age'] = dataframe.apply(extract_age, axis=1)
# Discard rows for which no age could be computed.
dataframe = dataframe.loc[~dataframe['Author Age'].isin([numpy.nan, None])]

altair.Chart(dataframe, title='').mark_point(size=40).encode(
    x='Reading Year:T', y='Author Age', color=altair.Color('Author Gender', scale=altair.Scale(scheme='set2')),
    tooltip=['Title', 'Author', 'Author Age']).interactive().properties(width=800, height=300, title='')