# OpenRefine to SKOS

Convert the OpenRefine export (TSV) of the enriched affiliations to a preliminary SKOS vocabulary.

In [1]:
!ls ../export

DHd-Affiliations.tsv


Use the export 2022-10-26: enriched in OpenRefine (could of course do more)... 

In [2]:
refine_output_file = "../export/" + "DHd-Affiliations.tsv"

## Table as pandas data frame

In [3]:
import pandas as pd

In [4]:
import numpy as np

In [5]:
#convert the tsv to a dataframe
df = pd.read_csv(refine_output_file, sep='\t', header=0)

In [6]:
df

Unnamed: 0,orig_string,country,city,city_wd,parent_institution_normalized,parent_type,parent_wd,parent_wikidata_uri,parent_reconci,parent_factgrid,...,parent_ROR_link,parent_url,institutional_subunit_normalized,subunit_wd,subunit_wikidata_uri,subunit_reconci,subunit_factgrid,subunit_ROR,subunit_ROR_link,subunit_url
0,Austrian Centre for Digital Humanities and Cul...,Österreich,Wien,Q1741,Österreichische Akademie der Wissenschaften,Akademie,Q299015,http://www.wikidata.org/entity/Q299015,Österreichische Akademie der Wissenschaften,Q399974,...,https://ror.org/03anc3s24,,Austrian Centre for Digital Humanities and Cul...,Q30268470,http://www.wikidata.org/entity/Q30268470,Austrian Centre for Digital Humanities and Cul...,,028bsh698,https://ror.org/028bsh698,
1,"Bayerische Akademie der Wissenschaftten, Germany",Deutschland,München,Q1726,Bayerische Akademie der Wissenschaften,Akademie,Q684415,http://www.wikidata.org/entity/Q684415,Bayerische Akademie der Wissenschaften,Q164305,...,https://ror.org/001rdaz60,,,,,,,,,
2,"Bayerische Staatsbibliothek, Germany",Deutschland,München,Q1726,Bayerische Staatsbibliothek,Bibliothek,Q256507,http://www.wikidata.org/entity/Q256507,Bayerische Staatsbibliothek,Q153197,...,https://ror.org/031h71w90,,,,,,,,,
3,Berlin-Brandenburgische Akademie der Wissensch...,Deutschland,Berlin,Q64,Berlin-Brandenburgische Akademie der Wissensch...,Akademie,Q219989,http://www.wikidata.org/entity/Q219989,Berlin-Brandenburgische Akademie der Wissensch...,Q230420,...,https://ror.org/05jgq9443,,,,,,,,,
4,"Computational Humanities Group, Universität Le...",Deutschland,Leipzig,Q2079,Universität Leipzig,Universität,Q154804,http://www.wikidata.org/entity/Q154804,Universität Leipzig,Q22261,...,https://ror.org/03s7gtk40,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Wittgenstein-Archiv an der Universität Bergen,...",Norwegen,Bergen,Q26793,Universität Bergen,Universität,Q204457,http://www.wikidata.org/entity/Q204457,Universität Bergen,,...,https://ror.org/03zga2b32,,Wittgenstein Archiv,,,,,,,
996,Zentrum für Informationsmodellierung - Austria...,Österreich,Graz,Q13298,Karl-Franzens-Universität Graz,Universität,Q622683,http://www.wikidata.org/entity/Q622683,Universität Graz,Q175366,...,https://ror.org/01faaaf77,,Zentrum für Informationsmodellierung - Austria...,,,,,,,
997,Zentrum für Technologietransfer und Telekommun...,Deutschland,Worms,Q3852,FH Worms,Universität,Q1391239,http://www.wikidata.org/entity/Q1391239,Hochschule Worms,,...,https://ror.org/031ph8d53,,Zentrum für Technologietransfer und Telekommun...,,,,,,,
998,"Zentrum Technik und Gesellschaft / TU Berlin, ...",Deutschland,Berlin,Q64,Technische Universität Berlin,Universität,Q51985,http://www.wikidata.org/entity/Q51985,Technische Universität Berlin,Q405632,...,https://ror.org/03v4gjf40,,Zentrum Technik und Gesellschaft,,,,,,,


In [7]:
unique_parent_orgs = df["parent_institution_normalized"].unique()

In [8]:
len(unique_parent_orgs)

319

In [9]:
#Function to generate short hashes:
import hashlib
def shorthash(textstring:str, length:int=8):
    """
    Generate a truncated SHA-1 hash of a string.

    The string is encoded as UTF-8, hashed with SHA-1, and the first
    ``length`` characters of the hexadecimal digest are returned.

    Parameters
    ----------
    textstring : str
        Text to hash.
    length : int, optional
        Number of leading hex characters to keep (default: 8).

    Returns
    -------
    str
        Prefix of the hexadecimal SHA-1 digest.
    """
    # Local names are chosen so that neither the builtin hash() nor this
    # function's own name is shadowed inside the body.
    digest = hashlib.sha1(textstring.encode("UTF-8")).hexdigest()
    return digest[:length]

In [10]:
# set the baseuri for the new skos-terms:
baseuri = "https://dig-hum.de/vocabs/affiliations"

In [11]:
def label_to_uri(textstring:str):
    """
    Build the URI of a term from its label.

    The label is lowercased and passed to ``shorthash``; the resulting
    short hash is appended to ``baseuri`` as a fragment identifier
    (``<baseuri>#<hash>``). Lowercasing first means that labels differing
    only in case map to the same URI.
    """
    fragment = shorthash(textstring.lower())
    return f"{baseuri}#{fragment}"

In [12]:
label_to_uri("Universität zu Köln")

'https://dig-hum.de/vocabs/affiliations#0f92bbab'

Testing how to iterate and find rows:

In [13]:
unique_parent_orgs[11]

'Fachhochschule Potsdam'

In [14]:
df.loc[df['parent_institution_normalized'] == unique_parent_orgs[11]]

Unnamed: 0,orig_string,country,city,city_wd,parent_institution_normalized,parent_type,parent_wd,parent_wikidata_uri,parent_reconci,parent_factgrid,...,parent_ROR_link,parent_url,institutional_subunit_normalized,subunit_wd,subunit_wikidata_uri,subunit_reconci,subunit_factgrid,subunit_ROR,subunit_ROR_link,subunit_url
13,"Fachhochschule Potsdam, Germany",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
14,"Fachhochschule Potsdam, UCLAB",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
16,"FH Potsdam, Deutschland",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
71,"UCLAB, Fachhochschule Potsdam, Germany",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
125,"Fachhochschule Potsdam, Deutschland",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
126,"Fachhochschule Potsdam, Fachbereich Informatio...",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
127,Fachhochschule Potsdam,Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
227,"UCLAB, Fachhochschule Potsdam, Deutschland",Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
229,FH Potsdam,Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,
403,University of Appliend Sciences Potsdam,Deutschland,Potsdam,Q1711,Fachhochschule Potsdam,Fachhochschule,Q896706,http://www.wikidata.org/entity/Q896706,Fachhochschule Potsdam,,...,https://ror.org/012m9bp23,,,,,,,,,


In [15]:
# get distinct labels
df.loc[df['parent_institution_normalized'] == unique_parent_orgs[11]]["orig_string"].unique()

array(['Fachhochschule Potsdam, Germany', 'Fachhochschule Potsdam, UCLAB',
       'FH Potsdam, Deutschland',
       'UCLAB, Fachhochschule Potsdam, Germany',
       'Fachhochschule Potsdam, Deutschland',
       'Fachhochschule Potsdam, Fachbereich Informationswissenschaften, RDMO, Deutschland',
       'Fachhochschule Potsdam',
       'UCLAB, Fachhochschule Potsdam, Deutschland', 'FH Potsdam',
       'University of Appliend Sciences Potsdam',
       'Urban Complexity Lab, Fachhochschule Potsdam, Deutschland',
       'Fachbereich Informationswissenschaften, Fachhochschule Potsdam, Deutschland'],
      dtype=object)

In [16]:
#get subunits
df.loc[df['parent_institution_normalized'] == unique_parent_orgs[11]]["institutional_subunit_normalized"].unique()

array([nan, 'Fachbereich Informationswissenschaften'], dtype=object)

## Some notes on modeling the keywords in SKOS

From the [SKOS Primer](https://www.w3.org/TR/2009/NOTE-skos-primer-20090818/#secalt):

`skos:prefLabel "animals"@en;`

`skos:altLabel "creatures"@en;`

also use:
> A hidden lexical label, represented by means of the skos:hiddenLabel property, is a lexical label for a resource, where a KOS designer would like that character string to be accessible to applications performing text-based indexing and search operations, but would not like that label to be visible otherwise. Hidden labels may for instance be used to include misspelled variants of other lexical labels. For example:

`skos:hiddenLabel "betes"@fr.`

e.g.:
```
skos:prefLabel "Authorship attribution" ;
skos:hiddenLabel "authorship attibution" ;
``` 


Semantic Links:

> `skos:broader` and `skos:narrower` enable the representation of hierarchical links, such as the relationship between one genre and its more specific species, or, depending on interpretations, the relationship between one whole and its parts;
> `skos:related` enables the representation of associative (non-hierarchical) links, such as the relationship between one type of event and a category of entities which typically participate in it. Another use for skos:related is between two categories where neither is more general or more specific. Note that skos:related enables the representation of associative (non-hierarchical) links, which can also be used to represent part-whole links that are not meant as hierarchical relationships.

owl:sameAs vs. skos:exactMatch:
> Note on `skos:exactMatch` vs. `owl:sameAs`: SKOS provides `skos:exactMatch` to map concepts with equivalent meaning, and intentionally does not use owl:sameAs from the OWL ontology language [OWL]. When two resources are linked with `owl:sameAs` they are considered to be the same resource, and triples involving these resources are merged. This does not fit what is needed in most SKOS applications.         

## Transform an affiliation into a SKOS concept
I will do some testing and then come up with a function to transform the affiliations based on `unique_parent_orgs`.

In [17]:
import rdflib

In [18]:
from rdflib import Namespace, URIRef, RDF, RDFS, Literal, Graph, XSD, SKOS, DCTERMS, DC

In [19]:
def add_ns(graph):
    """
    Bind the namespace prefixes used in this notebook to a graph.

    Binds "skos", "dcterms" and "dc" (in that order) on the given graph
    and returns the same graph object so the call can be chained.
    """
    prefix_bindings = (
        ("skos", SKOS),
        ("dcterms", DCTERMS),
        ("dc", DC),
    )
    for prefix, namespace in prefix_bindings:
        graph.bind(prefix, namespace)
    return graph

In [20]:
# Set up the concept scheme: one graph holding the skos:ConceptScheme
# resource (identified by the base URI) and its descriptive metadata.
vocab_g = rdflib.Graph()
VOCAB = URIRef(baseuri)

# The bilingual name of the vocabulary is used for both skos:prefLabel and dc:title.
scheme_names = (
    Literal("DHd Institutionszugehörigkeiten", lang="de"),
    Literal("DHd Affiliations", lang="en"),
)

scheme_triples = [(VOCAB, RDF.type, SKOS.ConceptScheme)]
scheme_triples += [(VOCAB, SKOS.prefLabel, scheme_name) for scheme_name in scheme_names]

# add documentation in Dublin Core (dct vs. dc elements?)
scheme_triples += [(VOCAB, DC.title, scheme_name) for scheme_name in scheme_names]
scheme_triples += [
    (VOCAB, DC.creator, Literal("Ingo Börner")),
    (VOCAB, DC.contributor, Literal("TF DHd Abstracts")),
    (VOCAB, DC.date, Literal("2022")),
    (VOCAB, DC.relation, URIRef("https://zenodo.org/communities/dhd")),
]

for scheme_triple in scheme_triples:
    vocab_g.add(scheme_triple)

# ...

print(vocab_g.serialize(format="ttl"))

@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

<https://dig-hum.de/vocabs/affiliations> a skos:ConceptScheme ;
    dc:contributor "TF DHd Abstracts" ;
    dc:creator "Ingo Börner" ;
    dc:date "2022" ;
    dc:relation <https://zenodo.org/communities/dhd> ;
    dc:title "DHd Institutionszugehörigkeiten"@de,
        "DHd Affiliations"@en ;
    skos:prefLabel "DHd Institutionszugehörigkeiten"@de,
        "DHd Affiliations"@en .




In [21]:
def type_to_collection_uri(type_string:str):
    """
    Generate the URI of a collection from its label.

    The label is lowercased, the umlauts ö/ä/ü are transliterated to
    oe/ae/ue and spaces become underscores. The result is appended to
    ``baseuri`` as ``#<slug>_collection``. All other characters are kept
    unchanged.
    """
    # Single-pass character substitution; none of the replacement strings
    # contains a character that is itself substituted.
    substitutions = str.maketrans({"ö": "oe", "ä": "ae", "ü": "ue", " ": "_"})
    slug = type_string.lower().translate(substitutions)
    return f"{baseuri}#{slug}_collection"

In [22]:
type_to_collection_uri("Universität")

'https://dig-hum.de/vocabs/affiliations#universitaet_collection'

In [23]:
def affiliation_as_concept(normalized_label:str):
    """
    Generate the SKOS graph for one affiliation (parent institution) and its subunits.

    Uses "parent_institution_normalized" as input "normalized_label". The
    function reads the module-level dataframe ``df`` (all rows whose
    "parent_institution_normalized" equals ``normalized_label``) and the
    concept scheme URI ``VOCAB``.

    The returned graph contains:

    * the parent institution as a ``skos:Concept`` that is a top concept of
      ``VOCAB``, with
        - ``skos:prefLabel``: the normalized label (tagged "de"),
        - ``skos:altLabel``: values of "parent_reconci" that differ from the
          normalized label (tagged "de"),
        - ``skos:hiddenLabel``: the "orig_string" values of rows without a subunit,
        - ``skos:exactMatch``: values of "parent_wikidata_uri" and "parent_ROR_link";
    * one ``skos:Concept`` per distinct "institutional_subunit_normalized" value
      of this parent, linked to the parent via ``skos:broader`` / ``skos:narrower``,
      with its own labels and ``skos:exactMatch`` links taken only from the rows
      of that subunit within this parent;
    * ``skos:Collection`` resources for each "parent_type" and "city" value,
      with the parent concept as ``skos:member``.

    Parameters
    ----------
    normalized_label : str
        A value of the "parent_institution_normalized" column.

    Returns
    -------
    rdflib.Graph
        A new graph with the triples described above.
    """
    def distinct(column_values):
        # distinct non-missing cell values, in order of first appearance
        return column_values.dropna().unique()

    g = rdflib.Graph()
    g = add_ns(g)

    #get the data from the dataframe df
    data = df.loc[df['parent_institution_normalized'] == normalized_label]

    #generate an URI and add the concept to the graph
    CONCEPT = URIRef(label_to_uri(normalized_label))
    g.add(( CONCEPT, RDF.type, SKOS.Concept ))

    #use normalized label as skos:prefLabel
    #maybe have to add a language tag consequently to the graph and edit where necessary?
    #for now the label is simply assumed to be German
    g.add(( CONCEPT, SKOS.prefLabel, Literal(normalized_label, lang="de") ))

    #Wikidata and ROR links of the parent
    for match_column in ("parent_wikidata_uri", "parent_ROR_link"):
        for match_uri in distinct(data[match_column]):
            g.add(( CONCEPT, SKOS.exactMatch, URIRef(match_uri) ))

    #factgrid
    #should add, but need URI scheme

    #hidden labels, need all rows that are not subunits
    parent_rows_vanilla = data[data['institutional_subunit_normalized'].isna()]
    for parent_hidden_label in distinct(parent_rows_vanilla["orig_string"]):
        g.add(( CONCEPT, SKOS.hiddenLabel, Literal(parent_hidden_label) ))

    #maybe add the reconciled label as altLabel
    for reconci_label in distinct(data["parent_reconci"]):
        if reconci_label != normalized_label:
            g.add(( CONCEPT, SKOS.altLabel, Literal(reconci_label, lang="de") ))

    #the subunits of this parent
    for subunit_label in distinct(data["institutional_subunit_normalized"]):
        full_label = normalized_label + " / " + subunit_label
        SUBUNIT = URIRef(label_to_uri(full_label))

        # Select the subunit rows from this parent's rows only. The subunit URI
        # is derived from parent + subunit label, so an equally named subunit of
        # another institution is a different concept and must not contribute
        # labels or links here.
        subunit_data = data.loc[data['institutional_subunit_normalized'] == subunit_label]

        g.add(( SUBUNIT, RDF.type, SKOS.Concept ))
        g.add(( SUBUNIT, SKOS.prefLabel, Literal(subunit_label, lang="de") ))
        g.add(( SUBUNIT, SKOS.altLabel, Literal(full_label, lang="de") ))

        #broader, narrower
        g.add(( SUBUNIT, SKOS.broader, CONCEPT ))
        g.add(( CONCEPT, SKOS.narrower, SUBUNIT ))

        #Wikidata and ROR links of the subunit; both are read from the rows of
        #this subunit, not from all rows of the parent
        for match_column in ("subunit_wikidata_uri", "subunit_ROR_link"):
            for match_uri in distinct(subunit_data[match_column]):
                g.add(( SUBUNIT, SKOS.exactMatch, URIRef(match_uri) ))

        #hidden labels
        for subunit_hidden_label in distinct(subunit_data["orig_string"]):
            g.add(( SUBUNIT, SKOS.hiddenLabel, Literal(subunit_hidden_label) ))

        #add to vocab
        g.add(( SUBUNIT, SKOS.inScheme, VOCAB ))

    # the Collections
    # use the type as group; also group by city
    # NOTE(review): type and city collections share the same URI pattern, so a
    # type and a city with the same label would end up in one collection.
    for group_column in ("parent_type", "city"):
        for group_label in distinct(data[group_column]):
            GROUP = URIRef(type_to_collection_uri(group_label))
            g.add(( GROUP, RDF.type, SKOS.Collection ))
            g.add(( GROUP, SKOS.prefLabel, Literal(group_label, lang="de") ))
            g.add(( GROUP, SKOS.member, CONCEPT ))

    #need to "hook" to concept scheme
    g.add(( CONCEPT, SKOS.inScheme, VOCAB ))
    # add this as a top concept
    g.add(( CONCEPT, SKOS.topConceptOf, VOCAB ))
    g.add(( VOCAB, SKOS.hasTopConcept, CONCEPT ))

    return g

In [24]:
test_label = "Universität zu Köln"
test_concept_g = affiliation_as_concept(test_label)

In [25]:
print(test_concept_g.serialize(format="ttl"))

@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

<https://dig-hum.de/vocabs/affiliations#koeln_collection> a skos:Collection ;
    skos:member <https://dig-hum.de/vocabs/affiliations#0f92bbab> ;
    skos:prefLabel "Köln"@de .

<https://dig-hum.de/vocabs/affiliations#universitaet_collection> a skos:Collection ;
    skos:member <https://dig-hum.de/vocabs/affiliations#0f92bbab> ;
    skos:prefLabel "Universität"@de .

<https://dig-hum.de/vocabs/affiliations#47f4bc6a> a skos:Concept ;
    skos:altLabel "Universität zu Köln / Institut für Medienkultur und Theater"@de ;
    skos:broader <https://dig-hum.de/vocabs/affiliations#0f92bbab> ;
    skos:exactMatch <http://www.wikidata.org/entity/Q101248538> ;
    skos:hiddenLabel "Institut für Medienkultur und Theater" ;
    skos:inScheme <https://dig-hum.de/vocabs/affiliations> ;
    skos:prefLabel "Institut für Medienkultur und Theater"@de .

<https://dig-hum.de/vocabs/affiliations#5939e6fc> a skos:Concept ;
    skos:altLabel "Universität 

In [26]:
%%time
#join everything
the_whole_g = rdflib.Graph()
the_whole_g = add_ns(the_whole_g)
# "+=" adds the triples to the existing graph in place; "g = g + other" would
# build a new graph and copy all accumulated triples on every iteration.
the_whole_g += vocab_g
#add the single concepts in a for-loop
for aff in unique_parent_orgs:
    # rows without a normalized parent institution show up as NaN; skip them
    if pd.notna(aff):
        concept_g = affiliation_as_concept(aff)
        the_whole_g += concept_g

CPU times: user 11.5 s, sys: 67.6 ms, total: 11.6 s
Wall time: 11.6 s


## Validation of the SKOS

In [None]:
#!pip install skosify

In [27]:
import skosify

In [28]:
voc = skosify.skosify(the_whole_g)



In [29]:
#messes up the namespaces, which I have to override after cleaning up
#voc.bind('crm', CRM, override=True)
#voc.bind('custom', CUSTOM, override=True)

In [30]:
#store the whole graph
voc.serialize(destination="../export/DHd-Affiliations.ttl", format="turtle")

<Graph identifier=N6693a4bc52104b0fa6142ad5d23db64f (<class 'rdflib.graph.Graph'>)>

## Visualize in Skosmos
I set up a skosmos instance using the docker-compose provided by skosmos here: https://github.com/NatLibFi/Skosmos

Updated skosmos: `config-docker-compose.ttl` in `dockerfiles`

```
:affiliations a skosmos:Vocabulary, void:Dataset ;
    dc:title "DHd Affiliations"@en ;
    dc:title "DHd Institutionszugehörigkeiten"@de ;
    skosmos:shortName "dhd-affiliations";
    dc:subject :cat_general ;
    void:uriSpace "https://dig-hum.de/vocabs/affiliations#";
    skosmos:language "de";
    skosmos:defaultLanguage "de";
    skosmos:showTopConcepts true ;
    skosmos:fullAlphabeticalIndex true ;
    skosmos:groupClass skos:Collection ;
    void:sparqlEndpoint <http://fuseki-cache:80/skosmos/sparql> ;
    skosmos:sparqlGraph <https://dig-hum.de/vocabs/affiliations> .
````
To load data:

`curl -I -X POST -H Content-Type:text/turtle -T DHd-Affiliations.ttl -G http://localhost:9030/skosmos/data --data-urlencode graph=https://dig-hum.de/vocabs/affiliations`