# (Semi-)Automatic Mapping of two skos:Concept schemes based on prefLabels of Concepts

In [1]:
# Need to extract prefLabel and uri from the ConceptSchemes

In [2]:
!ls

01_generate_skos.ipynb            [34mout[m[m
02_generate_skos.ipynb            requirements.txt
03_map_bouterwek_eschenburg.ipynb [34mskosmos[m[m
04_more_advanced_mapping.ipynb    [34mskostools[m[m
[34mdlod[m[m                              [34mvenv[m[m
[34mdocker[m[m


In [3]:
!ls out

bouterwek.ttl                       eschenburg_closeMatch_goethe.ttl
bouterwek_closeMatch_eschenburg.ttl eschenburg_terms.json
bouterwek_closeMatch_goethe.ttl     goethe.ttl
bouterwek_terms.json                goethe_closeMatch_bouterwek.ttl
eschenburg.ttl                      goethe_closeMatch_eschenburg.ttl
eschenburg_closeMatch_bouterwek.ttl goethe_dichtarten_terms.json


In [4]:
from rdflib import Graph, SKOS

In [5]:
bouterwek = Graph()
bouterwek.parse("out/bouterwek.ttl")

<Graph identifier=N346b7c89fe154344a7e19719c432720b (<class 'rdflib.graph.Graph'>)>

In [6]:
eschenburg = Graph()
eschenburg.parse("out/eschenburg.ttl")

<Graph identifier=N13559d0867a64ef2bb1ce997fb14db67 (<class 'rdflib.graph.Graph'>)>

In [7]:
from dlod.ontologies import SKOSNAMESPACE
print(SKOSNAMESPACE)

http://www.w3.org/2004/02/skos/core#


In [8]:
def pref_label_to_term_dict(graph:Graph) -> list:
    """Collect the URI and skos:prefLabel of every skos:Concept in ``graph``.

    Args:
        graph: Object exposing ``query(sparql)``; the notebook passes an
            ``rdflib.Graph`` holding a parsed concept scheme.

    Returns:
        A list with one dict per result row of the SPARQL query, shaped as
        ``{"id": <first column, unchanged>, "label": <second column as str>}``.
        A concept yields one entry per prefLabel the query returns for it.
    """
    query = """

    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?uri ?label WHERE {
        ?uri a skos:Concept ;
            skos:prefLabel ?label
    }
    """

    rows = graph.query(query)

    # Column 0 is ?uri (kept as returned), column 1 is ?label (coerced to a plain str).
    return [{"id": row[0], "label": str(row[1])} for row in rows]

In [9]:
bouterwek_terms = pref_label_to_term_dict(bouterwek)
eschenburg_terms = pref_label_to_term_dict(eschenburg)

In [10]:
#bouterwek_terms

In [11]:
bouterwek_terms[0]

{'id': rdflib.term.URIRef('https://genre.clscor.io/bouterwek/ballade'),
 'label': 'Ballade'}

> The property skos:closeMatch is used to link two concepts that are sufficiently similar that they can be used interchangeably in some information retrieval applications. In order to avoid the possibility of "compound errors" when combining mappings across more than two concept schemes, skos:closeMatch is not declared to be a transitive property.

> The property skos:exactMatch is used to link two concepts, indicating a high degree of confidence that the concepts can be used interchangeably across a wide range of information retrieval applications. skos:exactMatch is a transitive property, and is a sub-property of skos:closeMatch.

https://www.w3.org/TR/skos-reference/#mapping

In [12]:
# Open question: which SKOS mapping relation fits best - exactMatch or closeMatch?
# Two empty graphs, one per mapping direction; each is named after the scheme
# whose concepts end up as the subjects of the mapping triples.
eschenburg_exact_string_match_g = Graph()
bouterwek_exact_string_match_g = Graph()


In [13]:
exact_matching_strings = []

# All (Bouterwek, Eschenburg) term pairs whose labels are equal once lower-cased.
# Bouterwek is the outer iteration, so pairs come out in Bouterwek term order.
matching_pairs = [
    (b_term, e_term)
    for b_term in bouterwek_terms
    for e_term in eschenburg_terms
    if b_term["label"].lower() == e_term["label"].lower()
]

for b_term, e_term in matching_pairs:
    print(f"Exact match on string: {e_term['label']}")
    exact_matching_strings.append(e_term['label'])

    # Record the mapping in both directions, one graph per subject scheme.
    eschenburg_exact_string_match_g.add((e_term["id"], SKOS.closeMatch, b_term["id"]))
    bouterwek_exact_string_match_g.add((b_term["id"], SKOS.closeMatch, e_term["id"]))
 

Exact match on string: Elegie
Exact match on string: Lehrgedicht
Exact match on string: Lustspiel
Exact match on string: Trauerspiel
Exact match on string: Epigramm
Exact match on string: Oper
Exact match on string: Roman
Exact match on string: Dramatische Dichtungsart
Exact match on string: Epische Dichtungsart


In [14]:
len(exact_matching_strings)

9

Exact string matching (after lower-casing) identifies 9 matching labels, but it does not match concepts whose labels differ slightly in spelling, e.g. "Äsopische Fabel" and "Aesopische Fabel". An edit-distance measure might help here.

In [15]:
print(eschenburg_exact_string_match_g.serialize())

@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

<https://genre.clscor.io/eschenburg/dramatische_dichtungsart> skos:closeMatch <https://genre.clscor.io/bouterwek/dramatische_dichtungsart> .

<https://genre.clscor.io/eschenburg/elegie> skos:closeMatch <https://genre.clscor.io/bouterwek/elegie> .

<https://genre.clscor.io/eschenburg/epigramm> skos:closeMatch <https://genre.clscor.io/bouterwek/epigramm> .

<https://genre.clscor.io/eschenburg/epische_dichtungsart> skos:closeMatch <https://genre.clscor.io/bouterwek/epische_dichtungsart> .

<https://genre.clscor.io/eschenburg/lehrgedicht> skos:closeMatch <https://genre.clscor.io/bouterwek/lehrgedicht> .

<https://genre.clscor.io/eschenburg/lustspiel> skos:closeMatch <https://genre.clscor.io/bouterwek/lustspiel> .

<https://genre.clscor.io/eschenburg/oper> skos:closeMatch <https://genre.clscor.io/bouterwek/oper> .

<https://genre.clscor.io/eschenburg/roman> skos:closeMatch <https://genre.clscor.io/bouterwek/roman> .

<https://genre.cls

In [16]:
eschenburg_exact_string_match_g.serialize(destination="out/eschenburg_closeMatch_bouterwek.ttl")

<Graph identifier=N95ebeaaffd384a5bbf7c388979321e05 (<class 'rdflib.graph.Graph'>)>

In [17]:
print(bouterwek_exact_string_match_g.serialize())

@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

<https://genre.clscor.io/bouterwek/dramatische_dichtungsart> skos:closeMatch <https://genre.clscor.io/eschenburg/dramatische_dichtungsart> .

<https://genre.clscor.io/bouterwek/elegie> skos:closeMatch <https://genre.clscor.io/eschenburg/elegie> .

<https://genre.clscor.io/bouterwek/epigramm> skos:closeMatch <https://genre.clscor.io/eschenburg/epigramm> .

<https://genre.clscor.io/bouterwek/epische_dichtungsart> skos:closeMatch <https://genre.clscor.io/eschenburg/epische_dichtungsart> .

<https://genre.clscor.io/bouterwek/lehrgedicht> skos:closeMatch <https://genre.clscor.io/eschenburg/lehrgedicht> .

<https://genre.clscor.io/bouterwek/lustspiel> skos:closeMatch <https://genre.clscor.io/eschenburg/lustspiel> .

<https://genre.clscor.io/bouterwek/oper> skos:closeMatch <https://genre.clscor.io/eschenburg/oper> .

<https://genre.clscor.io/bouterwek/roman> skos:closeMatch <https://genre.clscor.io/eschenburg/roman> .

<https://genre.cls

In [18]:
bouterwek_exact_string_match_g.serialize(destination="out/bouterwek_closeMatch_eschenburg.ttl")

<Graph identifier=Nd51cad0f7a5c4a7e8c442df4dc2bed42 (<class 'rdflib.graph.Graph'>)>

## Goethe mapping

In [19]:
goethe = Graph()
goethe.parse("out/goethe.ttl")

<Graph identifier=Ne870df0888bc410483629d1b8c50ed79 (<class 'rdflib.graph.Graph'>)>

In [20]:
# Only Goethe's "Dichtarten" are used. The query keeps the usual Concept/prefLabel
# pattern and additionally requires the concept to be a skos:member of the
# dichtarten collection.
goethe_dichtarten_query = """

 PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?uri ?label WHERE {
        
        ?uri a skos:Concept ;
            skos:prefLabel ?label .

        <https://genre.clscor.io/goethe/collection/dichtarten> skos:member ?uri .
    }

"""

goethe_sparql_result = goethe.query(goethe_dichtarten_query)

# One record per result row: ?uri is kept as returned, ?label becomes a plain str.
goethe_dichtarten_terms = [
    {"id": row[0], "label": str(row[1])}
    for row in goethe_sparql_result
]



In [21]:
goethe_dichtarten_terms

[{'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/allegorie'),
  'label': 'Allegorie'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/ballade'),
  'label': 'Ballade'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/cantate'),
  'label': 'Cantate'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/drama'),
  'label': 'Drama'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/elegie'),
  'label': 'Elegie'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/epigramm'),
  'label': 'Epigramm'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/epistel'),
  'label': 'Epistel'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/epopoee'),
  'label': 'Epopöe'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/erzaehlung'),
  'label': 'Erzählung'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/fabel'),
  'label': 'Fabel'},
 {'id': rdflib.term.URIRef('https://genre.clscor.io/goethe/heroide'),


In [22]:
# run the same algorithm as the comparison with bouterwek and eschenburg
# copied the code, but should become a function at some point


# Open question: which SKOS mapping relation fits best - exactMatch or closeMatch?
# One empty graph per mapping direction between Goethe and Eschenburg ...
goethe_to_eschenburg_exact_string_match_g = Graph()
eschenburg_to_goethe_exact_string_match_g = Graph()

# ... and one per mapping direction between Goethe and Bouterwek.
goethe_to_bouterwek_exact_string_match_g = Graph()
bouterwek_to_goethe_exact_string_match_g = Graph()


In [23]:
def link_exact_label_matches(
    source_term: dict,
    source_name: str,
    target_terms: list,
    target_name: str,
    source_to_target_g: Graph,
    target_to_source_g: Graph,
) -> list:
    """Link ``source_term`` to every target term with the same lower-cased label.

    For each match a line is printed and a skos:closeMatch triple is added in
    both directions.

    Args:
        source_term: Dict with ``"id"`` and ``"label"`` keys.
        source_name: Scheme name used for the source in the printed message.
        target_terms: List of dicts with ``"id"`` and ``"label"`` keys.
        target_name: Scheme name used for the target in the printed message.
        source_to_target_g: Graph receiving ``(source, closeMatch, target)``.
        target_to_source_g: Graph receiving ``(target, closeMatch, source)``.

    Returns:
        The labels of the matching target terms, in ``target_terms`` order.
    """
    matched_labels = []
    # Lower-case the source label once instead of once per target term.
    wanted = source_term["label"].lower()

    for target_term in target_terms:
        if target_term["label"].lower() != wanted:
            continue

        print(f"Exact match {source_name} to {target_name} on string: {target_term['label']}")
        matched_labels.append(target_term["label"])

        source_to_target_g.add((source_term["id"], SKOS.closeMatch, target_term["id"]))
        target_to_source_g.add((target_term["id"], SKOS.closeMatch, source_term["id"]))

    return matched_labels


exact_matching_strings_goethe_bouterwek = []
exact_matching_strings_goethe_eschenburg = []

for goethe_term in goethe_dichtarten_terms:
    # Eschenburg is checked before Bouterwek for every Goethe term, which
    # fixes the order of the printed lines.
    exact_matching_strings_goethe_eschenburg.extend(
        link_exact_label_matches(
            goethe_term,
            "Goethe",
            eschenburg_terms,
            "Eschenburg",
            goethe_to_eschenburg_exact_string_match_g,
            eschenburg_to_goethe_exact_string_match_g,
        )
    )

    exact_matching_strings_goethe_bouterwek.extend(
        link_exact_label_matches(
            goethe_term,
            "Goethe",
            bouterwek_terms,
            "Bouterwek",
            goethe_to_bouterwek_exact_string_match_g,
            bouterwek_to_goethe_exact_string_match_g,
        )
    )
            
            

Exact match Goethe to Bouterwek on string: Ballade
Exact match Goethe to Eschenburg on string: Drama
Exact match Goethe to Eschenburg on string: Elegie
Exact match Goethe to Bouterwek on string: Elegie
Exact match Goethe to Eschenburg on string: Epigramm
Exact match Goethe to Bouterwek on string: Epigramm
Exact match Goethe to Eschenburg on string: Epistel
Exact match Goethe to Bouterwek on string: Epopöe
Exact match Goethe to Eschenburg on string: Heroide
Exact match Goethe to Eschenburg on string: Lehrgedicht
Exact match Goethe to Bouterwek on string: Lehrgedicht
Exact match Goethe to Bouterwek on string: Ode
Exact match Goethe to Eschenburg on string: Roman
Exact match Goethe to Bouterwek on string: Roman
Exact match Goethe to Eschenburg on string: Satire


In [24]:
# closeMatch Goethe to Eschenburg
goethe_to_eschenburg_exact_string_match_g.serialize(destination="out/goethe_closeMatch_eschenburg.ttl")

# closeMatch Eschenburg to Goethe
eschenburg_to_goethe_exact_string_match_g.serialize(destination="out/eschenburg_closeMatch_goethe.ttl")

# closeMatch Goethe to Bouterwek
goethe_to_bouterwek_exact_string_match_g.serialize(destination="out/goethe_closeMatch_bouterwek.ttl")

# closeMatch Bouterwek to Goethe
bouterwek_to_goethe_exact_string_match_g.serialize(destination="out/bouterwek_closeMatch_goethe.ttl")

<Graph identifier=Ne9ef182a4e9542529b43ddb7404a7098 (<class 'rdflib.graph.Graph'>)>

In [25]:
print(exact_matching_strings_goethe_bouterwek)

['Ballade', 'Elegie', 'Epigramm', 'Epopöe', 'Lehrgedicht', 'Ode', 'Roman']


In [26]:
len(exact_matching_strings_goethe_bouterwek)

7

In [27]:
print(exact_matching_strings_goethe_eschenburg)

['Drama', 'Elegie', 'Epigramm', 'Epistel', 'Heroide', 'Lehrgedicht', 'Roman', 'Satire']


In [28]:
len(exact_matching_strings_goethe_eschenburg)

8

In [29]:
import json

def store_terms(terms:list, filename:str, out_dir:str="out"):
    """Write ``terms`` as JSON to ``<out_dir>/<filename>.json``.

    Args:
        terms: JSON-serialisable list, e.g. the ``{"id": ..., "label": ...}``
            dicts built earlier in this notebook.
        filename: File name without directory and without the ``.json`` suffix.
        out_dir: Directory to write into. Defaults to ``"out"``, the location
            that was previously hard-coded. The directory is not created here.

    Returns:
        None. An existing file at the target path is overwritten.
    """
    filepath = out_dir + "/" + filename + ".json"

    # json.dump keeps its default ensure_ascii=True, so non-ASCII label
    # characters are escaped and the written bytes are the same as before.
    # The explicit encoding only removes the dependency on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(terms, f)

In [30]:
## Save the terms for later re-use
store_terms(goethe_dichtarten_terms, "goethe_dichtarten_terms")
store_terms(bouterwek_terms, "bouterwek_terms")
store_terms(eschenburg_terms, "eschenburg_terms")