# (Semi-)Automatic Mapping of two skos:ConceptSchemes based on the prefLabels of their Concepts

In [None]:
# Need to extract prefLabel and uri from the ConceptSchemes

In [None]:
!ls

In [None]:
!ls out

In [None]:
from rdflib import Graph, SKOS

In [None]:
# Load the Bouterwek vocabulary from its .ttl file into a graph of its own.
bouterwek = Graph()
bouterwek.parse("out/bouterwek.ttl")

In [None]:
# Load the Eschenburg vocabulary from its .ttl file into a graph of its own.
eschenburg = Graph()
eschenburg.parse("out/eschenburg.ttl")

In [None]:
# Print the SKOS namespace definition provided by the project's `dlod` package
# (presumably for inspection only).
# NOTE(review): no other visible cell uses SKOSNAMESPACE; the mapping cells
# build their triples with rdflib's `SKOS` instead.
from dlod.ontologies import SKOSNAMESPACE
print(SKOSNAMESPACE)

In [None]:
def pref_label_to_term_dict(graph:Graph) -> list:
    """Collect the URI and skos:prefLabel of every skos:Concept in ``graph``.

    A SPARQL SELECT is run against ``graph``. Each result row becomes one
    dictionary: the first bound value (``?uri``) is stored unchanged under
    ``"id"``, the second (``?label``) is converted with ``str`` and stored
    under ``"label"``.

    Returns:
        A list of these dictionaries, in the order the query yields its rows.
    """
    sparql = """

    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?uri ?label WHERE {
        ?uri a skos:Concept ;
            skos:prefLabel ?label
    }
    """

    return [
        {"id": row[0], "label": str(row[1])}
        for row in graph.query(sparql)
    ]

In [None]:
# Extract one {"id", "label"} record per labelled skos:Concept from each scheme.
bouterwek_terms = pref_label_to_term_dict(bouterwek)
eschenburg_terms = pref_label_to_term_dict(eschenburg)

In [None]:
#bouterwek_terms

In [None]:
bouterwek_terms[0]

> The property skos:closeMatch is used to link two concepts that are sufficiently similar that they can be used interchangeably in some information retrieval applications. In order to avoid the possibility of "compound errors" when combining mappings across more than two concept schemes, skos:closeMatch is not declared to be a transitive property.

> The property skos:exactMatch is used to link two concepts, indicating a high degree of confidence that the concepts can be used interchangeably across a wide range of information retrieval applications. skos:exactMatch is a transitive property, and is a sub-property of skos:closeMatch.

https://www.w3.org/TR/skos-reference/#mapping

In [None]:
# Two result graphs, one per mapping direction, that receive the triples
# produced by the case-insensitive label comparison in the following cell.
# Open question: which SKOS mapping relation fits best, exactMatch or closeMatch?
# (The comparison cell currently asserts skos:closeMatch.)
eschenburg_exact_string_match_g = Graph()
bouterwek_exact_string_match_g = Graph()


In [None]:
# Compare every Bouterwek label with every Eschenburg label, ignoring case.
# Each hit is printed, remembered, and recorded as skos:closeMatch in both
# direction-specific graphs.
exact_matching_strings = []

for bouterwek_term in bouterwek_terms:
    # Lower-case the outer label once instead of once per inner iteration.
    bouterwek_label_lower = bouterwek_term["label"].lower()

    for eschenburg_term in eschenburg_terms:
        if eschenburg_term["label"].lower() != bouterwek_label_lower:
            continue

        print(f"Exact match on string: {eschenburg_term['label']}")
        exact_matching_strings.append(eschenburg_term['label'])

        bouterwek_uri = bouterwek_term["id"]
        eschenburg_uri = eschenburg_term["id"]
        eschenburg_exact_string_match_g.add((eschenburg_uri, SKOS.closeMatch, bouterwek_uri))
        bouterwek_exact_string_match_g.add((bouterwek_uri, SKOS.closeMatch, eschenburg_uri))
 

In [None]:
len(exact_matching_strings)

Exact string matching (case-insensitive, via lower-casing) identifies 9 matching labels, but it does not match concepts whose labels differ slightly in spelling, e.g. "Äsopische Fabel" and "Aesopische Fabel". An edit-distance measure might help here.

In [None]:
print(eschenburg_exact_string_match_g.serialize())

In [None]:
# Write the Eschenburg -> Bouterwek mapping to disk. The format is passed
# explicitly so the .ttl file contains Turtle whatever the library default is.
eschenburg_exact_string_match_g.serialize(destination="out/eschenburg_closeMatch_bouterwek.ttl", format="turtle")

In [None]:
print(bouterwek_exact_string_match_g.serialize())

In [None]:
# Write the Bouterwek -> Eschenburg mapping to disk. The format is passed
# explicitly so the .ttl file contains Turtle whatever the library default is.
bouterwek_exact_string_match_g.serialize(destination="out/bouterwek_closeMatch_eschenburg.ttl", format="turtle")

## Goethe mapping

In [None]:
# Load the Goethe vocabulary from its .ttl file into a graph of its own.
goethe = Graph()
goethe.parse("out/goethe.ttl")

In [None]:
# Only Goethe's "Dichtarten" are considered. The SPARQL query below keeps just
# those labelled concepts that are members of the designated collection.
goethe_dichtarten_query = """

 PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?uri ?label WHERE {
        
        ?uri a skos:Concept ;
            skos:prefLabel ?label .

        <https://genre.clscor.io/goethe/collection/dichtarten> skos:member ?uri .
    }

"""

goethe_sparql_result = goethe.query(goethe_dichtarten_query)

# One {"id", "label"} record per result row; the label is converted to a str.
goethe_dichtarten_terms = [
    {"id": row[0], "label": str(row[1])}
    for row in goethe_sparql_result
]



In [None]:
goethe_dichtarten_terms

In [None]:
# Run the same label comparison as for Bouterwek vs. Eschenburg, this time
# pairing Goethe with each of the two other schemes.
# TODO: the matching code was copied; it should become a reusable function.


# Open question: which SKOS mapping relation fits best, exactMatch or closeMatch?
# One result graph per scheme pair and per mapping direction.
goethe_to_eschenburg_exact_string_match_g = Graph()
eschenburg_to_goethe_exact_string_match_g = Graph()

goethe_to_bouterwek_exact_string_match_g = Graph()
bouterwek_to_goethe_exact_string_match_g = Graph()


In [None]:
def _add_close_matches(term, candidates, scheme_name, forward_g, backward_g):
    """Link ``term`` to every candidate whose label equals its own, ignoring case.

    For each matching candidate the match is printed, ``term closeMatch
    candidate`` is added to ``forward_g`` and the inverse triple to
    ``backward_g``.

    Args:
        term: dict with the keys ``"id"`` and ``"label"`` (the Goethe concept).
        candidates: iterable of dicts with the same keys.
        scheme_name: name of the candidates' scheme, used in the printed message.
        forward_g: graph receiving the term -> candidate triples.
        backward_g: graph receiving the candidate -> term triples.

    Returns:
        The labels of the matching candidates, in iteration order.
    """
    matched_labels = []
    wanted = term["label"].lower()

    for candidate in candidates:
        if candidate["label"].lower() != wanted:
            continue

        print(f"Exact match Goethe to {scheme_name} on string: {candidate['label']}")
        matched_labels.append(candidate["label"])

        forward_g.add((term["id"], SKOS.closeMatch, candidate["id"]))
        backward_g.add((candidate["id"], SKOS.closeMatch, term["id"]))

    return matched_labels


exact_matching_strings_goethe_bouterwek = []
exact_matching_strings_goethe_eschenburg = []

# Per Goethe term: Eschenburg first, then Bouterwek, so the printed messages
# keep the order of the original nested loops.
for goethe_term in goethe_dichtarten_terms:
    exact_matching_strings_goethe_eschenburg += _add_close_matches(
        goethe_term,
        eschenburg_terms,
        "Eschenburg",
        goethe_to_eschenburg_exact_string_match_g,
        eschenburg_to_goethe_exact_string_match_g,
    )
    exact_matching_strings_goethe_bouterwek += _add_close_matches(
        goethe_term,
        bouterwek_terms,
        "Bouterwek",
        goethe_to_bouterwek_exact_string_match_g,
        bouterwek_to_goethe_exact_string_match_g,
    )
            
            

In [None]:
# Write each Goethe mapping graph to its own file. The format is passed
# explicitly so the .ttl files contain Turtle whatever the library default is.

# closeMatch Goethe to Eschenburg
goethe_to_eschenburg_exact_string_match_g.serialize(destination="out/goethe_closeMatch_eschenburg.ttl", format="turtle")

# closeMatch Eschenburg to Goethe
eschenburg_to_goethe_exact_string_match_g.serialize(destination="out/eschenburg_closeMatch_goethe.ttl", format="turtle")

# closeMatch Goethe to Bouterwek
goethe_to_bouterwek_exact_string_match_g.serialize(destination="out/goethe_closeMatch_bouterwek.ttl", format="turtle")

# closeMatch Bouterwek to Goethe
bouterwek_to_goethe_exact_string_match_g.serialize(destination="out/bouterwek_closeMatch_goethe.ttl", format="turtle")

In [None]:
print(exact_matching_strings_goethe_bouterwek)

In [None]:
len(exact_matching_strings_goethe_bouterwek)

In [None]:
print(exact_matching_strings_goethe_eschenburg)

In [None]:
len(exact_matching_strings_goethe_eschenburg)