Skip to content

Commit

Permalink
[Upd] Create termit-compliant term occurrence resources
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Kulich committed Feb 17, 2023
1 parent fd02472 commit 98bbcc2
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ ExecutionContext executeSelf(){
RDFList columns = columnsList.as(RDFList.class);
if (columns == null || columns.isEmpty()) {
LOG.warn("Columns list not found or is empty in the columns statement.");
return null;
return ExecutionContextFactory.createContext(inputRDF);
}

List<String> header = columns.asJavaList().stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
import cz.cvut.spipes.constants.KBSS_MODULE;
import cz.cvut.spipes.constants.SML;
import cz.cvut.spipes.engine.ExecutionContext;
import cz.cvut.spipes.engine.ExecutionContextFactory;
import cz.cvut.spipes.modules.constants.Constants;
import cz.cvut.spipes.modules.textAnalysis.Extraction;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.jena.datatypes.xsd.impl.XSDBaseStringType;
import org.apache.jena.rdf.model.*;
import org.apache.jena.vocabulary.RDF;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -34,18 +35,38 @@
* :x a csvw:row
* :csat-wo-tc "4339272" ;
* :tc-reference "52-610-00-04" ;
* :wo-text "<span about="_:a970-5" property="ddo:je-výskytem-termu" resource="http://example.com/term/missing-part" typeof="ddo:výskyt-termu" score="0.5">finding</span>" ;
* :wo-text "in the &lt;span about=\"_:a877-16\" property=\"ddo:je-výskytem-termu\" resource=\"http://onto.fel.cvut.cz/ontologies/slovnik/slovnik-komponent-a-zavad---novy/pojem/cvr-ulb\" typeof=\"ddo:výskyt-termu\" score=\"0.25\"&gt;cockpit&lt;/span&gt;" ;
* </code></pre>
* </p>
* The expected output:
* <pre><code>
* _:a970-5 a termit:výskyt-termu ;
* termit:je-výskytem-termu <http://example.com/term/missing-part>;
* :references-annotation "<span about="_:a970-5" property="ddo:je-výskytem-termu" resource="http://example.com/term/missing-part" typeof="ddo:výskyt-termu" score="0.5">finding</span>" ;
* termit:má-přesný-text-quote "finding"
* termit:má-startovní-pozici "0"^^integer ;
* termit:má-koncovou-pozici "7"^^integer ;
* termit:má-skóre "0.5"^^integer ;
* &lt;http://onto.fel.cvut.cz/ontologies/application/termit/pojem/výskyt-termu/instance2fc6112ce9a960c21918569bef6a4151&gt; a termit-pojem:výskyt-termu ;
* termit-pojem:je-přiřazením-termu &lt;http://onto.fel.cvut.cz/ontologies/slovnik/slovnik-komponent-a-zavad---novy/pojem/cvr-ulb&gt; ;
* termit-pojem:má-cíl
* [ a termit-pojem:cíl-výskytu ;
* termit-pojem:má-selektor
* [ a termit-pojem:selektor-pozici-v-textu ;
* termit-pojem:má-koncovou-pozici
* "14"^^&lt;http://www.w3.org/2001/XMLSchema#int&gt; ;
* termit-pojem:má-startovní-pozici
* "7"^^&lt;http://www.w3.org/2001/XMLSchema#int&gt;
* ] ;
* termit-pojem:má-selektor
* [ a termit-pojem:selektor-text-quote> ;
* termit-pojem:má-prefix-text-quote
* "in the " ;
* termit-pojem:má-přesný-text-quote
* "cockpit" ;
* termit-pojem:má-suffix-text-quote
* ""
* ]
* ];
* termit-pojem:má-skóre
* "0.25"^^&lt;http://www.w3.org/2001/XMLSchema#float&gt; ;
* termit-pojem:odkazuje-na-anotovaný-text
* "in the &lt;span about=\"_:a877-16\" property=\"ddo:je-výskytem-termu\" resource=\"http://onto.fel.cvut.cz/ontologies/slovnik/slovnik-komponent-a-zavad---novy/pojem/cvr-ulb\" typeof=\"ddo:výskyt-termu\" score=\"0.25\"&gt;cockpit&lt;/span&gt;" ;
* termit-pojem:odkazuje-na-anotaci
* "&lt;span about=\"_:a877-16\" property=\"ddo:je-výskytem-termu\" resource=\"http://onto.fel.cvut.cz/ontologies/slovnik/slovnik-komponent-a-zavad---novy/pojem/cvr-ulb\" typeof=\"ddo:výskyt-termu\" score=\"0.25\"&gt;cockpit&lt;/span&gt;" .
* .
* </code></pre>
*/
Expand All @@ -65,6 +86,7 @@ public class ExtractTermOccurrencesModule extends AnnotatedAbstractModule {
@Override
protected ExecutionContext executeSelf() {
Model inputRDF = this.getExecutionContext().getDefaultModel();
Model outputModel = ModelFactory.createDefaultModel();

Map<String, List<Element>> annotatedElements = new HashMap<>();

Expand All @@ -80,33 +102,44 @@ protected ExecutionContext executeSelf() {
}
);

annotatedElements.forEach((key, el) -> {
Element e = el.get(0);
Resource res = inputRDF.createResource(key);
res.addProperty(RDF.type, ResourceFactory.createResource(Constants.VYSKYT_TERMU));

if(e.hasAttr(Constants.SCORE)){
res.addLiteral(
ResourceFactory.createProperty(Constants.MA_SKORE),
inputRDF.createTypedLiteral(Float.valueOf(e.attr(Constants.SCORE)))
);
}

assert e.parentNode() != null;
String parentTag = ((Element) e.parentNode()).text();

res.addProperty(ResourceFactory.createProperty(Constants.JE_VYSKYT_TERMU), ResourceFactory.createResource(fullIri(e.attr(Constants.RDFa.RESOURCE))));
addLiteral(res, ResourceFactory.createProperty(Constants.WHOLE_TEXT), StringEscapeUtils.unescapeJava(((Element) e.parentNode()).html()));
addLiteral(res, ResourceFactory.createProperty(Constants.REFERENCES_ANNOTATION), StringEscapeUtils.unescapeJava(e.toString()));
addLiteral(res, ResourceFactory.createProperty(Constants.MA_PRESNY_TEXT_QUOTE), parentTag);
addLiteral(res, ResourceFactory.createProperty(Constants.MA_STARTOVNI_POZICI), parentTag.indexOf(e.text()));
addLiteral(res, ResourceFactory.createProperty(Constants.MA_KONCOVOU_POZICI), parentTag.indexOf(e.text()) + e.text().length());
});
return ExecutionContextFactory.createContext(inputRDF);
annotatedElements.forEach((key, el) -> createTermOccurrenceResources(outputModel, el.get(0)));
return this.createOutputContext(isReplace, inputRDF, outputModel);
}

private void addLiteral(Resource resource, Property property, Object value){
resource.addLiteral(property, value);
private void createTermOccurrenceResources(Model outputModel, Element e) {
assert e.parentNode() != null;
String hash = DigestUtils.md5Hex(StringEscapeUtils.unescapeJava(e.toString()));
Resource termOccurrence = outputModel.createResource(Constants.VYSKYT_TERMU + "/instance" + hash);
Resource occurrenceTarget = outputModel.createResource();
Resource positionSelector = outputModel.createResource();
Resource textSelector = outputModel.createResource();

termOccurrence.addProperty(RDF.type, Constants.VYSKYT_TERMU_RESOURCE);
termOccurrence.addProperty(Constants.JE_PRIRAZENIM_TERMU, outputModel.createResource(fullIri(e.attr(Constants.RDFa.RESOURCE))));
termOccurrence.addProperty(Constants.MA_CIL, occurrenceTarget);
termOccurrence.addLiteral(Constants.ODKAZUJE_NA_ANOTOVANY_TEXT, StringEscapeUtils.unescapeJava(((Element) e.parentNode()).html()));
termOccurrence.addLiteral(Constants.ODKAZUJE_NA_ANOTACI, StringEscapeUtils.unescapeJava(e.toString()));
if(e.hasAttr(Constants.SCORE)){
termOccurrence.addLiteral(Constants.MA_SKORE, outputModel.createTypedLiteral(Float.valueOf(e.attr(Constants.SCORE))));
}

occurrenceTarget.addProperty(RDF.type, Constants.CIL_VYSKYTU);
occurrenceTarget.addProperty(Constants.MA_SELEKTOR, textSelector);
occurrenceTarget.addProperty(Constants.MA_SELEKTOR, positionSelector);

positionSelector.addProperty(RDF.type, Constants.SELEKTOR_POZICI_V_TEXTU);
textSelector.addProperty(RDF.type, Constants.SELEKTOR_TEXT_QUOTE);

String parentTag = ((Element) e.parentNode()).text();
String textQuote = ((TextNode) e.childNodes().get(0)).text();
String prefix = parentTag.substring(0, parentTag.indexOf(textQuote));
String suffix = parentTag.substring(parentTag.indexOf(textQuote) + textQuote.length());

positionSelector.addLiteral(Constants.MA_STARTOVNI_POZICI, Integer.valueOf(parentTag.indexOf(e.text())));
positionSelector.addLiteral(Constants.MA_KONCOVOU_POZICI, Integer.valueOf(parentTag.indexOf(e.text()) + e.text().length()));
textSelector.addLiteral(Constants.MA_PRESNY_TEXT_QUOTE, textQuote);
textSelector.addLiteral(Constants.MA_PREFIX_TEXT_QUOTE, prefix);
textSelector.addLiteral(Constants.MA_SUFFIX_TEXT_QUOTE, suffix);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,42 @@
*/
package cz.cvut.spipes.modules.constants;

import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.rdf.model.ResourceFactory;

/**
* Application-wide constants.
*/
public class Constants {

protected static Resource resource(String local )
{ return ResourceFactory.createResource( termitUri + local ); }

protected static Property property(String local )
{ return ResourceFactory.createProperty( termitUri + local ); }

public static final String termitUri = "http://onto.fel.cvut.cz/ontologies/application/termit/pojem/";
public static final String VYSKYT_TERMU = termitUri + "výskyt-termu";
public static final String JE_VYSKYT_TERMU = termitUri + "je-výskytem-termu";
public static final String MA_KONCOVOU_POZICI = termitUri + "má-koncovou-pozici";
public static final String MA_STARTOVNI_POZICI = termitUri + "má-startovní-pozici";
public static final String MA_SKORE = termitUri + "má-skóre";
public static final String WHOLE_TEXT = termitUri + "whole-text";
public static final String REFERENCES_ANNOTATION = termitUri + "references-annotation";
public static final String MA_PRESNY_TEXT_QUOTE = termitUri + "má-přesný-text-quote";

public static final String SCORE = "score";
public static final String VYSKYT_TERMU = termitUri + "výskyt-termu";

public static final Resource VYSKYT_TERMU_RESOURCE = resource("výskyt-termu");
public static final Resource CIL_VYSKYTU = resource("cíl-výskytu");
public static final Resource SELEKTOR_POZICI_V_TEXTU = resource("selektor-pozici-v-textu");
public static final Resource SELEKTOR_TEXT_QUOTE = resource("selektor-text-quote");

public static final Property MA_CIL = property("má-cíl");
public static final Property JE_PRIRAZENIM_TERMU = property("je-přiřazením-termu");
public static final Property MA_SELEKTOR = property("má-selektor");
public static final Property MA_PREFIX_TEXT_QUOTE = property("má-prefix-text-quote");
public static final Property MA_PRESNY_TEXT_QUOTE = property("má-přesný-text-quote");
public static final Property MA_SUFFIX_TEXT_QUOTE = property("má-suffix-text-quote");
public static final Property MA_KONCOVOU_POZICI = property("má-koncovou-pozici");
public static final Property MA_STARTOVNI_POZICI = property("má-startovní-pozici");
public static final Property ODKAZUJE_NA_ANOTACI = property("odkazuje-na-anotaci");
public static final Property ODKAZUJE_NA_ANOTOVANY_TEXT = property("odkazuje-na-anotovaný-text");
public static final Property MA_SKORE = property("má-skóre");

private Constants() {
throw new AssertionError();
Expand Down

0 comments on commit 98bbcc2

Please sign in to comment.