Skip to content

Commit

Permalink
[New] Add term occurrences and rdf2csv module
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Kulich committed Jan 24, 2023
1 parent d1be140 commit 886c508
Show file tree
Hide file tree
Showing 4 changed files with 446 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* TermIt Copyright (C) 2019 Czech Technical University in Prague
* <p>
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later
* version.
* <p>
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
* <p>
* You should have received a copy of the GNU General Public License along with this program. If not, see
* <https://www.gnu.org/licenses/>.
*/
package cz.cvut.spipes.constants;

/**
 * Application-wide constants.
 * <p>
 * Holds the TermIt vocabulary IRIs used to describe term occurrences, the names of RDFa attributes
 * (see {@link RDFa}) and the local names of properties attached to extracted term occurrences.
 * <p>
 * All non-ASCII characters in the IRIs are written as Unicode escapes so that the resulting values do not
 * depend on the encoding the source file is compiled with.
 */
public class Constants {

    /**
     * Namespace shared by the TermIt vocabulary terms declared in this class.
     */
    private static final String TERMIT_NS = "http://onto.fel.cvut.cz/ontologies/application/termit/pojem/";

    /**
     * IRI of the "term occurrence" class.
     */
    public static final String VYSKYT_TERMU = TERMIT_NS + "v\u00fdskyt-termu";

    /**
     * IRI of the "is occurrence of term" property.
     */
    public static final String JE_VYSKYT_TERMU = TERMIT_NS + "je-v\u00fdskytem-termu";

    /**
     * IRI of the "has end position" property.
     */
    public static final String MA_KONCOVOU_POZICI = TERMIT_NS + "m\u00e1-koncovou-pozici";

    /**
     * IRI of the "has start position" property.
     */
    public static final String MA_STARTOVNI_POZICI = TERMIT_NS + "m\u00e1-startovn\u00ed-pozici";

    /**
     * Misspelled alias of {@link #MA_STARTOVNI_POZICI}, retained so that existing references keep compiling.
     * New code should use {@link #MA_STARTOVNI_POZICI}.
     */
    public static final String MA_STARTOVNi_POZICI = MA_STARTOVNI_POZICI;

    /**
     * Local name of the property holding the score of an annotation.
     */
    public static final String SCORE = "score";

    /**
     * Local name of the property holding the whole annotated text.
     */
    public static final String WHOLE_TEXT = "whole-text";

    /**
     * Local name of the property holding the annotation markup itself.
     */
    public static final String REFERENCES_ANNOTATION = "references-annotation";

    /**
     * Local name of the property holding the plain text the annotation refers to.
     */
    public static final String REFERENCES_TEXT = "references-text";

    /**
     * Local name of the property holding the start offset of an annotation in the text.
     */
    public static final String ANNOTATION_IN_TEXT_START = "annotation-in-text-start";

    /**
     * Local name of the property holding the end offset of an annotation in the text.
     */
    public static final String ANNOTATION_IN_TEXT_END = "annotation-in-text-end";

    private Constants() {
        // Constant holder, never instantiated.
        throw new AssertionError();
    }

    /**
     * Constants from the RDFa vocabulary.
     */
    public static final class RDFa {

        /**
         * RDFa property attribute.
         */
        public static final String PROPERTY = "property";

        /**
         * RDFa context identifier attribute.
         */
        public static final String ABOUT = "about";

        /**
         * RDFa content attribute.
         */
        public static final String CONTENT = "content";

        /**
         * RDFa type identifier attribute.
         */
        public static final String TYPE = "typeof";

        /**
         * RDFa resource identifier.
         */
        public static final String RESOURCE = "resource";

        /**
         * RDFa prefix attribute.
         */
        public static final String PREFIX = "prefix";

        private RDFa() {
            // Constant holder, never instantiated.
            throw new AssertionError();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
package cz.cvut.spipes.modules;

import cz.cvut.spipes.constants.CSVW;
import cz.cvut.spipes.constants.KBSS_MODULE;
import cz.cvut.spipes.constants.SML;
import cz.cvut.spipes.constants.Constants;
import cz.cvut.spipes.engine.ExecutionContext;
import cz.cvut.spipes.engine.ExecutionContextFactory;
import cz.cvut.spipes.exception.ResourceNotFoundException;
import cz.cvut.spipes.modules.textAnalysis.Extraction;
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StreamResourceRegistry;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.jena.rdf.model.*;
import org.apache.jena.vocabulary.RDF;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 * Module that turns annotated HTML stored in the input model into RDF descriptions of term occurrences.
 * <p>
 * For every resource typed as {@code CSVW.RowUri} the value of the {@code WO_text} property (resolved against the
 * configured data prefix) is parsed as HTML and handed to {@link Extraction#getTermOccurrences}. Each returned
 * entry is then materialized in the input model as a resource typed as {@link Constants#VYSKYT_TERMU}.
 * <p>
 * Configuration parameters: {@code sml:replace}, {@code :data-prefix} and {@code :source-resource-uri}.
 * NOTE(review): {@code isReplace} and {@code sourceResource} are loaded but not used by {@link #executeSelf()};
 * the returned context always wraps the (extended) input model.
 */
public class ExtractTermOccurrencesModule extends AnnotatedAbstractModule {

    private static final Logger LOG = LoggerFactory.getLogger(ExtractTermOccurrencesModule.class);

    private static final String TYPE_URI = KBSS_MODULE.uri + "extract-term-occurrences";
    private static final String TYPE_PREFIX = TYPE_URI + "/";

    private final Property P_DATA_PREFIX = getSpecificParameter("data-prefix");
    private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");

    //sml:replace
    private boolean isReplace;

    //:data-prefix
    private String dataPrefix;

    //:source-resource-uri
    private StreamResource sourceResource;

    /**
     * Collects term occurrences from all rows of the input model and adds their description to that model.
     *
     * @return execution context wrapping the input model enriched with the term occurrence resources
     */
    @Override
    protected ExecutionContext executeSelf() {
        Model inputRDF = this.getExecutionContext().getDefaultModel();

        ResIterator rows = inputRDF.listResourcesWithProperty(RDF.type, CSVW.RowUri);
        Map<String, List<Element>> annotatedElements = new HashMap<>();

        Extraction extraction = new Extraction();
        extraction.addPrefix("ddo", "http://onto.fel.cvut.cz/ontologies/application/termit/pojem/");

        // The rows are fully consumed before the model is modified below.
        Property textProperty = createProperty("WO_text");
        rows.forEach(row -> {
            String text = row.getRequiredProperty(textProperty).getObject().toString();
            Document doc = Jsoup.parse(StringEscapeUtils.unescapeJava(text));
            annotatedElements.putAll(extraction.getTermOccurrences(doc.root()));
        });

        annotatedElements.forEach((occurrenceUri, elements) -> addTermOccurrence(inputRDF, occurrenceUri, elements));
        return ExecutionContextFactory.createContext(inputRDF);
    }

    /**
     * Describes a single term occurrence in the given model, based on the first annotated element.
     * <p>
     * Entries without any element, or whose element has no parent element, are skipped with a warning
     * instead of failing the whole module run.
     *
     * @param model         model the description is added to
     * @param occurrenceUri identifier of the term occurrence resource
     * @param elements      annotated elements found for the occurrence, only the first one is used
     */
    private void addTermOccurrence(Model model, String occurrenceUri, List<Element> elements) {
        if (elements == null || elements.isEmpty()) {
            LOG.warn("No annotated element available for term occurrence {}, skipping it.", occurrenceUri);
            return;
        }
        Element annotation = elements.get(0);
        Element parent = annotation.parent();
        if (parent == null) {
            LOG.warn("Annotated element of term occurrence {} has no parent element, skipping it.", occurrenceUri);
            return;
        }

        Resource occurrence = model.createResource(occurrenceUri);
        occurrence.addProperty(RDF.type, ResourceFactory.createResource(Constants.VYSKYT_TERMU));
        addScore(model, occurrence, annotation);

        String annotationText = annotation.text();
        String parentText = parent.text();
        int start = parentText.indexOf(annotationText);
        if (start < 0) {
            // Values are still written (as before) so that consumers see the same properties on every occurrence.
            LOG.warn("Text of term occurrence {} was not found in the text of its parent element.", occurrenceUri);
        }

        addLiteral(occurrence, createProperty(Constants.WHOLE_TEXT), StringEscapeUtils.unescapeJava(parent.html()));
        addLiteral(occurrence, createProperty(Constants.REFERENCES_ANNOTATION),
            StringEscapeUtils.unescapeJava(annotation.toString()));
        addLiteral(occurrence, createProperty(Constants.REFERENCES_TEXT), parentText);
        addLiteral(occurrence, ResourceFactory.createProperty(Constants.JE_VYSKYT_TERMU), annotationText);
        addLiteral(occurrence, ResourceFactory.createProperty(Constants.MA_STARTOVNi_POZICI), start);
        addLiteral(occurrence, ResourceFactory.createProperty(Constants.MA_KONCOVOU_POZICI),
            start + annotationText.length());
    }

    /**
     * Adds the score of the annotation as a typed float literal, if the element carries a score attribute.
     * A score that cannot be parsed as a float is logged and left out.
     */
    private void addScore(Model model, Resource occurrence, Element annotation) {
        if (!annotation.hasAttr(Constants.SCORE)) {
            return;
        }
        String rawScore = annotation.attr(Constants.SCORE);
        try {
            occurrence.addLiteral(createProperty(Constants.SCORE), model.createTypedLiteral(Float.valueOf(rawScore)));
        } catch (NumberFormatException e) {
            LOG.warn("Ignoring score '{}' of term occurrence {}, it is not a valid number.", rawScore, occurrence);
        }
    }

    private void addLiteral(Resource resource, Property property, Object value) {
        resource.addLiteral(property, value);
    }

    /**
     * Creates a property whose URI is the configured data prefix followed by the given local name.
     */
    private Property createProperty(String uriRef) {
        return ResourceFactory.createProperty(getDataPrefix() + uriRef);
    }

    @Override
    public String getTypeURI() {
        return TYPE_URI;
    }

    /**
     * Loads {@code sml:replace}, {@code :source-resource-uri} and {@code :data-prefix} from the module configuration.
     *
     * @throws ResourceNotFoundException if no stream resource is registered under the configured URI
     */
    @Override
    public void loadConfiguration() {
        isReplace = getPropertyValue(SML.replace, false);
        sourceResource = getResourceByUri(getEffectiveValue(P_SOURCE_RESOURCE_URI).asLiteral().toString());
        dataPrefix = getEffectiveValue(P_DATA_PREFIX).asLiteral().toString();
    }

    /**
     * Looks up the stream resource registered under the given URI.
     *
     * @throws ResourceNotFoundException if the registry does not know the resource
     */
    @NotNull
    private StreamResource getResourceByUri(@NotNull String resourceUri) {
        StreamResource res = StreamResourceRegistry.getInstance().getResourceByUrl(resourceUri);
        if (res == null) {
            throw new ResourceNotFoundException("Stream resource " + resourceUri + " not found. ");
        }
        return res;
    }

    /**
     * Creates a module-specific parameter property, i.e. one in the namespace of this module type.
     */
    private static Property getSpecificParameter(String localPropertyName) {
        return ResourceFactory.createProperty(TYPE_PREFIX + localPropertyName);
    }

    public boolean isReplace() {
        return isReplace;
    }

    public void setReplace(boolean replace) {
        isReplace = replace;
    }

    public StreamResource getSourceResource() {
        return sourceResource;
    }

    public void setSourceResource(StreamResource sourceResource) {
        this.sourceResource = sourceResource;
    }

    public String getDataPrefix() {
        return dataPrefix;
    }

    public void setDataPrefix(String dataPrefix) {
        this.dataPrefix = dataPrefix;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package cz.cvut.spipes.modules;

import cz.cvut.spipes.constants.KBSS_MODULE;
import cz.cvut.spipes.engine.ExecutionContext;
import cz.cvut.spipes.engine.ExecutionContextFactory;
import org.apache.jena.rdf.model.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.StringJoiner;

import static java.lang.Integer.*;

/**
 * Module that serializes row resources of the input model into a CSV file.
 * <p>
 * A row is the subject of any statement whose object equals the configured data prefix followed by {@code Row}.
 * Rows are ordered by their {@code DocumentLineNumber} and written to the configured output path, one line per
 * row, preceded by a header line. The input model itself is passed on unchanged.
 * <p>
 * Configuration parameters: {@code :data-prefix} and {@code :file-output-path}.
 */
public class RDF2CSVModule extends AbstractModule {

    public static final String TYPE_URI = KBSS_MODULE.uri + "RDF2CSV";
    private static final Logger LOG = LoggerFactory.getLogger(RDF2CSVModule.class);

    private static final String SEPARATOR = ",";

    private static final String[] HEADER = {
        "DocumentId", "DocumentLineNumber", "WorkOrderId", "TaskCardId",
        "ComponentURI", "ComponentLabel", "ComponentScore", "MultipleComponents",
        "FailureURI", "FailureLabel", "FailureScore", "MultipleFailures",
        "AggregateScore", "IsConfirmed", "OriginalText", "AnnotatedText"
    };

    private final Property P_DATA_PREFIX = getSpecificParameter("data-prefix");
    private final Property P_FILE_OUTPUT_PATH = getSpecificParameter("file-output-path");

    //:data-prefix
    private String dataPrefix;

    //:file-output-path
    private String fileOutputPath;

    /**
     * Writes all rows of the input model into the configured CSV file (overwriting it) using UTF-8.
     * <p>
     * An I/O failure is logged and does not abort the pipeline; the input model is returned in either case.
     *
     * @return execution context wrapping the unchanged input model
     */
    @Override
    ExecutionContext executeSelf() {
        Model inputRDF = this.getExecutionContext().getDefaultModel();

        // Explicit charset: the written texts may contain non-ASCII characters and must not depend on the platform.
        try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(fileOutputPath), StandardCharsets.UTF_8)) {
            writeStringsIntoRow(writer, HEADER);
            writer.append("\n");

            for (Resource row : listRowsOrderedByLineNumber(inputRDF)) {
                writeRow(writer, inputRDF, row);
                writer.append("\n");
            }
        } catch (IOException e) {
            LOG.error("Unable to write CSV output into {}.", fileOutputPath, e);
        }
        return ExecutionContextFactory.createContext(inputRDF);
    }

    /**
     * Finds all row resources and sorts them by their document line number in ascending order.
     */
    private List<Resource> listRowsOrderedByLineNumber(Model model) {
        String rowUri = dataPrefix + "Row";
        List<Resource> rows = model
            .listStatements()
            .filterKeep(st -> st.getObject().toString().equals(rowUri))
            .mapWith(Statement::getSubject)
            .toList();

        Property lineNumber = model.getProperty(dataPrefix + "DocumentLineNumber");
        rows.sort(Comparator.comparingInt((Resource row) -> lineNumberOf(row, lineNumber)));
        return rows;
    }

    /**
     * Reads the line number of a row.
     * <p>
     * The lexical form of a literal is used so that typed literals (whose string representation carries the
     * datatype) can be parsed as well. Rows without a line number are sorted to the end.
     *
     * @throws NumberFormatException if the value is not an integer
     */
    private static int lineNumberOf(Resource row, Property lineNumberProperty) {
        Statement st = row.getProperty(lineNumberProperty);
        if (st == null) {
            return Integer.MAX_VALUE;
        }
        RDFNode object = st.getObject();
        String lexicalForm = object.isLiteral() ? object.asLiteral().getLexicalForm() : object.toString();
        return Integer.parseInt(lexicalForm.trim());
    }

    /**
     * Writes the values of one row resource in the order given by the header.
     */
    private void writeRow(BufferedWriter writer, Model model, Resource row) throws IOException {
        // NOTE(review): "TODO" is the property name used for the document id in the original code - confirm.
        writeStringsIntoRow(writer,
            getStringValue(valueOf(model, row, "TODO")),
            getStringValue(valueOf(model, row, "DocumentLineNumber")),
            getStringValue(valueOf(model, row, "WorkOrderId")),
            getStringValue(valueOf(model, row, "TaskCardId")),
            getStringValue(valueOf(model, row, "ComponentUri")),
            getLiteralValue(valueOf(model, row, "ComponentLabel")),
            getLiteralValue(valueOf(model, row, "ComponentScore")),
            getMultipleObjectValues(row.listProperties(model.getProperty(dataPrefix + "MultipleComponents"))),
            getStringValue(valueOf(model, row, "FailureUri")),
            getLiteralValue(valueOf(model, row, "FailureLabel")),
            getLiteralValue(valueOf(model, row, "FailureScore")),
            getMultipleObjectValues(row.listProperties(model.getProperty(dataPrefix + "MultipleFailures"))),
            getLiteralValue(valueOf(model, row, "AggregateScore")),
            getStringValue(valueOf(model, row, "IsConfirmed")),
            getStringValue(valueOf(model, row, "OriginalText")),
            getStringValue(valueOf(model, row, "AnnotatedText")));
    }

    /**
     * Returns a statement of the row with the property identified by the data prefix and the given local name,
     * or {@code null} if the row has no such statement.
     */
    private Statement valueOf(Model model, Resource row, String localName) {
        return row.getProperty(model.getProperty(dataPrefix + localName));
    }

    /**
     * Joins the values of all statements of the iterator with {@code "; "}.
     */
    private String getMultipleObjectValues(StmtIterator iterator) {
        StringJoiner joined = new StringJoiner("; ");
        while (iterator.hasNext()) {
            joined.add(getLiteralValue(iterator.next()));
        }
        return joined.toString();
    }

    /**
     * Appends the given values to the writer, each followed by the separator.
     * <p>
     * The separator after the last value is kept on purpose so that the layout of the produced file stays
     * the same as before. Values are escaped, see {@link #escapeCsv(String)}.
     */
    private void writeStringsIntoRow(BufferedWriter simpleWriter, String... values) throws IOException {
        for (String value : values) {
            simpleWriter.append(escapeCsv(value)).append(SEPARATOR);
        }
    }

    /**
     * Escapes a value so that it forms exactly one CSV field.
     * <p>
     * Values containing the separator, a double quote or a line break are wrapped in double quotes and
     * embedded double quotes are doubled. Other values are returned unchanged.
     */
    private static String escapeCsv(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.contains(SEPARATOR)
            || value.indexOf('"') >= 0
            || value.indexOf('\n') >= 0
            || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    @Override
    public String getTypeURI() {
        return TYPE_URI;
    }

    /**
     * Loads {@code :data-prefix} and {@code :file-output-path} from the module configuration.
     */
    @Override
    public void loadConfiguration() {
        dataPrefix = getEffectiveValue(P_DATA_PREFIX).asLiteral().toString();
        fileOutputPath = getEffectiveValue(P_FILE_OUTPUT_PATH).asLiteral().toString();
    }

    /**
     * Creates a module-specific parameter property, i.e. one in the namespace of this module type.
     */
    private static Property getSpecificParameter(String localPropertyName) {
        return ResourceFactory.createProperty(TYPE_URI + "/" + localPropertyName);
    }

    /**
     * Returns the value of the literal object of the statement as a string.
     * <p>
     * A missing statement yields an empty string. An object that is not a literal is rendered using its string
     * representation instead of failing.
     */
    private String getLiteralValue(Statement st) {
        if (st == null) {
            return "";
        }
        RDFNode node = st.getObject();
        if (node == null) {
            return "";
        }
        if (!node.isLiteral()) {
            return node.toString();
        }
        return String.valueOf(node.asNode().getLiteralValue());
    }

    /**
     * Returns the string representation of the object of the statement, or an empty string if there is none.
     */
    private String getStringValue(Statement st) {
        if (st == null) {
            return "";
        }
        RDFNode node = st.getObject();
        if (node == null) {
            return "";
        }
        String value = node.toString();
        return value == null ? "" : value;
    }

}

0 comments on commit 886c508

Please sign in to comment.