Skip to content

Commit

Permalink
[New] Add HTML to RDF support
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Kulich committed Dec 7, 2022
1 parent ec9fe7b commit a5da8c2
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 0 deletions.
5 changes: 5 additions & 0 deletions s-pipes-modules/module-tabular/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@
<groupId>cz.cvut.kbss.jopa</groupId>
<artifactId>ontodriver-jena</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
<dependency>
<groupId>cz.cvut.kbss</groupId>
<artifactId>s-pipes-test</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package cz.cvut.spipes.constants;

/**
 * Names of the HTML tags that are relevant when reading tables.
 * <p>
 * This is a pure constants holder and is not meant to be instantiated.
 */
public class HTML {

    /** Tag of a table row. */
    public static final String TABLE_ROW_TAG = "tr";

    /** Tag of a regular (data) table cell. */
    public static final String TABLE_CELL_TAG = "td";

    /** Tag of a header table cell. */
    public static final String TABLE_HEADER_TAG = "th";

    // Hidden constructor: the class only exposes static constants.
    private HTML() {
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import cz.cvut.spipes.exception.ResourceNotUniqueException;
import cz.cvut.spipes.modules.model.*;
import cz.cvut.spipes.modules.util.BNodesTransformer;
import cz.cvut.spipes.modules.util.HTML2CSVConvertor;
import cz.cvut.spipes.modules.util.JopaPersistenceUtils;
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StreamResourceRegistry;
Expand Down Expand Up @@ -77,6 +78,10 @@
* ]
* </code></pre>
* <p>
* This module can also be used to process HTML tables. First, the HTML table is converted to CSV
* and then processed as usual.
* Take a look at the option {@link TabularModule#processHTMLFile} and class {@link HTML2CSVConvertor} for more details.
* <p>
* <b>Important notes (differences from the recommendation):</b><br/>
* Does not support custom table group URIs.<br/>
* Does not support custom table URIs. <br/>
Expand All @@ -95,6 +100,7 @@ public class TabularModule extends AbstractModule {
private final Property P_OUTPUT_MODE = getSpecificParameter("output-mode");
private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");
private final Property P_SKIP_HEADER = getSpecificParameter("skip-header");
private final Property P_PROCESS_HTML_FILE = getSpecificParameter("process-html-file");

//sml:replace
private boolean isReplace;
Expand All @@ -114,6 +120,9 @@ public class TabularModule extends AbstractModule {
//:skip-header
private boolean skipHeader;

//:process-html-file
private boolean processHTMLFile;

//:output-mode
private Mode outputMode;

Expand Down Expand Up @@ -142,6 +151,14 @@ public class TabularModule extends AbstractModule {

@Override
ExecutionContext executeSelf() {

if(processHTMLFile) {
HTML2CSVConvertor htmlConvertor = new HTML2CSVConvertor();
setSourceResource(htmlConvertor.convertToCSV(sourceResource));
setDelimiter(',');
setQuoteCharacter('\"');
}

BNodesTransformer bNodesTransformer = new BNodesTransformer();
Model inputModel = bNodesTransformer.convertBNodesToNonBNodes(executionContext.getDefaultModel());
boolean hasInputSchema = false;
Expand Down Expand Up @@ -346,6 +363,7 @@ public void loadConfiguration() {
isReplace = getPropertyValue(SML.replace, false);
delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier());
skipHeader = getPropertyValue(P_SKIP_HEADER, false);
processHTMLFile = getPropertyValue(P_PROCESS_HTML_FILE, false);
acceptInvalidQuoting = getPropertyValue(P_ACCEPT_INVALID_QUOTING, false);
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(delimiter));
dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
Expand Down Expand Up @@ -509,6 +527,10 @@ public void setSkipHeader(boolean skipHeader) {
this.skipHeader = skipHeader;
}

/**
 * Turns processing of HTML input on or off.
 * <p>
 * When the flag is set, {@code executeSelf()} first converts the source resource
 * with {@code HTML2CSVConvertor} and then switches the delimiter to {@code ','}
 * and the quote character to {@code '"'} before the usual processing starts.
 *
 * @param processHTMLFile {@code true} to treat the source resource as an HTML table
 */
public void setProcessHTMLFile(boolean processHTMLFile) {
this.processHTMLFile = processHTMLFile;
}

private String[] getHeaderFromSchema(Model inputModel, String[] header, boolean hasInputSchema) {
if (hasInputSchema) {
List<String> orderList = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.constants.HTML;
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StringStreamResource;
import org.apache.commons.text.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
* This class can be used to read the HTML table from input and convert it to CSV file.
* The HTML table must contain at least these two tags ({@literal <}td/>, {@literal <}tr/>) to be processed correctly.
* The recommended format is shown in the example below:
* <table>
* <tr>
* <th>Column 1</th>
* <th>Column 2</th>
* </tr>
* <tr>
* <td>Value 1</td>
* <td>Value 2</td>
* </tr>
* </table
*/
public class HTML2CSVConvertor {

public StringStreamResource convertToCSV(StreamResource streamResource) {
StringBuilder csvStringBuilder = new StringBuilder();

Document doc = Jsoup.parseBodyFragment(new String(streamResource.getContent()));
Elements rows = doc.getElementsByTag(HTML.TABLE_ROW_TAG);

for (Element row : rows) {
processTag(row, csvStringBuilder, HTML.TABLE_HEADER_TAG);
processTag(row, csvStringBuilder, HTML.TABLE_CELL_TAG);
csvStringBuilder.append("\n");
}

return new StringStreamResource(
streamResource.getUri(),
csvStringBuilder.toString().getBytes(),
"text/csv"
);
}

private void processTag(Element row, StringBuilder sb, String tag) {
Elements cells = row.getElementsByTag(tag);
for (Element cell : cells) {
if (cell != cells.get(0)) sb.append(",");
sb.append("\"").append(StringEscapeUtils.escapeJava(cell.text())).append("\"");
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package cz.cvut.spipes.modules;

import cz.cvut.spipes.config.ExecutionConfig;
import cz.cvut.spipes.constants.CSVW;
import cz.cvut.spipes.engine.ExecutionContext;
import cz.cvut.spipes.engine.ExecutionContextFactory;
import cz.cvut.spipes.exception.ResourceNotUniqueException;
Expand All @@ -21,6 +22,8 @@
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;

import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
Expand Down Expand Up @@ -197,6 +200,34 @@ void executeSelfWithBNodesInSchema() throws IOException, URISyntaxException {
assertIsomorphic(actualModel, expectedModel);
}

/**
 * Runs the module on an HTML table and checks that every expected column name and every
 * expected cell value shows up in the resulting model.
 */
@Test
void executeSelfWithHTMLFileInput() throws URISyntaxException, IOException {
    // Expected content of examples/htmlFile/input.html
    List<String> columnNames = Arrays.asList("No_", "Test_1", "Test_2", "Description");
    List<List<String>> expectedRows = Arrays.asList(
        Arrays.asList("1.", "123", "456", "description 1"),
        Arrays.asList("2.", "789", "123", "description 2"));

    module.setProcessHTMLFile(true);
    module.setSourceResource(
        StreamResourceUtils.getStreamResource(DATA_PREFIX, getFilePath("examples/htmlFile/input.html"))
    );

    Model resultModel = module.executeSelf().getDefaultModel();

    // Each column of the table must be described by its name.
    for (String columnName : columnNames) {
        assertTrue(resultModel.contains(null, CSVW.name, columnName));
    }

    // Each cell must be present as a value of the property derived from its column name.
    for (List<String> expectedRow : expectedRows) {
        int column = 0;
        for (String columnName : columnNames) {
            String cellValue = expectedRow.get(column);
            column++;
            assertTrue(
                resultModel.contains(null, resultModel.getProperty(DATA_PREFIX + columnName), cellValue)
            );
        }
    }
}

void assertIsomorphic(Model actualModel, Model expectedModel){
if (! actualModel.isIsomorphicWith(expectedModel)) {
LOG.debug("Saving actual model ... ");
Expand Down

0 comments on commit a5da8c2

Please sign in to comment.