[New] Add HTML to RDF support

kbss-cvut · Dec 7, 2022 · a5da8c2 · a5da8c2
1 parent ec9fe7b
commit a5da8c2
Show file tree

Hide file tree

Showing 5 changed files with 122 additions and 0 deletions.
diff --git a/s-pipes-modules/module-tabular/pom.xml b/s-pipes-modules/module-tabular/pom.xml
@@ -76,6 +76,11 @@
             <groupId>cz.cvut.kbss.jopa</groupId>
             <artifactId>ontodriver-jena</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.15.3</version>
+        </dependency>
         <dependency>
             <groupId>cz.cvut.kbss</groupId>
             <artifactId>s-pipes-test</artifactId>

diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/constants/HTML.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/constants/HTML.java
@@ -0,0 +1,20 @@
+package cz.cvut.spipes.constants;
+
+/**
+ * Tag names of the HTML table elements.
+ */
+public class HTML {
+
+    /** Tag name of a table row element. */
+    public static final String TABLE_ROW_TAG = "tr";
+
+    /** Tag name of a table data cell element. */
+    public static final String TABLE_CELL_TAG = "td";
+
+    /** Tag name of a table header cell element. */
+    public static final String TABLE_HEADER_TAG = "th";
+
+    // Holder of constants only; the private constructor prevents instantiation from outside.
+    private HTML() {
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -13,6 +13,7 @@
 import cz.cvut.spipes.exception.ResourceNotUniqueException;
 import cz.cvut.spipes.modules.model.*;
 import cz.cvut.spipes.modules.util.BNodesTransformer;
+import cz.cvut.spipes.modules.util.HTML2CSVConvertor;
 import cz.cvut.spipes.modules.util.JopaPersistenceUtils;
 import cz.cvut.spipes.registry.StreamResource;
 import cz.cvut.spipes.registry.StreamResourceRegistry;
@@ -77,6 +78,10 @@
  * ]
  * </code></pre>
  * <p>
+ * This module can also be used to process HTML tables. First, the HTML table is converted to CSV
+ * and then processed as usual.
+ * See the {@code process-html-file} option ({@link TabularModule#setProcessHTMLFile(boolean)}) and the class {@link HTML2CSVConvertor} for more details.
+ * <p>
  * <b>Important notes (differences from the recommendation):</b><br/>
  * Does not support custom table group URIs.<br/>
  * Does not support custom table URIs. <br/>
@@ -95,6 +100,7 @@ public class TabularModule extends AbstractModule {
     private final Property P_OUTPUT_MODE = getSpecificParameter("output-mode");
     private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");
     private final Property P_SKIP_HEADER = getSpecificParameter("skip-header");
+    private final Property P_PROCESS_HTML_FILE = getSpecificParameter("process-html-file");
 
     //sml:replace
     private boolean isReplace;
@@ -114,6 +120,9 @@ public class TabularModule extends AbstractModule {
     //:skip-header
     private boolean skipHeader;
 
+    //:process-html-file
+    private boolean processHTMLFile;
+
     //:output-mode
     private Mode outputMode;
 
@@ -142,6 +151,14 @@ public class TabularModule extends AbstractModule {
 
     @Override
     ExecutionContext executeSelf() {
+
+        if(processHTMLFile) {
+            HTML2CSVConvertor htmlConvertor = new HTML2CSVConvertor();
+            setSourceResource(htmlConvertor.convertToCSV(sourceResource));
+            setDelimiter(',');
+            setQuoteCharacter('\"');
+        }
+
         BNodesTransformer bNodesTransformer = new BNodesTransformer();
         Model inputModel = bNodesTransformer.convertBNodesToNonBNodes(executionContext.getDefaultModel());
         boolean hasInputSchema = false;
@@ -346,6 +363,7 @@ public void loadConfiguration() {
         isReplace = getPropertyValue(SML.replace, false);
         delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier());
         skipHeader = getPropertyValue(P_SKIP_HEADER, false);
+        processHTMLFile = getPropertyValue(P_PROCESS_HTML_FILE, false);
         acceptInvalidQuoting = getPropertyValue(P_ACCEPT_INVALID_QUOTING, false);
         quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(delimiter));
         dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
@@ -509,6 +527,10 @@ public void setSkipHeader(boolean skipHeader) {
         this.skipHeader = skipHeader;
     }
 
+    public void setProcessHTMLFile(boolean processHTMLFile) {
+        this.processHTMLFile = processHTMLFile; // when true, executeSelf() first converts the source resource from HTML to CSV
+    }
+
     private String[] getHeaderFromSchema(Model inputModel, String[] header, boolean hasInputSchema) {
         if (hasInputSchema) {
             List<String> orderList = new ArrayList<>();

diff --git a/...s-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2CSVConvertor.java b/...s-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2CSVConvertor.java
@@ -0,0 +1,100 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.constants.HTML;
+import cz.cvut.spipes.registry.StreamResource;
+import cz.cvut.spipes.registry.StringStreamResource;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * This class can be used to read an HTML table from the input and convert it to a CSV file.
+ * The HTML table must contain at least these two tags ({@literal <}td/>, {@literal <}tr/>) to be processed correctly.
+ * <p>
+ * Every {@literal <}tr> element found in the input becomes one CSV line, and each {@literal <}th> or
+ * {@literal <}td> that is a direct child of the row becomes one field, in document order.
+ * Fields are always enclosed in double quotes and separated by commas; a double quote inside a cell
+ * is escaped by doubling it.
+ * <p>
+ * The recommended format is shown in the example below:
+ * <pre>{@code
+ * <table>
+ *     <tr>
+ *         <th>Column 1</th>
+ *         <th>Column 2</th>
+ *     </tr>
+ *     <tr>
+ *         <td>Value 1</td>
+ *         <td>Value 2</td>
+ *     </tr>
+ * </table>
+ * }</pre>
+ */
+public class HTML2CSVConvertor {
+
+    private static final char FIELD_DELIMITER = ',';
+    private static final char QUOTE = '"';
+    private static final char LINE_END = '\n';
+
+    /**
+     * Converts the HTML content of the given resource to CSV.
+     *
+     * @param streamResource resource holding the HTML markup; its bytes are decoded as UTF-8
+     * @return new resource with the URI of the input, UTF-8 encoded CSV content and content type {@code text/csv}
+     */
+    public StringStreamResource convertToCSV(StreamResource streamResource) {
+        // Explicit charset: the platform default would make the result machine-dependent.
+        // NOTE(review): assumes the HTML input is UTF-8 encoded - confirm against callers.
+        String html = new String(streamResource.getContent(), StandardCharsets.UTF_8);
+        Document doc = Jsoup.parseBodyFragment(html);
+        Elements rows = doc.getElementsByTag(HTML.TABLE_ROW_TAG);
+
+        StringBuilder csvStringBuilder = new StringBuilder();
+        for (Element row : rows) {
+            appendRow(row, csvStringBuilder);
+            csvStringBuilder.append(LINE_END);
+        }
+
+        return new StringStreamResource(
+                streamResource.getUri(),
+                csvStringBuilder.toString().getBytes(StandardCharsets.UTF_8),
+                "text/csv"
+        );
+    }
+
+    /**
+     * Appends the cells of one table row to {@code sb} as comma-separated, quoted fields.
+     * Only direct children of the row are visited, so header and data cells of a mixed row are
+     * written in document order with a delimiter between every two of them.
+     */
+    private void appendRow(Element row, StringBuilder sb) {
+        boolean firstField = true;
+        for (Element cell : row.children()) {
+            if (!isTableCell(cell)) {
+                continue;
+            }
+            if (!firstField) {
+                sb.append(FIELD_DELIMITER);
+            }
+            appendQuoted(cell.text(), sb);
+            firstField = false;
+        }
+    }
+
+    /** Tells whether the element is a header ({@literal <}th>) or data ({@literal <}td>) cell. */
+    private boolean isTableCell(Element element) {
+        String tag = element.tagName();
+        return HTML.TABLE_HEADER_TAG.equalsIgnoreCase(tag) || HTML.TABLE_CELL_TAG.equalsIgnoreCase(tag);
+    }
+
+    /**
+     * Appends {@code value} enclosed in double quotes, doubling every embedded double quote.
+     * All other characters, including non-ASCII ones and backslashes, are written unchanged.
+     */
+    private void appendQuoted(String value, StringBuilder sb) {
+        sb.append(QUOTE).append(value.replace("\"", "\"\"")).append(QUOTE);
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/test/java/cz/cvut/spipes/modules/TabularModuleTest.java b/s-pipes-modules/module-tabular/src/test/java/cz/cvut/spipes/modules/TabularModuleTest.java
@@ -1,6 +1,7 @@
 package cz.cvut.spipes.modules;
 
 import cz.cvut.spipes.config.ExecutionConfig;
+import cz.cvut.spipes.constants.CSVW;
 import cz.cvut.spipes.engine.ExecutionContext;
 import cz.cvut.spipes.engine.ExecutionContextFactory;
 import cz.cvut.spipes.exception.ResourceNotUniqueException;
@@ -21,6 +22,8 @@
 import java.net.URISyntaxException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.*;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
@@ -197,6 +200,34 @@ void executeSelfWithBNodesInSchema() throws IOException, URISyntaxException {
          assertIsomorphic(actualModel, expectedModel);
      }
 
+     @Test
+     void executeSelfWithHTMLFileInput() throws URISyntaxException, IOException {
+        // Column names and cell values that the converted HTML example is expected to yield.
+        List<String> columns = Arrays.asList("No_", "Test_1", "Test_2", "Description");
+        List<List<String>> expectedRows = Arrays.asList(
+                Arrays.asList("1.", "123", "456", "description 1"),
+                Arrays.asList("2.", "789", "123", "description 2")
+        );
+
+        // Enable the HTML pre-processing step and hand the module the HTML example file.
+        module.setProcessHTMLFile(true);
+        module.setSourceResource(
+                StreamResourceUtils.getStreamResource(DATA_PREFIX, getFilePath("examples/htmlFile/input.html")));
+        Model actualModel = module.executeSelf().getDefaultModel();
+
+        // Every column name must occur as a value of the CSVW.name property.
+        for (String column : columns) {
+            assertTrue(actualModel.contains(null, CSVW.name, column));
+        }
+
+        // Every expected cell must occur as a value of the property derived from its column name.
+        for (List<String> expectedRow : expectedRows) {
+            for (int col = 0; col < columns.size(); col++) {
+                String cell = expectedRow.get(col);
+                assertTrue(actualModel.contains(null, actualModel.getProperty(DATA_PREFIX + columns.get(col)), cell));
+            }
+        }
+     }
+
     void assertIsomorphic(Model actualModel, Model expectedModel){
         if (! actualModel.isIsomorphicWith(expectedModel)) {
             LOG.debug("Saving actual model ... ");