Skip to content

Commit

Permalink
[Upd] Change HTML convertor output to TSV
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Kulich committed Jan 2, 2023
1 parent b5e1524 commit 921cee5
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import cz.cvut.spipes.exception.ResourceNotUniqueException;
import cz.cvut.spipes.modules.model.*;
import cz.cvut.spipes.modules.util.BNodesTransformer;
import cz.cvut.spipes.modules.util.HTML2CSVConvertor;
import cz.cvut.spipes.modules.util.HTML2TSVConvertor;
import cz.cvut.spipes.modules.util.JopaPersistenceUtils;
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StreamResourceRegistry;
Expand Down Expand Up @@ -78,9 +78,9 @@
* ]
* </code></pre>
* <p>
* This module can also be used to process HTML tables. First, the HTML table is converted to CSV
* This module can also be used to process HTML tables. First, the HTML table is converted to TSV
* and then processed as usual.
* Take a look at the option {@link TabularModule#processHTMLFile} and class {@link HTML2CSVConvertor} for more details.
* Take a look at the option {@link TabularModule#processHTMLFile} and class {@link HTML2TSVConvertor} for more details.
* <p>
* <b>Important notes (differences from the recommendation):</b><br/>
* Does not support custom table group URIs.<br/>
Expand Down Expand Up @@ -153,10 +153,9 @@ public class TabularModule extends AbstractModule {
ExecutionContext executeSelf() {

if(processHTMLFile) {
HTML2CSVConvertor htmlConvertor = new HTML2CSVConvertor();
setSourceResource(htmlConvertor.convertToCSV(sourceResource));
setDelimiter(',');
setQuoteCharacter('\"');
HTML2TSVConvertor htmlConvertor = new HTML2TSVConvertor();
setSourceResource(htmlConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
}

BNodesTransformer bNodesTransformer = new BNodesTransformer();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@
import cz.cvut.spipes.constants.HTML;
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StringStreamResource;
import org.apache.commons.text.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
* This class can be used to read the HTML table from input and convert it to CSV file.
* This class can be used to read the HTML table from input and convert it to TSV file.
* The HTML table must contain at least these two tags ({@literal <}td/>, {@literal <}tr/>) to be processed correctly.
* The recommended format is shown in the example below:
* <table>
Expand All @@ -24,32 +23,33 @@
* </tr>
* </table>
*/
public class HTML2CSVConvertor {
public class HTML2TSVConvertor {

public StringStreamResource convertToCSV(StreamResource streamResource) {
StringBuilder csvStringBuilder = new StringBuilder();
public StringStreamResource convertToTSV(StreamResource streamResource) {
StringBuilder tsvStringBuilder = new StringBuilder();

Document doc = Jsoup.parseBodyFragment(new String(streamResource.getContent()));
doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
Elements rows = doc.getElementsByTag(HTML.TABLE_ROW_TAG);

for (Element row : rows) {
processTag(row, csvStringBuilder, HTML.TABLE_HEADER_TAG);
processTag(row, csvStringBuilder, HTML.TABLE_CELL_TAG);
csvStringBuilder.append("\n");
processTag(row, tsvStringBuilder, HTML.TABLE_HEADER_TAG);
processTag(row, tsvStringBuilder, HTML.TABLE_CELL_TAG);
tsvStringBuilder.append("\n");
}

return new StringStreamResource(
streamResource.getUri(),
csvStringBuilder.toString().getBytes(),
"text/csv"
tsvStringBuilder.toString().getBytes(),
"text/tsv"
);
}

private void processTag(Element row, StringBuilder sb, String tag) {
Elements cells = row.getElementsByTag(tag);
for (Element cell : cells) {
if (cell != cells.get(0)) sb.append(",");
sb.append("\"").append(StringEscapeUtils.escapeJava(cell.text())).append("\"");
if (cell != cells.get(0)) sb.append('\t');
sb.append(cell.html().replace("\t"," "));
}
}
}

0 comments on commit 921cee5

Please sign in to comment.