-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ec9fe7b
commit a5da8c2
Showing
5 changed files
with
122 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/constants/HTML.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package cz.cvut.spipes.constants; | ||
|
||
/**
 * Names of the HTML table tags used when reading tabular data from HTML input.
 * <p>
 * This is a constants holder only: it is final and has a private constructor,
 * so it can be neither subclassed nor instantiated.
 */
public final class HTML {

    /** Tag name of a table row element. */
    public static final String TABLE_ROW_TAG = "tr";

    /** Tag name of a table data cell element. */
    public static final String TABLE_CELL_TAG = "td";

    /** Tag name of a table header cell element. */
    public static final String TABLE_HEADER_TAG = "th";

    private HTML() {
        // constants holder, not meant to be instantiated
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
...s-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2CSVConvertor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package cz.cvut.spipes.modules.util; | ||
|
||
import cz.cvut.spipes.constants.HTML; | ||
import cz.cvut.spipes.registry.StreamResource; | ||
import cz.cvut.spipes.registry.StringStreamResource; | ||
import org.apache.commons.text.StringEscapeUtils; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
/** | ||
* This class can be used to read the HTML table from input and convert it to CSV file. | ||
* The HTML table must contain at least these two tags ({@literal <}td/>, {@literal <}tr/>) to be processed correctly. | ||
* The recommended format is shown in the example below: | ||
* <table> | ||
* <tr> | ||
* <th>Column 1</th> | ||
* <th>Column 2</th> | ||
* </tr> | ||
* <tr> | ||
* <td>Value 1</td> | ||
* <td>Value 2</td> | ||
* </tr> | ||
* </table | ||
*/ | ||
public class HTML2CSVConvertor { | ||
|
||
public StringStreamResource convertToCSV(StreamResource streamResource) { | ||
StringBuilder csvStringBuilder = new StringBuilder(); | ||
|
||
Document doc = Jsoup.parseBodyFragment(new String(streamResource.getContent())); | ||
Elements rows = doc.getElementsByTag(HTML.TABLE_ROW_TAG); | ||
|
||
for (Element row : rows) { | ||
processTag(row, csvStringBuilder, HTML.TABLE_HEADER_TAG); | ||
processTag(row, csvStringBuilder, HTML.TABLE_CELL_TAG); | ||
csvStringBuilder.append("\n"); | ||
} | ||
|
||
return new StringStreamResource( | ||
streamResource.getUri(), | ||
csvStringBuilder.toString().getBytes(), | ||
"text/csv" | ||
); | ||
} | ||
|
||
private void processTag(Element row, StringBuilder sb, String tag) { | ||
Elements cells = row.getElementsByTag(tag); | ||
for (Element cell : cells) { | ||
if (cell != cells.get(0)) sb.append(","); | ||
sb.append("\"").append(StringEscapeUtils.escapeJava(cell.text())).append("\""); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters