Skip to content

Commit

Permalink
[New] Initial implementation of the xls files support in TabularModule
Browse files Browse the repository at this point in the history
  • Loading branch information
rodionnv committed Aug 30, 2023
1 parent 4d410e3 commit 46f40c1
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 0 deletions.
6 changes: 6 additions & 0 deletions s-pipes-modules/module-tabular/pom.xml
Expand Up @@ -92,5 +92,11 @@
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.11</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
Expand Up @@ -16,6 +16,7 @@
import cz.cvut.spipes.modules.util.BNodesTransformer;
import cz.cvut.spipes.modules.util.HTML2TSVConvertor;
import cz.cvut.spipes.modules.util.JopaPersistenceUtils;
import cz.cvut.spipes.modules.util.XLS2TSVConvertor;
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StreamResourceRegistry;
import cz.cvut.spipes.util.JenaUtils;
Expand Down Expand Up @@ -104,6 +105,12 @@ public class TabularModule extends AbstractModule {
private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");
private final Property P_SKIP_HEADER = getSpecificParameter("skip-header");
private final Property P_PROCESS_HTML_FILE = getSpecificParameter("process-html-file");
private final Property P_PROCESS_XLS_FILE = getSpecificParameter("process-xls-file");

/**
Optional parameter that indicates that only specific single sheet should be converted
*/
private final Property P_PROCESS_SPECIFIC_SHEET_IN_XLS_FILE = getSpecificParameter("process-specific-sheet-in-xls-file");

//sml:replace
private boolean isReplace;
Expand All @@ -126,6 +133,12 @@ public class TabularModule extends AbstractModule {
//:process-html-file
private boolean processHTMLFile;

//:process-xls-file
private boolean processXLSFile;

//:process-specific-sheet-in-xls-file
private int processSpecificSheetInXLSFile;

//:output-mode
private Mode outputMode;

Expand All @@ -146,6 +159,7 @@ public class TabularModule extends AbstractModule {
* Represents the table schema that was used to describe the table
*/
private TableSchema tableSchema;
private int numberOfSheets;

/**
* Default charset to process input file.
Expand All @@ -161,6 +175,14 @@ ExecutionContext executeSelf() {
setDelimiter('\t');
}

if(processXLSFile) {
XLS2TSVConvertor xlsConvertor = new XLS2TSVConvertor();
numberOfSheets = xlsConvertor.getNumberOfSheets(sourceResource); // Currently isn't used
LOG.debug("Number of sheets:{}",numberOfSheets);
setSourceResource(xlsConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
}

BNodesTransformer bNodesTransformer = new BNodesTransformer();
Model inputModel = bNodesTransformer.convertBNodesToNonBNodes(executionContext.getDefaultModel());
boolean hasInputSchema = false;
Expand Down Expand Up @@ -368,6 +390,8 @@ public void loadConfiguration() {
delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier());
skipHeader = getPropertyValue(P_SKIP_HEADER, false);
processHTMLFile = getPropertyValue(P_PROCESS_HTML_FILE, false);
processXLSFile = getPropertyValue(P_PROCESS_XLS_FILE, false);
processSpecificSheetInXLSFile = getPropertyValue(P_PROCESS_SPECIFIC_SHEET_IN_XLS_FILE, 0);
acceptInvalidQuoting = getPropertyValue(P_ACCEPT_INVALID_QUOTING, false);
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(delimiter));
dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
Expand Down Expand Up @@ -538,6 +562,9 @@ public void setSkipHeader(boolean skipHeader) {
public void setProcessHTMLFile(boolean processHTMLFile) {
this.processHTMLFile = processHTMLFile;
}
public void setProcessXLSFile(boolean processXLSFile) {
this.processXLSFile = processXLSFile;
}

private String[] getHeaderFromSchema(Model inputModel, String[] header, boolean hasInputSchema) {
if (hasInputSchema) {
Expand Down
@@ -0,0 +1,51 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StringStreamResource;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;

import java.io.ByteArrayInputStream;
import java.io.IOException;

/**
* Module for converting tabular data from XLS to TSV.
* Currently, converts only first sheet.
*/
public class XLS2TSVConvertor {
public StringStreamResource convertToTSV(StreamResource streamResource){
try {
Workbook workbook = new HSSFWorkbook(new ByteArrayInputStream(streamResource.getContent()));
Sheet sheet = workbook.getSheetAt(0);

StringBuilder tsvStringBuilder = new StringBuilder();
for (Row row : sheet) {
for (Cell cell : row) {
tsvStringBuilder.append(cell.toString().replace('\t', ' '));
tsvStringBuilder.append('\t');
}
tsvStringBuilder.deleteCharAt(tsvStringBuilder.length() - 1);
tsvStringBuilder.append('\n');
}
return new StringStreamResource(
streamResource.getUri(),
tsvStringBuilder.toString().getBytes(),
"text/tsv"
);
}
catch (IOException e) {
throw new RuntimeException(e);
}
}

public int getNumberOfSheets(StreamResource streamResource){
try {
return new HSSFWorkbook(new ByteArrayInputStream(streamResource.getContent())).getNumberOfSheets();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
Expand Up @@ -62,6 +62,20 @@ void executeWithSimpleTransformation() throws URISyntaxException, IOException {
assertTrue(outputContext.getDefaultModel().size() > 0);
}

@Test
void executeWithSimpleTransformationXls() throws URISyntaxException, IOException {
module.setSourceResource(
StreamResourceUtils.getStreamResource(
"http://test-file",
getFilePath("countries.xls"))
);
module.setProcessXLSFile(true);

ExecutionContext outputContext = module.executeSelf();

assertTrue(outputContext.getDefaultModel().size() > 0);
}

@Test
void executeWithDuplicateColumnsThrowsResourceNotUniqueException()
throws URISyntaxException, IOException {
Expand Down
Binary file not shown.

0 comments on commit 46f40c1

Please sign in to comment.