Skip to content

Commit

Permalink
[New] Rename custom tokenizer, change invalid quoting property
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Kulich committed Nov 15, 2022
1 parent 520e7a8 commit 2dd221c
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 53 deletions.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package cz.cvut.spipes;

import org.supercsv.io.Tokenizer;
import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.io.Reader;

/**
* <p> This class is a custom extension of the {@link Tokenizer} class
* that allows parsing CSV and TSV files with invalid quoting.
* Any quote that is neither next to a delimiter nor at the start/end of the line is escaped by doubling it.</p>
* <p>Example of invalid quoting in the CSV:
* <table>
* <thead>
* <td>Company Name</td>
* <td>Product</td>
* </thead>
* <tr>
* <td>"Albanese Confectionery", </td>
* <td>"FRUIT WORMS 2" 4/5LB"</td>
* <td>&lt;--- Invalid quoting</td>
* </tr>
* <tr>
* <td>"Albanese Confectionery", </td>
* <td>"FRUIT WORMS 2"" 4/5LB"</td>
* <td>&lt;--- Valid quoting</td>
* </tr>
* </table>
* </p>
* <p> The tokenizer is compliant with the following formats: CSV, TSV </p>
* <p>Notes:
* In the TSV standard, there is no mention of quotes, but in this implementation, we process
* the TSV quotes the same way as the CSV quotes.
* </p>
* @see <a href="https://www.rfc-editor.org/rfc/rfc4180">CSV</a>
* @see <a href="https://www.iana.org/assignments/media-types/text/tab-separated-values">TSV</a>
*/
public class InvalidQuotingTokenizer extends Tokenizer {

    /**
     * Creates the tokenizer by handing both arguments straight to the {@link Tokenizer} constructor.
     *
     * @param reader      source of the raw lines
     * @param preferences preferences from which the quote and delimiter characters are read
     */
    public InvalidQuotingTokenizer(Reader reader, CsvPreference preferences) {
        super(reader, preferences);
    }

    /**
     * Fetches the next line from the superclass and doubles every quote character that has
     * neither a delimiter nor a line boundary directly before or after it.
     *
     * <p>Quotes sitting at the very start or end of the line, or touching a delimiter on either
     * side, are copied unchanged. Every other quote is emitted twice; this includes quotes that
     * already appear doubled in the input, because each occurrence is judged on its own.</p>
     *
     * @return the line with interior quotes doubled, or {@code null} when the superclass
     *         returns {@code null}
     * @throws IOException propagated from the superclass read
     */
    @Override
    protected String readLine() throws IOException {
        final String raw = super.readLine();
        if (raw == null) {
            return null;
        }

        final char quote = getPreferences().getQuoteChar();
        final char delimiter = (char) getPreferences().getDelimiterChar();

        final int lastIndex = raw.length() - 1;
        final StringBuilder escaped = new StringBuilder(raw.length());
        for (int pos = 0; pos <= lastIndex; pos++) {
            final char current = raw.charAt(pos);
            if (current == quote) {
                // A quote is left alone when at least one side is a delimiter or a line boundary.
                final boolean boundaryBefore = pos == 0 || raw.charAt(pos - 1) == delimiter;
                final boolean boundaryAfter = pos == lastIndex || raw.charAt(pos + 1) == delimiter;
                if (!boundaryBefore && !boundaryAfter) {
                    // Interior quote: emit an extra quote in front of it to escape it.
                    escaped.append(quote);
                }
            }
            escaped.append(current);
        }
        return escaped.toString();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import cz.cvut.kbss.jopa.model.EntityManager;
import cz.cvut.kbss.jopa.model.query.TypedQuery;
import cz.cvut.spipes.CustomTokenizer;
import cz.cvut.spipes.InvalidQuotingTokenizer;
import cz.cvut.spipes.config.ExecutionConfig;
import cz.cvut.spipes.constants.CSVW;
import cz.cvut.spipes.constants.KBSS_MODULE;
import cz.cvut.spipes.constants.SML;
Expand All @@ -16,6 +17,7 @@
import cz.cvut.spipes.registry.StreamResource;
import cz.cvut.spipes.registry.StreamResourceRegistry;
import cz.cvut.spipes.util.JenaUtils;
import org.apache.commons.cli.MissingArgumentException;
import org.apache.jena.rdf.model.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
Expand Down Expand Up @@ -71,7 +73,7 @@ public class TabularModule extends AbstractModule {

private final Property P_DELIMITER = getSpecificParameter("delimiter");
private final Property P_QUOTE_CHARACTER = getSpecificParameter("quote-character");
private final Property P_CUSTOM_TOKENIZER = getSpecificParameter("use-custom-tokenizer");
private final Property P_CUSTOM_TOKENIZER = getSpecificParameter("accept-invalid-quoting");
private final Property P_DATE_PREFIX = getSpecificParameter("data-prefix");
private final Property P_OUTPUT_MODE = getSpecificParameter("output-mode");
private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");
Expand All @@ -98,8 +100,8 @@ public class TabularModule extends AbstractModule {
//:output-mode
private Mode outputMode;

//:use-custom-tokenizer
private boolean useCustomTokenizer;
//:accept-invalid-quoting
private boolean acceptInvalidQuoting;

/**
* Represent a group of tables.
Expand Down Expand Up @@ -138,11 +140,11 @@ ExecutionContext executeSelf() {
System.lineSeparator()).build();

try{
ICsvListReader listReader;
if (useCustomTokenizer) {
listReader = new CsvListReader(new CustomTokenizer(getReader(),csvPreference), csvPreference);
}else{
listReader = new CsvListReader(getReader(), csvPreference);
ICsvListReader listReader = getCsvListReader(csvPreference);

if (listReader == null) {
logMissingQuoteError();
return getExecutionContext(inputModel, outputModel);
}

String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
Expand Down Expand Up @@ -226,7 +228,7 @@ ExecutionContext executeSelf() {
}
}

} catch (IOException e) {
} catch (IOException | MissingArgumentException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

Expand All @@ -246,6 +248,16 @@ ExecutionContext executeSelf() {
return getExecutionContext(inputModel, outputModel);
}

/**
 * Builds the list reader used to parse the source.
 *
 * <p>When {@code acceptInvalidQuoting} is off, a plain {@link CsvListReader} over
 * {@code getReader()} is returned. When it is on, the reader is wrapped in an
 * {@link InvalidQuotingTokenizer}, unless {@code getQuote()} yields {@code '\0'},
 * in which case no reader is created.</p>
 *
 * @param csvPreference preferences passed to the reader (and to the tokenizer, if used)
 * @return the reader, or {@code null} when invalid quoting is accepted but
 *         {@code getQuote()} returns {@code '\0'}; callers must check for {@code null}
 */
private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
    if (!acceptInvalidQuoting) {
        return new CsvListReader(getReader(), csvPreference);
    }
    if (getQuote() == '\0') {
        // NOTE(review): '\0' presumably means "no quote character configured" - confirm against getQuote().
        return null;
    }
    final InvalidQuotingTokenizer tokenizer = new InvalidQuotingTokenizer(getReader(), csvPreference);
    return new CsvListReader(tokenizer, csvPreference);
}

private Statement createRowResource(String cellValue, int rowNumber, Column column) {
Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));

Expand Down Expand Up @@ -312,8 +324,8 @@ public void loadConfiguration() {
isReplace = getPropertyValue(SML.replace, false);
delimiter = getPropertyValue(P_DELIMITER, '\t');
skipHeader = getPropertyValue(P_SKIP_HEADER, false);
useCustomTokenizer = getPropertyValue(P_CUSTOM_TOKENIZER, false);
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, '\'');
acceptInvalidQuoting = getPropertyValue(P_CUSTOM_TOKENIZER, false);
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, '\0');
dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
sourceResource = getResourceByUri(getEffectiveValue(P_SOURCE_RESOURCE_URI).asLiteral().toString());
outputMode = Mode.fromResource(
Expand Down Expand Up @@ -478,4 +490,11 @@ private String[] createHeaders(int size, List<Column> columns) {
}
return headers;
}

/**
 * Reports that no quote character was specified.
 *
 * <p>If {@code ExecutionConfig.isExitOnError()} is true the problem is raised as an
 * exception; otherwise it is only written to the error log and the method returns normally.</p>
 *
 * @throws MissingArgumentException when {@code ExecutionConfig.isExitOnError()} returns true
 */
private void logMissingQuoteError() throws MissingArgumentException {
    final String message = "Quote character must be specified when using custom tokenizer.";
    if (!ExecutionConfig.isExitOnError()) {
        LOG.error(message);
        return;
    }
    throw new MissingArgumentException(message);
}
}

0 comments on commit 2dd221c

Please sign in to comment.