Skip to content

Commit

Permalink
[Upd] Add info about processing standards
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Kulich committed Nov 22, 2022
1 parent a16f34d commit 21c0198
Showing 1 changed file with 26 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,21 @@
/**
* Module for converting tabular data (e.g. CSV or TSV) to RDF
* <p>
* The implementation loosely follows the W3C Recommendation described here:
* <a href="https://www.w3.org/TR/csv2rdf/">Generating RDF from Tabular Data on the Web</a>
* It supports two major processing standards, selected by the separator:
* <ul><li> separator ',' -- defaults to
* <a href="https://www.rfc-editor.org/rfc/rfc4180">CSV standard</a>, i.e. by default it uses the double quote (") as the quote character and UTF-8 encoding</li>
* <li> separator '\t' -- defaults to
* <a href="https://www.iana.org/assignments/media-types/text/tab-separated-values">TSV standard</a>, with no quoting
* (In the TSV standard, there is no mention of quotes, but in this implementation, we process the TSV quotes
* the same way as the CSV quotes.)</li>
* <li> any other separator -- no standard is assumed, and no quoting is used by default</li>
* </ul>
* </p>
* In addition, it can process input whose quoting is invalid according to the CSV standard; see the option
* {@link TabularModule#acceptInvalidQuoting}
* and the class {@link InvalidQuotingTokenizer}.
* <p>The implementation loosely follows the W3C Recommendation described here:
* <a href="https://www.w3.org/TR/csv2rdf/">Generating RDF from Tabular Data on the Web</a></p>
* <p>
* Within the recommendation, it is possible to define a schema
* describing the shape of the output RDF data
Expand Down Expand Up @@ -326,12 +339,18 @@ public void loadConfiguration() {
delimiter = getPropertyValue(P_DELIMITER, ',');
skipHeader = getPropertyValue(P_SKIP_HEADER, false);
acceptInvalidQuoting = getPropertyValue(P_ACCEPT_INVALID_QUOTING, false);
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, '"');
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, '\0');
dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
sourceResource = getResourceByUri(getEffectiveValue(P_SOURCE_RESOURCE_URI).asLiteral().toString());
outputMode = Mode.fromResource(
getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
);

if(delimiter == ','){
quoteCharacter = '"';
}else if (delimiter == '\t'){
quoteCharacter = '\0';
}
}

@Override
Expand Down Expand Up @@ -394,7 +413,10 @@ private String normalize(String label) {
}

/**
 * Creates a reader over the content of the configured source resource.
 * <p>
 * Comma-delimited input is decoded as UTF-8; input with any other delimiter is decoded
 * with the JVM's default charset.
 *
 * @return a reader over the decoded content of {@code sourceResource}
 */
private Reader getReader() {
    final byte[] content = sourceResource.getContent();
    // NOTE(review): decoding without an explicit charset depends on the platform default,
    // so non-CSV input may be read differently on different machines -- confirm this is intended.
    final String text = delimiter == ','
            ? new String(content, StandardCharsets.UTF_8)
            : new String(content);
    return new StringReader(text);
}

@NotNull
Expand Down

0 comments on commit 21c0198

Please sign in to comment.