[New] Rename custom tokenizer, change invalid quoting property

kbss-cvut · Nov 15, 2022 · 2dd221c · 2dd221c
1 parent 520e7a8
commit 2dd221c
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 53 deletions.
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/CustomTokenizer.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/CustomTokenizer.java
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/InvalidQuotingTokenizer.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/InvalidQuotingTokenizer.java
@@ -0,0 +1,72 @@
+package cz.cvut.spipes;
+
+import org.supercsv.io.Tokenizer;
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A {@link Tokenizer} that tolerates CSV and TSV input with invalid quoting.
+ *
+ * <p>Before a line is handed to the regular tokenizer logic, every quote character that is
+ * neither directly next to a delimiter nor located at the start or end of the line is escaped
+ * by doubling it.</p>
+ *
+ * <p>Example of invalid and valid quoting in a CSV file:</p>
+ * <table>
+ *   <caption>Quoting examples</caption>
+ *   <thead>
+ *     <tr>
+ *       <th>Company Name</th>
+ *       <th>Product</th>
+ *       <th>Note</th>
+ *     </tr>
+ *   </thead>
+ *   <tbody>
+ *     <tr>
+ *       <td>"Albanese Confectionery",</td>
+ *       <td>"FRUIT WORMS 2" 4/5LB"</td>
+ *       <td>&lt;--- Invalid quoting</td>
+ *     </tr>
+ *     <tr>
+ *       <td>"Albanese Confectionery",</td>
+ *       <td>"FRUIT WORMS 2"" 4/5LB"</td>
+ *       <td>&lt;--- Valid quoting</td>
+ *     </tr>
+ *   </tbody>
+ * </table>
+ *
+ * <p>The tokenizer is intended for the following formats: CSV, TSV.</p>
+ *
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>The TSV specification does not mention quotes; this implementation processes quotes in
+ *       TSV the same way as in CSV.</li>
+ *   <li>The escaping is purely positional. Quotes that are already escaped by doubling are not
+ *       recognised; they are treated like any other quote and may be doubled again.</li>
+ * </ul>
+ *
+ * @see <a href="https://www.rfc-editor.org/rfc/rfc4180">CSV</a>
+ * @see <a href="https://www.iana.org/assignments/media-types/text/tab-separated-values">TSV</a>
+ */
+public class InvalidQuotingTokenizer extends Tokenizer {
+
+    /**
+     * Creates a tokenizer reading from the given reader.
+     *
+     * @param reader      source of the tabular data
+     * @param preferences CSV preferences supplying the quote and delimiter characters
+     */
+    public InvalidQuotingTokenizer(Reader reader, CsvPreference preferences) {
+        super(reader, preferences);
+    }
+
+    /**
+     * Reads the next line and doubles every quote character that is not at the start or end of
+     * the line and has no delimiter directly before or after it.
+     *
+     * @return the line with offending quotes escaped, or {@code null} when the parent tokenizer
+     *         returns {@code null}
+     * @throws IOException if the parent tokenizer fails to read the line
+     */
+    @Override
+    protected String readLine() throws IOException {
+        final String line = super.readLine();
+        if (line == null) {
+            return null;
+        }
+
+        final char quote = getPreferences().getQuoteChar();
+        final char delimiter = (char) getPreferences().getDelimiterChar();
+        final int length = line.length();
+
+        // Single forward pass: neighbours are always inspected on the untouched input line,
+        // so the output buffer never has to shift already written characters.
+        final StringBuilder escaped = new StringBuilder(length + 8);
+        for (int i = 0; i < length; i++) {
+            final char current = line.charAt(i);
+            if (current == quote && !isNextToBoundary(line, i, delimiter)) {
+                // escape that quote by emitting an extra one in front of it
+                escaped.append(quote);
+            }
+            escaped.append(current);
+        }
+        return escaped.toString();
+    }
+
+    /**
+     * Tells whether the character at {@code index} touches the start or end of the line or has
+     * a delimiter directly before or after it.
+     */
+    private static boolean isNextToBoundary(String line, int index, char delimiter) {
+        final boolean validCharBefore = index == 0
+                || line.charAt(index - 1) == delimiter;
+        final boolean validCharAfter = index + 1 == line.length()
+                || line.charAt(index + 1) == delimiter;
+        return validCharBefore || validCharAfter;
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -2,7 +2,8 @@
 
 import cz.cvut.kbss.jopa.model.EntityManager;
 import cz.cvut.kbss.jopa.model.query.TypedQuery;
-import cz.cvut.spipes.CustomTokenizer;
+import cz.cvut.spipes.InvalidQuotingTokenizer;
+import cz.cvut.spipes.config.ExecutionConfig;
 import cz.cvut.spipes.constants.CSVW;
 import cz.cvut.spipes.constants.KBSS_MODULE;
 import cz.cvut.spipes.constants.SML;
@@ -16,6 +17,7 @@
 import cz.cvut.spipes.registry.StreamResource;
 import cz.cvut.spipes.registry.StreamResourceRegistry;
 import cz.cvut.spipes.util.JenaUtils;
+import org.apache.commons.cli.MissingArgumentException;
 import org.apache.jena.rdf.model.*;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
@@ -71,7 +73,7 @@ public class TabularModule extends AbstractModule {
 
     private final Property P_DELIMITER = getSpecificParameter("delimiter");
     private final Property P_QUOTE_CHARACTER = getSpecificParameter("quote-character");
-    private final Property P_CUSTOM_TOKENIZER = getSpecificParameter("use-custom-tokenizer");
+    private final Property P_CUSTOM_TOKENIZER = getSpecificParameter("accept-invalid-quoting");
     private final Property P_DATE_PREFIX = getSpecificParameter("data-prefix");
     private final Property P_OUTPUT_MODE = getSpecificParameter("output-mode");
     private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");
@@ -98,8 +100,8 @@ public class TabularModule extends AbstractModule {
     //:output-mode
     private Mode outputMode;
 
-    //:use-custom-tokenizer
-    private boolean useCustomTokenizer;
+    //:accept-invalid-quoting
+    private boolean acceptInvalidQuoting;
 
     /**
      * Represent a group of tables.
@@ -138,11 +140,11 @@ ExecutionContext executeSelf() {
                 System.lineSeparator()).build();
 
         try{
-            ICsvListReader listReader;
-            if (useCustomTokenizer) {
-                listReader = new CsvListReader(new CustomTokenizer(getReader(),csvPreference), csvPreference);
-            }else{
-                listReader = new CsvListReader(getReader(), csvPreference);
+            ICsvListReader listReader = getCsvListReader(csvPreference);
+
+            if (listReader == null) {
+                logMissingQuoteError();
+                return getExecutionContext(inputModel, outputModel);
             }
 
             String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
@@ -226,7 +228,7 @@ ExecutionContext executeSelf() {
                 }
             }
 
-        } catch (IOException e) {
+        } catch (IOException | MissingArgumentException e) {
             LOG.error("Error while reading file from resource uri {}", sourceResource, e);
         }
 
@@ -246,6 +248,16 @@ ExecutionContext executeSelf() {
         return getExecutionContext(inputModel, outputModel);
     }
 
+    /**
+     * Builds the list reader used to parse the source.
+     *
+     * <p>When {@code acceptInvalidQuoting} is off, a plain {@link CsvListReader} is returned.
+     * When it is on, the reader is backed by an {@link InvalidQuotingTokenizer}, unless
+     * {@code getQuote()} yields {@code '\0'} (presumably meaning that no quote character was
+     * configured - verify against {@code getQuote()}), in which case no reader is created.</p>
+     *
+     * @param csvPreference preferences passed to the reader and, if used, to the tokenizer
+     * @return the reader, or {@code null} when invalid quoting should be accepted but
+     *         {@code getQuote()} returns {@code '\0'}
+     */
+    private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
+        if (!acceptInvalidQuoting) {
+            return new CsvListReader(getReader(), csvPreference);
+        }
+        if (getQuote() == '\0') {
+            // caller is expected to handle the missing reader
+            return null;
+        }
+        final InvalidQuotingTokenizer tokenizer = new InvalidQuotingTokenizer(getReader(), csvPreference);
+        return new CsvListReader(tokenizer, csvPreference);
+    }
+
     private Statement createRowResource(String cellValue, int rowNumber, Column column) {
         Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));
 
@@ -312,8 +324,8 @@ public void loadConfiguration() {
         isReplace = getPropertyValue(SML.replace, false);
         delimiter = getPropertyValue(P_DELIMITER, '\t');
         skipHeader = getPropertyValue(P_SKIP_HEADER, false);
-        useCustomTokenizer = getPropertyValue(P_CUSTOM_TOKENIZER, false);
-        quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, '\'');
+        acceptInvalidQuoting = getPropertyValue(P_CUSTOM_TOKENIZER, false);
+        quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, '\0');
         dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
         sourceResource = getResourceByUri(getEffectiveValue(P_SOURCE_RESOURCE_URI).asLiteral().toString());
         outputMode = Mode.fromResource(
@@ -478,4 +490,11 @@ private String[] createHeaders(int size, List<Column> columns) {
         }
         return headers;
     }
+
+    /**
+     * Reports that no quote character is available for the invalid-quoting tokenizer.
+     *
+     * <p>If {@code ExecutionConfig.isExitOnError()} is true the problem is raised as an
+     * exception; otherwise it is only written to the error log.</p>
+     *
+     * @throws MissingArgumentException when {@code ExecutionConfig.isExitOnError()} is true
+     */
+    private void logMissingQuoteError() throws MissingArgumentException {
+        final String message = "Quote character must be specified when using custom tokenizer.";
+        if (!ExecutionConfig.isExitOnError()) {
+            LOG.error(message);
+            return;
+        }
+        throw new MissingArgumentException(message);
+    }
 }