[ML] Rename the json file structure to ndjson (elastic#34901)

The file structure finder endpoint can find the NDJSON (newline-delimited JSON) file format, but called it `json`. This change renames the `format` for this file structure to `ndjson`, which is more precise and will hopefully avoid confusion.
javanna · Oct 29, 2018 · c455be7 · c455be7
1 parent f13d529
commit c455be7
Show file tree

Hide file tree

Showing 15 changed files with 62 additions and 62 deletions.
diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc
@@ -74,7 +74,7 @@ chosen.
   structure finder produced its result. The default value is `false`.
 
 `format`::
-  (string) The high level structure of the file. Valid values are `json`, `xml`,
+  (string) The high level structure of the file. Valid values are `ndjson`, `xml`,
   `delimited`, and `semi_structured_text`. If this parameter is not specified,
   the structure finder chooses one.
 
@@ -259,7 +259,7 @@ If the request does not encounter errors, you receive the following result:
   "sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
   "charset" : "UTF-8", <4>
   "has_byte_order_marker" : false, <5>
-  "format" : "json", <6>
+  "format" : "ndjson", <6>
   "need_client_timezone" : false, <7>
   "mappings" : { <8>
     "author" : {
@@ -473,14 +473,14 @@ If the request does not encounter errors, you receive the following result:
 
 <1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
 <2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
-     For ND-JSON, this value is the same as `num_lines_analyzed`. For other file
+     For NDJSON, this value is the same as `num_lines_analyzed`. For other file
      formats, messages can span several lines.
 <3> `sample_start` reproduces the first two messages in the file verbatim. This
      may help to diagnose parse errors or accidental uploads of the wrong file.
 <4> `charset` indicates the character encoding used to parse the file.
 <5> For UTF character encodings, `has_byte_order_marker` indicates whether the
     file begins with a byte order marker.
-<6> `format` is one of `json`, `xml`, `delimited` or `semi_structured_text`.
+<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
 <7> If a timestamp format is detected that does not include a timezone,
     `need_client_timezone` will be `true`. The server that parses the file must
     therefore be told the correct timezone by the client.

diff --git a/...core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java b/...core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java
@@ -32,11 +32,11 @@ public class FileStructure implements ToXContentObject, Writeable {
 
     public enum Format {
 
-        JSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
+        NDJSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
 
         public boolean supportsNesting() {
             switch (this) {
-                case JSON:
+                case NDJSON:
                 case XML:
                     return true;
                 case DELIMITED:
@@ -49,7 +49,7 @@ public boolean supportsNesting() {
 
         public boolean isStructured() {
             switch (this) {
-                case JSON:
+                case NDJSON:
                 case XML:
                 case DELIMITED:
                     return true;
@@ -62,7 +62,7 @@ public boolean isStructured() {
 
         public boolean isSemiStructured() {
             switch (this) {
-                case JSON:
+                case NDJSON:
                 case XML:
                 case DELIMITED:
                     return false;
@@ -645,7 +645,7 @@ public FileStructure build() {
             }
 
             switch (format) {
-                case JSON:
+                case NDJSON:
                     if (shouldTrimFields != null) {
                         throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
                     }

diff --git a/...test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java b/...test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java
@@ -124,7 +124,7 @@ public void testValidateNonDelimited() {
     public void testValidateNonSemiStructuredText() {
 
         FindFileStructureAction.Request request = new FindFileStructureAction.Request();
-        request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
+        request.setFormat(randomFrom(FileStructure.Format.NDJSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
         request.setGrokPattern(randomAlphaOfLength(80));
         request.setSample(new BytesArray("foo\n"));
 

diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java
@@ -35,7 +35,7 @@
  * Runs the high-level steps needed to create ingest configs for the specified file.  In order:
  * 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
  * 2. Load a sample of the file, consisting of the first 1000 lines of the file
- * 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
+ * 3. Determine the most likely file structure - one of NDJSON, XML, delimited or semi-structured text
  * 4. Create an appropriate structure object and delegate writing configs to it
  */
 public final class FileStructureFinderManager {
@@ -73,9 +73,9 @@ public final class FileStructureFinderManager {
      * These need to be ordered so that the more generic formats come after the more specific ones
      */
     private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
-        new JsonFileStructureFinderFactory(),
+        new NdJsonFileStructureFinderFactory(),
         new XmlFileStructureFinderFactory(),
-        // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
+        // NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV
         new DelimitedFileStructureFinderFactory(',', '"', 2, false),
         new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
         new DelimitedFileStructureFinderFactory(';', '"', 4, false),

diff --git a/...ucturefinder/JsonFileStructureFinder.java → ...turefinder/NdJsonFileStructureFinder.java b/...ucturefinder/JsonFileStructureFinder.java → ...turefinder/NdJsonFileStructureFinder.java
@@ -25,16 +25,16 @@
 import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
 
 /**
- * Really ND-JSON.
+ * Newline-delimited JSON.
  */
-public class JsonFileStructureFinder implements FileStructureFinder {
+public class NdJsonFileStructureFinder implements FileStructureFinder {
 
     private final List<String> sampleMessages;
     private final FileStructure structure;
 
-    static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
-                                                               Boolean hasByteOrderMarker, FileStructureOverrides overrides,
-                                                               TimeoutChecker timeoutChecker) throws IOException {
+    static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                                   Boolean hasByteOrderMarker, FileStructureOverrides overrides,
+                                                                   TimeoutChecker timeoutChecker) throws IOException {
 
         List<Map<String, ?>> sampleRecords = new ArrayList<>();
 
@@ -43,10 +43,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
             XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
                 sampleMessage);
             sampleRecords.add(parser.mapOrdered());
-            timeoutChecker.check("JSON parsing");
+            timeoutChecker.check("NDJSON parsing");
         }
 
-        FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON)
+        FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON)
             .setCharset(charsetName)
             .setHasByteOrderMarker(hasByteOrderMarker)
             .setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
@@ -84,10 +84,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
             .setExplanation(explanation)
             .build();
 
-        return new JsonFileStructureFinder(sampleMessages, structure);
+        return new NdJsonFileStructureFinder(sampleMessages, structure);
     }
 
-    private JsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
+    private NdJsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
         this.sampleMessages = Collections.unmodifiableList(sampleMessages);
         this.structure = structure;
     }

diff --git a/...inder/JsonFileStructureFinderFactory.java → ...der/NdJsonFileStructureFinderFactory.java b/...inder/JsonFileStructureFinderFactory.java → ...der/NdJsonFileStructureFinderFactory.java
@@ -17,15 +17,15 @@
 
 import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
 
-public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
+public class NdJsonFileStructureFinderFactory implements FileStructureFinderFactory {
 
     @Override
     public boolean canFindFormat(FileStructure.Format format) {
-        return format == null || format == FileStructure.Format.JSON;
+        return format == null || format == FileStructure.Format.NDJSON;
     }
 
     /**
-     * This format matches if the sample consists of one or more JSON documents.
+     * This format matches if the sample consists of one or more NDJSON documents.
      * If there is more than one, they must be newline-delimited.  The
      * documents must be non-empty, to prevent lines containing "{}" from matching.
      */
@@ -41,35 +41,35 @@ public boolean canCreateFromSample(List<String> explanation, String sample) {
                     DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
 
                     if (parser.map().isEmpty()) {
-                        explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
+                        explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]");
                         return false;
                     }
                     ++completeDocCount;
                     if (parser.nextToken() != null) {
-                        explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
+                        explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" +
                             sampleLine + "]");
                         return false;
                     }
                 }
             }
         } catch (IOException | IllegalStateException e) {
-            explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
+            explanation.add("Not NDJSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
             return false;
         }
 
         if (completeDocCount == 0) {
-            explanation.add("Not JSON because sample didn't contain a complete document");
+            explanation.add("Not NDJSON because sample didn't contain a complete document");
             return false;
         }
 
-        explanation.add("Deciding sample is newline delimited JSON");
+        explanation.add("Deciding sample is newline delimited NDJSON");
         return true;
     }
 
     @Override
     public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
                                                 FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
-        return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
+        return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
             timeoutChecker);
     }
 

diff --git a/.../elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java b/.../elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java
@@ -12,7 +12,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
     private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
     private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);
 
-    // CSV - no need to check JSON or XML because they come earlier in the order we check formats
+    // CSV - no need to check NDJSON or XML because they come earlier in the order we check formats
 
     public void testCanCreateCsvFromSampleGivenCsv() {
 
@@ -39,7 +39,7 @@ public void testCanCreateCsvFromSampleGivenText() {
         assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
     }
 
-    // TSV - no need to check JSON, XML or CSV because they come earlier in the order we check formats
+    // TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats
 
     public void testCanCreateTsvFromSampleGivenTsv() {
 
@@ -61,7 +61,7 @@ public void testCanCreateTsvFromSampleGivenText() {
         assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
     }
 
-    // Semi-colon delimited - no need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
+    // Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
 
@@ -78,7 +78,7 @@ public void testCanCreateSemiColonDelimitedFromSampleGivenText() {
         assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
     }
 
-    // Pipe delimited - no need to check JSON, XML, CSV, TSV or semi-colon delimited
+    // Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
     // values because they come earlier in the order we check formats
 
     public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {

diff --git a/.../java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java b/.../java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java
@@ -73,20 +73,20 @@ public void testFindCharsetGivenBinary() throws Exception {
         }
     }
 
-    public void testMakeBestStructureGivenJson() throws Exception {
-        assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
-            EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class));
+    public void testMakeBestStructureGivenNdJson() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
+            randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
     }
 
-    public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
+    public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {
 
         // Need to change the quote character from the default of double quotes
-        // otherwise the quotes in the JSON will stop it parsing as CSV
+        // otherwise the quotes in the NDJSON will stop it parsing as CSV
         FileStructureOverrides overrides = FileStructureOverrides.builder()
             .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
 
-        assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
-            overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
+            randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
     }
 
     public void testMakeBestStructureGivenXml() throws Exception {
@@ -109,13 +109,13 @@ public void testMakeBestStructureGivenCsv() throws Exception {
 
     public void testMakeBestStructureGivenCsvAndJsonOverride() {
 
-        FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build();
 
         IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
             () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
                 overrides, NOOP_TIMEOUT_CHECKER));
 
-        assertEquals("Input did not match the specified format [json]", e.getMessage());
+        assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
     }
 
     public void testMakeBestStructureGivenText() throws Exception {

diff --git a/...l/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java b/...l/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java
@@ -27,7 +27,7 @@ public abstract class FileStructureTestCase extends ESTestCase {
         "2018-05-17T16:23:40,key1,42.0\n" +
         "2018-05-17T16:24:11,\"key with spaces\",42.0\n";
 
-    protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
+    protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
             "\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
             "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
         "{\"logger\":\"controller\",\"timestamp\":1478261151445," +

diff --git a/...efinder/JsonFileStructureFinderTests.java → ...inder/NdJsonFileStructureFinderTests.java b/...efinder/JsonFileStructureFinderTests.java → ...inder/NdJsonFileStructureFinderTests.java
@@ -9,21 +9,21 @@
 
 import java.util.Collections;
 
-public class JsonFileStructureFinderTests extends FileStructureTestCase {
+public class NdJsonFileStructureFinderTests extends FileStructureTestCase {
 
-    private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
+    private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
 
     public void testCreateConfigsGivenGoodJson() throws Exception {
-        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
+        FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
             FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
 
         FileStructure structure = structureFinder.getStructure();
 
-        assertEquals(FileStructure.Format.JSON, structure.getFormat());
+        assertEquals(FileStructure.Format.NDJSON, structure.getFormat());
         assertEquals(charset, structure.getCharset());
         if (hasByteOrderMarker == null) {
             assertNull(structure.getHasByteOrderMarker());

diff --git a/.../JsonFileStructureFinderFactoryTests.java → ...dJsonFileStructureFinderFactoryTests.java b/.../JsonFileStructureFinderFactoryTests.java → ...dJsonFileStructureFinderFactoryTests.java
@@ -5,13 +5,13 @@
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
-public class JsonFileStructureFinderFactoryTests extends FileStructureTestCase {
+public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCase {
 
-    private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
+    private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
 
-    public void testCanCreateFromSampleGivenJson() {
+    public void testCanCreateFromSampleGivenNdJson() {
 
-        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
     }
 
     public void testCanCreateFromSampleGivenXml() {

diff --git a/...rg/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java b/...rg/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java
@@ -9,7 +9,7 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas
 
     private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
 
-    // No need to check JSON, XML, CSV, TSV, semi-colon delimited values or pipe
+    // No need to check NDJSON, XML, CSV, TSV, semi-colon delimited values or pipe
     // delimited values because they come earlier in the order we check formats
 
     public void testCanCreateFromSampleGivenText() {

diff --git a/...va/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java b/...va/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java
@@ -9,7 +9,7 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase {
 
     private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();
 
-    // No need to check JSON because it comes earlier in the order we check formats
+    // No need to check NDJSON because it comes earlier in the order we check formats
 
     public void testCanCreateFromSampleGivenXml() {