Skip to content

Commit

Permalink
[ML] Rename the json file structure to ndjson (elastic#34901)
Browse files Browse the repository at this point in the history
The file structure finder endpoint can find the NDJSON
(newline-delimited JSON) file format, but called it
`json`.  This change renames the `format` for this file
structure to `ndjson`, which is more precise and will
hopefully avoid confusion.
  • Loading branch information
droberts195 committed Oct 29, 2018
1 parent f13d529 commit c455be7
Show file tree
Hide file tree
Showing 15 changed files with 62 additions and 62 deletions.
8 changes: 4 additions & 4 deletions docs/reference/ml/apis/find-file-structure.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ chosen.
structure finder produced its result. The default value is `false`.

`format`::
(string) The high level structure of the file. Valid values are `json`, `xml`,
(string) The high level structure of the file. Valid values are `ndjson`, `xml`,
`delimited`, and `semi_structured_text`. If this parameter is not specified,
the structure finder chooses one.

Expand Down Expand Up @@ -259,7 +259,7 @@ If the request does not encounter errors, you receive the following result:
"sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
"charset" : "UTF-8", <4>
"has_byte_order_marker" : false, <5>
"format" : "json", <6>
"format" : "ndjson", <6>
"need_client_timezone" : false, <7>
"mappings" : { <8>
"author" : {
Expand Down Expand Up @@ -473,14 +473,14 @@ If the request does not encounter errors, you receive the following result:

<1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
<2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
For ND-JSON, this value is the same as `num_lines_analyzed`. For other file
For NDJSON, this value is the same as `num_lines_analyzed`. For other file
formats, messages can span several lines.
<3> `sample_start` reproduces the first two messages in the file verbatim. This
may help to diagnose parse errors or accidental uploads of the wrong file.
<4> `charset` indicates the character encoding used to parse the file.
<5> For UTF character encodings, `has_byte_order_marker` indicates whether the
file begins with a byte order marker.
<6> `format` is one of `json`, `xml`, `delimited` or `semi_structured_text`.
<6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
<7> If a timestamp format is detected that does not include a timezone,
`need_client_timezone` will be `true`. The server that parses the file must
therefore be told the correct timezone by the client.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ public class FileStructure implements ToXContentObject, Writeable {

public enum Format {

JSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;
NDJSON, XML, DELIMITED, SEMI_STRUCTURED_TEXT;

public boolean supportsNesting() {
switch (this) {
case JSON:
case NDJSON:
case XML:
return true;
case DELIMITED:
Expand All @@ -49,7 +49,7 @@ public boolean supportsNesting() {

public boolean isStructured() {
switch (this) {
case JSON:
case NDJSON:
case XML:
case DELIMITED:
return true;
Expand All @@ -62,7 +62,7 @@ public boolean isStructured() {

public boolean isSemiStructured() {
switch (this) {
case JSON:
case NDJSON:
case XML:
case DELIMITED:
return false;
Expand Down Expand Up @@ -645,7 +645,7 @@ public FileStructure build() {
}

switch (format) {
case JSON:
case NDJSON:
if (shouldTrimFields != null) {
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ public void testValidateNonDelimited() {
public void testValidateNonSemiStructuredText() {

FindFileStructureAction.Request request = new FindFileStructureAction.Request();
request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
request.setFormat(randomFrom(FileStructure.Format.NDJSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
request.setGrokPattern(randomAlphaOfLength(80));
request.setSample(new BytesArray("foo\n"));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
* 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
* 3. Determine the most likely file structure - one of NDJSON, XML, delimited or semi-structured text
* 4. Create an appropriate structure object and delegate writing configs to it
*/
public final class FileStructureFinderManager {
Expand Down Expand Up @@ -73,9 +73,9 @@ public final class FileStructureFinderManager {
* These need to be ordered so that the more generic formats come after the more specific ones
*/
private static final List<FileStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
new JsonFileStructureFinderFactory(),
new NdJsonFileStructureFinderFactory(),
new XmlFileStructureFinderFactory(),
// ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
// NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV
new DelimitedFileStructureFinderFactory(',', '"', 2, false),
new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
new DelimitedFileStructureFinderFactory(';', '"', 4, false),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@
import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;

/**
* Really ND-JSON.
* Newline-delimited JSON.
*/
public class JsonFileStructureFinder implements FileStructureFinder {
public class NdJsonFileStructureFinder implements FileStructureFinder {

private final List<String> sampleMessages;
private final FileStructure structure;

static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws IOException {
static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
Boolean hasByteOrderMarker, FileStructureOverrides overrides,
TimeoutChecker timeoutChecker) throws IOException {

List<Map<String, ?>> sampleRecords = new ArrayList<>();

Expand All @@ -43,10 +43,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION,
sampleMessage);
sampleRecords.add(parser.mapOrdered());
timeoutChecker.check("JSON parsing");
timeoutChecker.check("NDJSON parsing");
}

FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.JSON)
FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON)
.setCharset(charsetName)
.setHasByteOrderMarker(hasByteOrderMarker)
.setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
Expand Down Expand Up @@ -84,10 +84,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
.setExplanation(explanation)
.build();

return new JsonFileStructureFinder(sampleMessages, structure);
return new NdJsonFileStructureFinder(sampleMessages, structure);
}

private JsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
private NdJsonFileStructureFinder(List<String> sampleMessages, FileStructure structure) {
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
this.structure = structure;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@

import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;

public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
public class NdJsonFileStructureFinderFactory implements FileStructureFinderFactory {

@Override
public boolean canFindFormat(FileStructure.Format format) {
return format == null || format == FileStructure.Format.JSON;
return format == null || format == FileStructure.Format.NDJSON;
}

/**
* This format matches if the sample consists of one or more JSON documents.
* This format matches if the sample consists of one or more NDJSON documents.
* If there is more than one, they must be newline-delimited. The
* documents must be non-empty, to prevent lines containing "{}" from matching.
*/
Expand All @@ -41,35 +41,35 @@ public boolean canCreateFromSample(List<String> explanation, String sample) {
DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {

if (parser.map().isEmpty()) {
explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]");
return false;
}
++completeDocCount;
if (parser.nextToken() != null) {
explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" +
sampleLine + "]");
return false;
}
}
}
} catch (IOException | IllegalStateException e) {
explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
explanation.add("Not NDJSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
return false;
}

if (completeDocCount == 0) {
explanation.add("Not JSON because sample didn't contain a complete document");
explanation.add("Not NDJSON because sample didn't contain a complete document");
return false;
}

explanation.add("Deciding sample is newline delimited JSON");
explanation.add("Deciding sample is newline delimited NDJSON");
return true;
}

@Override
public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides,
timeoutChecker);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);

// CSV - no need to check JSON or XML because they come earlier in the order we check formats
// CSV - no need to check NDJSON or XML because they come earlier in the order we check formats

public void testCanCreateCsvFromSampleGivenCsv() {

Expand All @@ -39,7 +39,7 @@ public void testCanCreateCsvFromSampleGivenText() {
assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}

// TSV - no need to check JSON, XML or CSV because they come earlier in the order we check formats
// TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats

public void testCanCreateTsvFromSampleGivenTsv() {

Expand All @@ -61,7 +61,7 @@ public void testCanCreateTsvFromSampleGivenText() {
assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}

// Semi-colon delimited - no need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
// Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats

public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {

Expand All @@ -78,7 +78,7 @@ public void testCanCreateSemiColonDelimitedFromSampleGivenText() {
assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
}

// Pipe delimited - no need to check JSON, XML, CSV, TSV or semi-colon delimited
// Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
// values because they come earlier in the order we check formats

public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,20 +73,20 @@ public void testFindCharsetGivenBinary() throws Exception {
}
}

public void testMakeBestStructureGivenJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(JsonFileStructureFinder.class));
public void testMakeBestStructureGivenNdJson() throws Exception {
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), instanceOf(NdJsonFileStructureFinder.class));
}

public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception {

// Need to change the quote character from the default of double quotes
// otherwise the quotes in the JSON will stop it parsing as CSV
// otherwise the quotes in the NDJSON will stop it parsing as CSV
FileStructureOverrides overrides = FileStructureOverrides.builder()
.setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();

assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(),
randomBoolean(), overrides, NOOP_TIMEOUT_CHECKER), instanceOf(DelimitedFileStructureFinder.class));
}

public void testMakeBestStructureGivenXml() throws Exception {
Expand All @@ -109,13 +109,13 @@ public void testMakeBestStructureGivenCsv() throws Exception {

public void testMakeBestStructureGivenCsvAndJsonOverride() {

FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build();

IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
overrides, NOOP_TIMEOUT_CHECKER));

assertEquals("Input did not match the specified format [json]", e.getMessage());
assertEquals("Input did not match the specified format [ndjson]", e.getMessage());
}

public void testMakeBestStructureGivenText() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public abstract class FileStructureTestCase extends ESTestCase {
"2018-05-17T16:23:40,key1,42.0\n" +
"2018-05-17T16:24:11,\"key with spaces\",42.0\n";

protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
"\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
"\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
"{\"logger\":\"controller\",\"timestamp\":1478261151445," +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@

import java.util.Collections;

public class JsonFileStructureFinderTests extends FileStructureTestCase {
public class NdJsonFileStructureFinderTests extends FileStructureTestCase {

private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();

public void testCreateConfigsGivenGoodJson() throws Exception {
assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));

String charset = randomFrom(POSSIBLE_CHARSETS);
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker,
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);

FileStructure structure = structureFinder.getStructure();

assertEquals(FileStructure.Format.JSON, structure.getFormat());
assertEquals(FileStructure.Format.NDJSON, structure.getFormat());
assertEquals(charset, structure.getCharset());
if (hasByteOrderMarker == null) {
assertNull(structure.getHasByteOrderMarker());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
*/
package org.elasticsearch.xpack.ml.filestructurefinder;

public class JsonFileStructureFinderFactoryTests extends FileStructureTestCase {
public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCase {

private FileStructureFinderFactory factory = new JsonFileStructureFinderFactory();
private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();

public void testCanCreateFromSampleGivenJson() {
public void testCanCreateFromSampleGivenNdJson() {

assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
}

public void testCanCreateFromSampleGivenXml() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas

private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();

// No need to check JSON, XML, CSV, TSV, semi-colon delimited values or pipe
// No need to check NDJSON, XML, CSV, TSV, semi-colon delimited values or pipe
// delimited values because they come earlier in the order we check formats

public void testCanCreateFromSampleGivenText() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase {

private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();

// No need to check JSON because it comes earlier in the order we check formats
// No need to check NDJSON because it comes earlier in the order we check formats

public void testCanCreateFromSampleGivenXml() {

Expand Down

0 comments on commit c455be7

Please sign in to comment.