diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 987589d9f30..d31a4cc3697 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -159,11 +159,11 @@ jobs: env: RUST_BACKTRACE: full RUST_LOG: 'trace' - # --skip io: miri does not handle IO very well, unfortunately. + # --skip io: miri can't handle opening of files, so we skip those run: | cargo miri setup cargo clean - cargo miri test -- --skip io + cargo miri test -- --skip io::parquet --skip io::ipc coverage: name: Coverage diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs index 38082ded789..0ae6a8c6681 100644 --- a/src/io/json/read/infer_schema.rs +++ b/src/io/json/read/infer_schema.rs @@ -170,20 +170,18 @@ fn generate_schema(spec: HashMap>) -> Result { /// /// # Examples /// ``` -/// use std::fs::File; -/// use std::io::{BufReader, SeekFrom, Seek}; -/// use flate2::read::GzDecoder; +/// use std::io::{BufReader, Cursor, SeekFrom, Seek}; /// use arrow2::io::json::infer_json_schema; /// -/// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); +/// let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} +/// {"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} +/// {"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} +/// {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} +/// "#; /// /// // file's cursor's offset at 0 -/// let mut reader = BufReader::new(GzDecoder::new(&file)); +/// let mut reader = BufReader::new(Cursor::new(data)); /// let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); -/// // cursor's offset at end of file -/// -/// // seek back to start so that the original file is usable again -/// file.seek(SeekFrom::Start(0)).unwrap(); /// ``` pub fn infer_json_schema( reader: &mut BufReader, @@ -345,14 +343,17 @@ where /// # Examples /// ``` /// use std::fs::File; -/// use std::io::BufReader; +/// use std::io::{BufReader, Cursor}; /// use arrow2::io::json::infer_json_schema_from_seekable; /// -/// let file = File::open("test/data/mixed_arrays.json").unwrap(); -/// // file's cursor's offset at 0 -/// let mut reader = BufReader::new(file); +/// let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} +/// {"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} +/// {"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} +/// {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} +/// "#; +/// let mut reader = BufReader::new(Cursor::new(data)); /// let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); -/// // file's cursor's offset automatically set at 0 +/// // cursor's position automatically set at 0 /// ``` pub fn infer_json_schema_from_seekable( reader: &mut BufReader, diff --git a/src/io/json/read/reader.rs b/src/io/json/read/reader.rs index 0fe8fed271e..ff51c818220 100644 --- a/src/io/json/read/reader.rs +++ b/src/io/json/read/reader.rs @@ -133,19 +133,21 @@ impl Decoder { /// use std::sync::Arc; /// use arrow2::datatypes::{DataType, Field, Schema}; /// use arrow2::io::json; -/// use std::fs::File; -/// use std::io::BufReader; +/// use std::io::{Cursor, BufReader}; /// /// let schema = Arc::new(Schema::new(vec![ -/// Field::new("a", DataType::Float64, false), -/// Field::new("b", DataType::Float64, false), -/// Field::new("c", DataType::Float64, false), +/// Field::new("a", DataType::Int64, true), +/// Field::new("b", DataType::Float32, true), +/// Field::new("c", DataType::Boolean, true), +/// Field::new("d", DataType::Utf8, true), /// ])); /// -/// let file = File::open("test/data/basic.json").unwrap(); -/// -/// let mut json = json::Reader::new(BufReader::new(file), schema, 1024, None); -/// let batch = json.next().unwrap().unwrap(); +/// let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} +/// {"a":-10, "b":-3.5, "c":true, "d":null} +/// {"a":100000000, "b":0.6, "d":"text"}"#; +/// let mut reader = BufReader::new(Cursor::new(data)); +/// let mut reader = json::Reader::new(&mut reader, schema, 1024, None); +/// let batch = reader.next().unwrap().unwrap(); /// ``` #[derive(Debug)] pub struct Reader { diff --git a/test/data/arrays.json b/test/data/arrays.json deleted file mode 100644 index 5dbdd19ffc0..00000000000 --- a/test/data/arrays.json +++ /dev/null @@ -1,3 +0,0 @@ -{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"} -{"a":-10, "b":[2.0, 1.3, -6.1], "c":[true, true], "d":"4"} -{"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} diff --git a/test/data/basic.json b/test/data/basic.json deleted file mode 100644 index dafd2dd2e42..00000000000 --- a/test/data/basic.json +++ /dev/null @@ -1,12 +0,0 @@ -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":-10, "b":-3.5, "c":true, "d":"4"} -{"a":2, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":7, "b":-3.5, "c":true, "d":"4"} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":5, "b":-3.5, "c":true, "d":"4"} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":1, "b":-3.5, "c":true, "d":"4"} -{"a":100000000000000, "b":0.6, "c":false, "d":"text"} \ No newline at end of file diff --git a/test/data/basic_nulls.json b/test/data/basic_nulls.json deleted file mode 100644 index 1451df7f57f..00000000000 --- a/test/data/basic_nulls.json +++ /dev/null @@ -1,12 +0,0 @@ -{"a":1, "b":2.0, "c":false} -{"a":null, "b":-3.5, "c":true, "d":"4"} -{"c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":7, "b":-3.5, "c":null, "d":null} -{"a":1, "b":0.6, "c":false} -{"a":1, "b":2.0, "d":"4"} -{"a":5, "c":true} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":1, "b":-3.5, "c":true, "d":"4"} -{} \ No newline at end of file diff --git a/test/data/integration.json b/test/data/integration.json deleted file mode 100644 index 7e4a22cddba..00000000000 --- a/test/data/integration.json +++ /dev/null @@ -1,808 +0,0 @@ -{ - "schema": { - "fields": [ - { - "name": "bools-with-metadata-map", - "type": { - "name": "bool" - }, - "nullable": true, - "metadata": { - "k": "v" - }, - "children": [] - }, - { - "name": "bools-with-metadata-vec", - "type": { - "name": "bool" - }, - "nullable": true, - "metadata": [ - { - "key": "k2", - "value": "v2" - } - ], - "children": [] - }, - { - "name": "bools", - "type": { - "name": "bool" - }, - "nullable": true, - "children": [] - }, - { - "name": "int8s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "int16s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "int32s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "int64s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint8s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint16s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint32s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint64s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "float32s", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "float64s", - "type": { - "name": "floatingpoint", - "precision": "DOUBLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "date_days", - "type": { - "name": "date", - "unit": "DAY" - }, - "nullable": true, - "children": [] - }, - { - "name": "date_millis", - "type": { - "name": "date", - "unit": "MILLISECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "time_secs", - "type": { - "name": "time", - "unit": "SECOND", - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "time_millis", - "type": { - "name": "time", - "unit": "MILLISECOND", - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "time_micros", - "type": { - "name": "time", - "unit": "MICROSECOND", - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "time_nanos", - "type": { - "name": "time", - "unit": "NANOSECOND", - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_secs", - "type": { - "name": "timestamp", - "unit": "SECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_millis", - "type": { - "name": "timestamp", - "unit": "MILLISECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_micros", - "type": { - "name": "timestamp", - "unit": "MICROSECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_nanos", - "type": { - "name": "timestamp", - "unit": "NANOSECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_secs_tz", - "type": { - "name": "timestamp", - "unit": "SECOND", - "timezone": "Europe/Budapest" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_millis_tz", - "type": { - "name": "timestamp", - "unit": "MILLISECOND", - "timezone": "America/New_York" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_micros_tz", - "type": { - "name": "timestamp", - "unit": "MICROSECOND", - "timezone": "UTC" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_nanos_tz", - "type": { - "name": "timestamp", - "unit": "NANOSECOND", - "timezone": "Africa/Johannesburg" - }, - "nullable": true, - "children": [] - }, - { - "name": "utf8s", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - }, - { - "name": "lists", - "nullable": true, - "type": { - "name": "list" - }, - "children": [ - { - "name": "item", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 32, - "isSigned": true - }, - "children": [] - } - ] - }, - { - "name": "structs", - "type": { - "name": "struct" - }, - "nullable": true, - "children": [ - { - "name": "int32s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "utf8s", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - } - ] - } - ] - }, - "batches": [ - { - "count": 3, - "columns": [ - { - "name": "bools-with-metadata-map", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - true, - true, - false - ] - }, - { - "name": "bools-with-metadata-vec", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - true, - true, - false - ] - }, - { - "name": "bools", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - true, - true, - false - ] - }, - { - "name": "int8s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "int16s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "int32s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "int64s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint8s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint16s", - "count": 5, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint32s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint64s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "float32s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1.0, - 2.0, - 3.0 - ] - }, - { - "name": "float64s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1.0, - 2.0, - 3.0 - ] - }, - { - "name": "date_days", - "count": 3, - "VALIDITY": [ - 1, - 0, - 0 - ], - "DATA": [ - 1196848, - 2319603, - 2755982 - ] - }, - { - "name": "date_millis", - "count": 3, - "VALIDITY": [ - 1, - 1, - 1 - ], - "DATA": [ - 167903550396207, - 29923997007884, - 30612271819236 - ] - }, - { - "name": "time_secs", - "count": 3, - "VALIDITY": [ - 1, - 1, - 1 - ], - "DATA": [ - 27974, - 78592, - 43207 - ] - }, - { - "name": "time_millis", - "count": 3, - "VALIDITY": [ - 1, - 1, - 1 - ], - "DATA": [ - 6613125, - 74667230, - 52260079 - ] - }, - { - "name": "time_micros", - "count": 3, - "VALIDITY": [ - 1, - 0, - 0 - ], - "DATA": [ - 62522958593, - 13470380050, - 50797036705 - ] - }, - { - "name": "time_nanos", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 73380123595985, - 52520995325145, - 16584393546415 - ] - }, - { - "name": "ts_secs", - "count": 3, - "VALIDITY": [ - 0, - 1, - 0 - ], - "DATA": [ - 209869064422, - 193438817552, - 51757838205 - ] - }, - { - "name": "ts_millis", - "count": 3, - "VALIDITY": [ - 0, - 1, - 1 - ], - "DATA": [ - 228315043570185, - 38606916383008, - 58113709376587 - ] - }, - { - "name": "ts_micros", - "count": 3, - "VALIDITY": [ - 0, - 0, - 0 - ], - "DATA": [ - 133457416537791415, - 129522736067409280, - 177110451066832967 - ] - }, - { - "name": "ts_nanos", - "count": 3, - "VALIDITY": [ - 0, - 0, - 1 - ], - "DATA": [ - -804525722984600007, - 8166038652634779458, - -6473623571954960143 - ] - }, - { - "name": "ts_secs_tz", - "count": 3, - "VALIDITY": [ - 0, - 1, - 0 - ], - "DATA": [ - 209869064422, - 193438817552, - 51757838205 - ] - }, - { - "name": "ts_millis_tz", - "count": 3, - "VALIDITY": [ - 0, - 1, - 1 - ], - "DATA": [ - 228315043570185, - 38606916383008, - 58113709376587 - ] - }, - { - "name": "ts_micros_tz", - "count": 3, - "VALIDITY": [ - 0, - 0, - 0 - ], - "DATA": [ - 133457416537791415, - 129522736067409280, - 177110451066832967 - ] - }, - { - "name": "ts_nanos_tz", - "count": 3, - "VALIDITY": [ - 0, - 0, - 1 - ], - "DATA": [ - -804525722984600007, - 8166038652634779458, - -6473623571954960143 - ] - }, - { - "name": "utf8s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "OFFSET": [ - 0, - 2, - 2, - 5 - ], - "DATA": [ - "aa", - "", - "bbb" - ] - }, - { - "name": "lists", - "count": 3, - "VALIDITY": [ - 1, - 1, - 0 - ], - "OFFSET": [ - 0, - 3, - 4, - 4 - ], - "children": [ - { - "name": "item", - "count": 4, - "VALIDITY": [ - 0, - 1, - 0, - 0 - ], - "DATA": [ - 1, - 2, - 3, - 4 - ] - } - ] - }, - { - "name": "structs", - "count": 3, - "VALIDITY": [ - 1, - 1, - 0 - ], - "children": [ - { - "name": "int32s", - "count": 3, - "VALIDITY": [ - 0, - 1, - 0 - ], - "DATA": [ - -1, - -2, - -3 - ] - }, - { - "name": "utf8s", - "count": 3, - "VALIDITY": [ - 0, - 0, - 1 - ], - "OFFSET": [ - 0, - 0, - 0, - 7 - ], - "DATA": [ - "", - "", - "aaaaaa" - ] - } - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/test/data/list_string_dict_nested_nulls.json b/test/data/list_string_dict_nested_nulls.json deleted file mode 100644 index 9300b14ce27..00000000000 --- a/test/data/list_string_dict_nested_nulls.json +++ /dev/null @@ -1,3 +0,0 @@ -{"machine": "a", "events": [null, "Elect Leader", "Do Ballot"]} -{"machine": "b", "events": ["Do Ballot", null, "Send Data", "Elect Leader"]} -{"machine": "c", "events": ["Send Data"]} diff --git a/test/data/mixed_arrays.json b/test/data/mixed_arrays.json deleted file mode 100644 index 18987284a5b..00000000000 --- a/test/data/mixed_arrays.json +++ /dev/null @@ -1,4 +0,0 @@ -{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} -{"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} -{"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} -{"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} diff --git a/test/data/mixed_arrays.json.gz b/test/data/mixed_arrays.json.gz deleted file mode 100644 index 0f6040092ff..00000000000 Binary files a/test/data/mixed_arrays.json.gz and /dev/null differ diff --git a/test/data/nested_structs.json b/test/data/nested_structs.json deleted file mode 100644 index 32a3ac85c61..00000000000 --- a/test/data/nested_structs.json +++ /dev/null @@ -1,4 +0,0 @@ -{"a": {"b": true, "c": {"d": "text"}}} -{"a": {"b": false, "c": null}} -{"a": {"b": true, "c": {"d": "text"}}} -{"a": 1} \ No newline at end of file diff --git a/test/data/null_test.csv b/test/data/null_test.csv deleted file mode 100644 index 7e0dde53714..00000000000 --- a/test/data/null_test.csv +++ /dev/null @@ -1,6 +0,0 @@ -c_int,c_float,c_string,c_bool -1,1.1,"1.11",True -2,2.2,"2.22",TRUE -3,,"3.33",true -4,4.4,,False -5,6.6,"",FALSE \ No newline at end of file diff --git a/test/data/uk_cities.csv b/test/data/uk_cities.csv deleted file mode 100644 index db9e6da8c7a..00000000000 --- a/test/data/uk_cities.csv +++ /dev/null @@ -1,37 +0,0 @@ -"Elgin, Scotland, the UK",57.653484,-3.335724 -"Stoke-on-Trent, Staffordshire, the UK",53.002666,-2.179404 -"Solihull, Birmingham, UK",52.412811,-1.778197 -"Cardiff, Cardiff county, UK",51.481583,-3.179090 -"Eastbourne, East Sussex, UK",50.768036,0.290472 -"Oxford, Oxfordshire, UK",51.752022,-1.257677 -"London, UK",51.509865,-0.118092 -"Swindon, Swindon, UK",51.568535,-1.772232 -"Gravesend, Kent, UK",51.441883,0.370759 -"Northampton, Northamptonshire, UK",52.240479,-0.902656 -"Rugby, Warwickshire, UK",52.370876,-1.265032 -"Sutton Coldfield, West Midlands, UK",52.570385,-1.824042 -"Harlow, Essex, UK",51.772938,0.102310 -"Aberdeen, Aberdeen City, UK",57.149651,-2.099075 -"Swansea, Swansea, UK",51.621441,-3.943646 -"Chesterfield, Derbyshire, UK",53.235046,-1.421629 -"Londonderry, Derry, UK",55.006763,-7.318268 -"Salisbury, Wiltshire, UK",51.068787,-1.794472 -"Weymouth, Dorset, UK",50.614429,-2.457621 -"Wolverhampton, West Midlands, UK",52.591370,-2.110748 -"Preston, Lancashire, UK",53.765762,-2.692337 -"Bournemouth, UK",50.720806,-1.904755 -"Doncaster, South Yorkshire, UK",53.522820,-1.128462 -"Ayr, South Ayrshire, UK",55.458565,-4.629179 -"Hastings, East Sussex, UK",50.854259,0.573453 -"Bedford, UK",52.136436,-0.460739 -"Basildon, Essex, UK",51.572376,0.470009 -"Chippenham, Wiltshire, UK",51.458057,-2.116074 -"Belfast, UK",54.607868,-5.926437 -"Uckfield, East Sussex, UK",50.967941,0.085831 -"Worthing, West Sussex, UK",50.825024,-0.383835 -"Leeds, West Yorkshire, UK",53.801277,-1.548567 -"Kendal, Cumbria, UK",54.328506,-2.743870 -"Plymouth, UK",50.376289,-4.143841 -"Haverhill, Suffolk, UK",52.080875,0.444517 -"Frankton, Warwickshire, UK",52.328415,-1.377561 -"Inverness, the UK",57.477772,-4.224721 \ No newline at end of file diff --git a/test/data/uk_cities_with_headers.csv b/test/data/uk_cities_with_headers.csv deleted file mode 100644 index 92f5a17bdda..00000000000 --- a/test/data/uk_cities_with_headers.csv +++ /dev/null @@ -1,38 +0,0 @@ -city,lat,lng -"Elgin, Scotland, the UK",57.653484,-3.335724 -"Stoke-on-Trent, Staffordshire, the UK",53.002666,-2.179404 -"Solihull, Birmingham, UK",52.412811,-1.778197 -"Cardiff, Cardiff county, UK",51.481583,-3.179090 -"Eastbourne, East Sussex, UK",50.768036,0.290472 -"Oxford, Oxfordshire, UK",51.752022,-1.257677 -"London, UK",51.509865,-0.118092 -"Swindon, Swindon, UK",51.568535,-1.772232 -"Gravesend, Kent, UK",51.441883,0.370759 -"Northampton, Northamptonshire, UK",52.240479,-0.902656 -"Rugby, Warwickshire, UK",52.370876,-1.265032 -"Sutton Coldfield, West Midlands, UK",52.570385,-1.824042 -"Harlow, Essex, UK",51.772938,0.102310 -"Aberdeen, Aberdeen City, UK",57.149651,-2.099075 -"Swansea, Swansea, UK",51.621441,-3.943646 -"Chesterfield, Derbyshire, UK",53.235046,-1.421629 -"Londonderry, Derry, UK",55.006763,-7.318268 -"Salisbury, Wiltshire, UK",51.068787,-1.794472 -"Weymouth, Dorset, UK",50.614429,-2.457621 -"Wolverhampton, West Midlands, UK",52.591370,-2.110748 -"Preston, Lancashire, UK",53.765762,-2.692337 -"Bournemouth, UK",50.720806,-1.904755 -"Doncaster, South Yorkshire, UK",53.522820,-1.128462 -"Ayr, South Ayrshire, UK",55.458565,-4.629179 -"Hastings, East Sussex, UK",50.854259,0.573453 -"Bedford, UK",52.136436,-0.460739 -"Basildon, Essex, UK",51.572376,0.470009 -"Chippenham, Wiltshire, UK",51.458057,-2.116074 -"Belfast, UK",54.607868,-5.926437 -"Uckfield, East Sussex, UK",50.967941,0.085831 -"Worthing, West Sussex, UK",50.825024,-0.383835 -"Leeds, West Yorkshire, UK",53.801277,-1.548567 -"Kendal, Cumbria, UK",54.328506,-2.743870 -"Plymouth, UK",50.376289,-4.143841 -"Haverhill, Suffolk, UK",52.080875,0.444517 -"Frankton, Warwickshire, UK",52.328415,-1.377561 -"Inverness, the UK",57.477772,-4.224721 \ No newline at end of file diff --git a/test/data/various_types.csv b/test/data/various_types.csv deleted file mode 100644 index 8f4466fbe6a..00000000000 --- a/test/data/various_types.csv +++ /dev/null @@ -1,6 +0,0 @@ -c_int|c_float|c_string|c_bool|c_date|c_datetime -1|1.1|"1.11"|true|1970-01-01|1970-01-01T00:00:00 -2|2.2|"2.22"|true|2020-11-08|2020-11-08T01:00:00 -3||"3.33"|true|1969-12-31|1969-11-08T02:00:00 -4|4.4||false|| -5|6.6|""|false|1990-01-01|1990-01-01T03:00:00 \ No newline at end of file diff --git a/test/data/various_types_invalid.csv b/test/data/various_types_invalid.csv deleted file mode 100644 index 6f059cb73e6..00000000000 --- a/test/data/various_types_invalid.csv +++ /dev/null @@ -1,6 +0,0 @@ -c_int|c_float|c_string|c_bool -1|1.1|"1.11"|true -2|2.2|"2.22"|true -3||"3.33"|true -4|4.x4||false -5|6.6|""|false \ No newline at end of file diff --git a/tests/it/io/csv/read.rs b/tests/it/io/csv/read.rs index 2a5ee355f54..bdba648e038 100644 --- a/tests/it/io/csv/read.rs +++ b/tests/it/io/csv/read.rs @@ -8,7 +8,22 @@ use arrow2::io::csv::read::*; #[test] fn read() -> Result<()> { - let mut reader = ReaderBuilder::new().from_path("test/data/uk_cities_with_headers.csv")?; + let data = r#"city,lat,lng +"Elgin, Scotland, the UK",57.653484,-3.335724 +"Stoke-on-Trent, Staffordshire, the UK",53.002666,-2.179404 +"Solihull, Birmingham, UK",52.412811,-1.778197 +"Cardiff, Cardiff county, UK",51.481583,-3.179090 +"Eastbourne, East Sussex, UK",50.768036,0.290472 +"Oxford, Oxfordshire, UK",51.752022,-1.257677 +"London, UK",51.509865,-0.118092 +"Swindon, Swindon, UK",51.568535,-1.772232 +"Gravesend, Kent, UK",51.441883,0.370759 +"Northampton, Northamptonshire, UK",52.240479,-0.902656 +"Rugby, Warwickshire, UK",52.370876,-1.265032 +"Sutton Coldfield, West Midlands, UK",52.570385,-1.824042 +"Harlow, Essex, UK",51.772938,0.102310 +"Aberdeen, Aberdeen City, UK",57.149651,-2.099075"#; + let mut reader = ReaderBuilder::new().from_reader(Cursor::new(data)); let schema = Arc::new(infer_schema(&mut reader, None, true, &infer)?); @@ -26,7 +41,7 @@ fn read() -> Result<()> { let batch_schema = batch.schema(); assert_eq!(&schema, batch_schema); - assert_eq!(37, batch.num_rows()); + assert_eq!(14, batch.num_rows()); assert_eq!(3, batch.num_columns()); let lat = batch diff --git a/tests/it/io/json/mod.rs b/tests/it/io/json/mod.rs index 63c11ec1647..37810e9ec39 100644 --- a/tests/it/io/json/mod.rs +++ b/tests/it/io/json/mod.rs @@ -1,20 +1,20 @@ mod read; mod write; +use std::io::Cursor; +use std::sync::Arc; + use serde_json::Value; -use std::fs::{read_to_string, File}; -use arrow2::io::json::LineDelimitedWriter; -use arrow2::io::json::Reader; -use arrow2::io::json::ReaderBuilder; +use arrow2::array::*; +use arrow2::datatypes::*; +use arrow2::io::json::{LineDelimitedWriter, ReaderBuilder}; -fn test_write_for_file(test_file: &str) { +fn round_trip(data: String) { let builder = ReaderBuilder::new() .infer_schema(None) .with_batch_size(1024); - let mut reader: Reader = builder - .build::(File::open(test_file).unwrap()) - .unwrap(); + let mut reader = builder.build(Cursor::new(data.clone())).unwrap(); let batch = reader.next().unwrap().unwrap(); let mut buf = Vec::new(); @@ -24,12 +24,11 @@ fn test_write_for_file(test_file: &str) { } let result = String::from_utf8(buf).unwrap(); - let expected = read_to_string(test_file).unwrap(); - for (r, e) in result.lines().zip(expected.lines()) { + for (r, e) in result.lines().zip(data.lines()) { let mut result_json = serde_json::from_str::(r).unwrap(); let expected_json = serde_json::from_str::(e).unwrap(); if let Value::Object(e) = &expected_json { - // remove null value from object to make comparision consistent: + // remove null value from object to make comparison consistent: if let Value::Object(r) = result_json { result_json = Value::Object( r.into_iter() @@ -43,16 +42,178 @@ fn test_write_for_file(test_file: &str) { } #[test] -fn write_basic_rows() { - test_write_for_file("test/data/basic.json"); +fn round_trip_basics() { + let (data, _, _) = case_basics(); + round_trip(data); } #[test] -fn write_arrays() { - test_write_for_file("test/data/arrays.json"); +fn round_trip_list() { + let (data, _, _) = case_list(); + round_trip(data); } -#[test] -fn write_basic_nulls() { - test_write_for_file("test/data/basic_nulls.json"); +fn case_list() -> (String, Schema, Vec>) { + let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"} + {"a":-10, "b":null, "c":[true, true]} + {"a":null, "b":[2.1, null, -6.2], "c":[false, null], "d":"text"} + "# + .to_string(); + + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new( + "b", + DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + true, + ), + Field::new( + "c", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new("d", DataType::Utf8, true), + ]); + let a = Int64Array::from(&[Some(1), Some(-10), None]); + + let mut b = MutableListArray::>::new(); + b.try_extend(vec![ + Some(vec![Some(2.0), Some(1.3), Some(-6.1)]), + None, + Some(vec![Some(2.1), None, Some(-6.2)]), + ]) + .unwrap(); + let b: ListArray = b.into(); + + let mut c = MutableListArray::::new(); + c.try_extend(vec![ + Some(vec![Some(false), Some(true)]), + Some(vec![Some(true), Some(true)]), + Some(vec![Some(false), None]), + ]) + .unwrap(); + let c: ListArray = c.into(); + + let d = Utf8Array::::from(&[Some("4"), None, Some("text")]); + + let columns = vec![ + Box::new(a) as Box, + Box::new(b), + Box::new(c), + Box::new(d), + ]; + + (data, schema, columns) +} + +fn case_dict() -> (String, Schema, Vec>) { + let data = r#"{"machine": "a", "events": [null, "Elect Leader", "Do Ballot"]} + {"machine": "b", "events": ["Do Ballot", null, "Send Data", "Elect Leader"]} + {"machine": "c", "events": ["Send Data"]} + {"machine": "c"} + {"machine": "c", "events": null} + "# + .to_string(); + + let data_type = DataType::List(Box::new(Field::new( + "item", + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true, + ))); + + let schema = Schema::new(vec![Field::new("events", data_type, true)]); + + type A = MutableDictionaryArray>; + + let mut array = MutableListArray::::new(); + array + .try_extend(vec![ + Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), + Some(vec![ + Some("Do Ballot"), + None, + Some("Send Data"), + Some("Elect Leader"), + ]), + Some(vec![Some("Send Data")]), + None, + None, + ]) + .unwrap(); + + let array: ListArray = array.into(); + + (data, schema, vec![Box::new(array) as Box]) +} + +fn case_basics() -> (String, Schema, Vec>) { + let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} + {"a":-10, "b":-3.5, "c":true, "d":null} + {"a":100000000, "b":0.6, "d":"text"}"# + .to_string(); + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Float64, true), + Field::new("c", DataType::Boolean, true), + Field::new("d", DataType::Utf8, true), + ]); + let columns = vec![ + Box::new(Int64Array::from_slice(&[1, -10, 100000000])) as Box, + Box::new(Float64Array::from_slice(&[2.0, -3.5, 0.6])), + Box::new(BooleanArray::from(&[Some(false), Some(true), None])), + Box::new(Utf8Array::::from(&[Some("4"), None, Some("text")])), + ]; + (data, schema, columns) +} + +fn case_basics_schema() -> (String, Schema, Vec>) { + let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} + {"a":10, "b":-3.5, "c":true, "d":null} + {"a":100000000, "b":0.6, "d":"text"}"# + .to_string(); + let schema = Schema::new(vec![ + Field::new("a", DataType::UInt32, true), + Field::new("b", DataType::Float32, true), + Field::new("c", DataType::Boolean, true), + // note how "d" is not here + ]); + let columns = vec![ + Box::new(UInt32Array::from_slice(&[1, 10, 100000000])) as Box, + Box::new(Float32Array::from_slice(&[2.0, -3.5, 0.6])), + Box::new(BooleanArray::from(&[Some(false), Some(true), None])), + ]; + (data, schema, columns) +} + +fn case_struct() -> (String, Schema, Vec>) { + let data = r#"{"a": {"b": true, "c": {"d": "text"}}} + {"a": {"b": false, "c": null}} + {"a": {"b": true, "c": {"d": "text"}}} + {"a": 1}"# + .to_string(); + + let d_field = Field::new("d", DataType::Utf8, true); + let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); + let a_field = Field::new( + "a", + DataType::Struct(vec![ + Field::new("b", DataType::Boolean, true), + c_field.clone(), + ]), + true, + ); + let schema = Schema::new(vec![a_field]); + + // build expected output + let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); + let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + + let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); + let expected = StructArray::from_data( + vec![Field::new("b", DataType::Boolean, true), c_field], + vec![Arc::new(b), Arc::new(c)], + None, + ); + + (data, schema, vec![Box::new(expected) as Box]) } diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index de3bd737623..89dc10e39b4 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -1,423 +1,128 @@ -use flate2::read::GzDecoder; use std::io::BufReader; -use std::{ - fs::File, - io::{Seek, SeekFrom}, -}; use std::{io::Cursor, sync::Arc}; use arrow2::array::*; use arrow2::datatypes::*; use arrow2::{bitmap::Bitmap, buffer::Buffer, error::Result, io::json::*}; +use crate::io::json::*; + #[test] -fn json_basic() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); +fn basic() -> Result<()> { + let (data, schema, columns) = case_basics(); - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); + let mut reader = ReaderBuilder::new().build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(1, b.0); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(2, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(3, d.0); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); - assert!((-3.5 - bb.value(1)).abs() < f64::EPSILON); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!cc.value(0)); - assert!(cc.value(10)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!("4", dd.value(0)); - assert_eq!("text", dd.value(8)); + assert_eq!(&schema, batch.schema().as_ref()); + + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn json_basic_with_nulls() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); +fn basics_with_schema_projection() -> Result<()> { + let (data, schema, columns) = case_basics_schema(); - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); + let mut reader = ReaderBuilder::new() + .with_schema(Arc::new(schema.clone())) + .build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(bb.is_valid(0)); - assert!(!bb.is_valid(2)); - assert!(!bb.is_valid(11)); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(cc.is_valid(0)); - assert!(!cc.is_valid(4)); - assert!(!cc.is_valid(11)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(!dd.is_valid(0)); - assert!(dd.is_valid(1)); - assert!(!dd.is_valid(4)); - assert!(!dd.is_valid(11)); -} + assert_eq!(&schema, batch.schema().as_ref()); -#[test] -fn json_basic_schema() { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - Field::new("d", DataType::Utf8, false), - ])); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema.clone(), - 1024, - None, - ); - let reader_schema = reader.schema(); - assert_eq!(reader_schema, &schema); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int32, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float32, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - // test that a 64bit value is returned as null due to overflowing - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!((2.0 - bb.value(0)).abs() < f32::EPSILON); - assert!((-3.5 - bb.value(1)).abs() < f32::EPSILON); + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn json_basic_schema_projection() { - // We test implicit and explicit projection: - // Implicit: omitting fields from a schema - // Explicit: supplying a vec of fields to take - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - ])); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema, - 1024, - Some(vec!["a".to_string(), "c".to_string()]), - ); - let reader_schema = reader.schema().clone(); - let expected_schema = Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("c", DataType::Boolean, false), - ]); - assert_eq!(reader_schema.as_ref(), &expected_schema); +fn lists() -> Result<()> { + let (data, schema, columns) = case_list(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(2, batch.num_columns()); - assert_eq!(12, batch.num_rows()); + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let batch_schema = batch.schema(); - assert_eq!(&reader_schema, batch_schema); + assert_eq!(&schema, batch.schema().as_ref()); - let a = batch_schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int32, a.1.data_type()); - let c = batch_schema.column_with_name("c").unwrap(); - assert_eq!(1, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); +fn line_break_in_values() -> Result<()> { + let data = r#" + {"a":"aa\n\n"} + {"a":"aa\n"} + {"a":null} + "#; - assert_eq!(4, batch.num_columns()); - assert_eq!(3, batch.num_rows()); + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let schema = batch.schema(); + let expected = Utf8Array::::from(&[Some("aa\n\n"), Some("aa\n"), None]); - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); - assert_eq!(9, bb.len()); - assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); - assert!((-6.1 - bb.value(5)).abs() < f64::EPSILON); - assert!(!bb.is_valid(7)); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); - assert_eq!(6, cc.len()); - assert!(!cc.value(0)); - assert!(!cc.value(4)); - assert!(!cc.is_valid(5)); + assert_eq!(expected, batch.columns()[0].as_ref()); + Ok(()) } #[test] -fn invalid_json_infer_schema() { - let re = infer_json_schema_from_seekable( - &mut BufReader::new(File::open("test/data/uk_cities_with_headers.csv").unwrap()), - None, - ); +fn invalid_infer_schema() -> Result<()> { + let re = + infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new("city,lat,lng")), None); assert_eq!( re.err().unwrap().to_string(), "External error: expected value at line 1 column 1", ); + Ok(()) } #[test] -fn invalid_json_read_record() { +fn invalid_read_record() -> Result<()> { let schema = Arc::new(Schema::new(vec![Field::new( "a", DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]), true, )])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/uk_cities_with_headers.csv").unwrap()) - .unwrap(); + let mut data = Cursor::new("city,lat,lng"); + let mut reader = builder.build(&mut data)?; assert_eq!( reader.next().err().unwrap().to_string(), "External error: expected value at line 1 column 1", ); + Ok(()) } #[test] -fn mixed_json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/mixed_arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let schema = Arc::new(infer_json_schema(&mut reader, None).unwrap()); - file.seek(SeekFrom::Start(0)).unwrap(); - - let reader = BufReader::new(GzDecoder::new(&file)); - let mut reader = Reader::from_buf_reader(reader, schema, 64, None); - let batch_gz = reader.next().unwrap().unwrap(); - - for batch in vec![batch, batch_gz] { - assert_eq!(4, batch.num_columns()); - assert_eq!(4, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - d.1.data_type() - ); - - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); - assert_eq!(9, bb.len()); - assert!((-6.1 - bb.value(8)).abs() < f64::EPSILON); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); - let cc_expected = BooleanArray::from(vec![Some(false), Some(true), Some(false), None]); - assert_eq!(cc, &cc_expected); - - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let dd = dd.values(); - let dd = dd.as_any().downcast_ref::>().unwrap(); - assert_eq!( - dd, - &Utf8Array::::from_slice(&["1", "false", "array", "2.4"]) - ); - } -} - -#[test] -fn nested_struct_json_arrays() { - let d_field = Field::new("d", DataType::Utf8, true); - let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); - let a_field = Field::new( - "a", - DataType::Struct(vec![ - Field::new("b", DataType::Boolean, true), - c_field.clone(), - ]), - true, - ); - let schema = Arc::new(Schema::new(vec![a_field])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/nested_structs.json").unwrap()) - .unwrap(); +fn nested_struct_arrays() -> Result<()> { + let (data, schema, columns) = case_struct(); - // build expected output - let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); - let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + let builder = ReaderBuilder::new().with_schema(Arc::new(schema.clone())); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); - let expected = StructArray::from_data( - vec![Field::new("b", DataType::Boolean, true), c_field], - vec![Arc::new(b), Arc::new(c)], - None, - ); + assert_eq!(&schema, batch.schema().as_ref()); - // compare `a` with result from json reader - let batch = reader.next().unwrap().unwrap(); - let read = batch.column(0); - assert_eq!(expected, read.as_ref()); + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn nested_list_json_arrays() { +fn nested_list_arrays() { let d_field = Field::new("d", DataType::Utf8, true); let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); let b_field = Field::new("b", DataType::Boolean, true); @@ -430,14 +135,14 @@ fn nested_list_json_arrays() { let a_field = Field::new("a", a_list_data_type.clone(), true); let schema = Arc::new(Schema::new(vec![a_field])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" + let content = r#" {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": "b_text"}}]} {"a": [{"b": false, "c": null}]} {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} {"a": null} {"a": []} "#; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + let mut reader = builder.build(Cursor::new(content)).unwrap(); // build expected output let d = Utf8Array::::from(&vec![ @@ -477,65 +182,16 @@ fn nested_list_json_arrays() { assert_eq!(expected, read.as_ref()); } -#[test] -fn dictionary_from_json_basic_with_nulls() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new( - "d", - DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - let data_type = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); - assert_eq!(&data_type, d.1.data_type()); - - let result = batch.column(d.0); - - let values = vec![ - None, - Some("4"), - Some("text"), - Some("4"), - None, - None, - Some("4"), - None, - Some("text"), - Some("4"), - Some("4"), - None, - ]; - - let mut expected = MutableDictionaryArray::>::new(); - expected.try_extend(values)?; - let expected: DictionaryArray = expected.into(); - - assert_eq!(expected, result.as_ref()); - Ok(()) -} - #[test] fn skip_empty_lines() { let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " + let content = " {\"a\": 1} {\"a\": 2} {\"a\": 3}"; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + let mut reader = builder.build(Cursor::new(content)).unwrap(); let batch = reader.next().unwrap().unwrap(); assert_eq!(1, batch.num_columns()); @@ -549,10 +205,10 @@ fn skip_empty_lines() { #[test] fn row_type_validation() { let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " + let content = " [1, \"hello\"] \"world\""; - let re = builder.build(Cursor::new(json_content)); + let re = builder.build(Cursor::new(content)); assert_eq!( re.err().unwrap().to_string(), r#"Expected JSON record to be an object, found Array([Number(1), String("hello")])"#, @@ -560,75 +216,60 @@ fn row_type_validation() { } #[test] -fn list_of_string_dictionary_from_json_with_nulls() -> Result<()> { - let data_type = DataType::List(Box::new(Field::new( - "item", - DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true, - ))); - - let schema = Arc::new(Schema::new(vec![Field::new( - "events", - data_type.clone(), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/list_string_dict_nested_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); +fn list_of_string_dictionary_from_with_nulls() -> Result<()> { + let (data, schema, columns) = case_dict(); - let events = schema.column_with_name("events").unwrap(); - assert_eq!(&data_type, events.1.data_type()); - - let expected = vec![ - Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), - Some(vec![ - Some("Do Ballot"), - None, - Some("Send Data"), - Some("Elect Leader"), - ]), - Some(vec![Some("Send Data")]), - ]; - - type A = MutableDictionaryArray>; - - let mut array = MutableListArray::::new(); - array.try_extend(expected)?; + let builder = ReaderBuilder::new() + .with_schema(Arc::new(schema)) + .with_batch_size(64); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let expected: ListArray = array.into(); + assert_eq!(reader.schema(), batch.schema()); - assert_eq!(expected, batch.column(0).as_ref()); + assert_eq!(columns[0].as_ref(), batch.columns()[0].as_ref()); Ok(()) } #[test] -fn with_multiple_batches() { +fn with_multiple_batches() -> Result<()> { + let data = r#" + {"a":1} + {"a":null} + {} + {"a":1} + {"a":7} + {"a":1} + {"a":1} + {"a":5} + {"a":1} + {"a":1} + {"a":1} + {} + "#; + let builder = ReaderBuilder::new() .infer_schema(Some(4)) .with_batch_size(5); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); + let mut reader = builder.build(Cursor::new(data))?; let mut num_records = Vec::new(); - while let Some(rb) = reader.next().unwrap() { + while let Some(rb) = reader.next()? { num_records.push(rb.num_rows()); } assert_eq!(vec![5, 5, 2], num_records); + Ok(()) } #[test] -fn json_infer_schema() { +fn infer_schema_mixed_list() -> Result<()> { + let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} + {"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} + {"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} + {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} + "#; + let schema = Schema::new(vec![ Field::new("a", DataType::Int64, true), Field::new( @@ -648,14 +289,8 @@ fn json_infer_schema() { ), ]); - let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); - let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); - - assert_eq!(inferred_schema, schema); - - let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); + let inferred_schema = infer_json_schema(&mut BufReader::new(Cursor::new(data)), None)?; assert_eq!(inferred_schema, schema); + Ok(()) }