This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit d14ae86

Moved dict_id to IPC-specific IO (#713)

jorgecarleitao committed Dec 28, 2021
1 parent f07cc2c
Showing 51 changed files with 2,085 additions and 1,882 deletions.
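The recurring change across these files is that the IPC writers now carry dictionary ids through IPC-specific IpcField metadata: FileWriter::try_new gains an IPC-fields argument and write takes a per-call value, with None apparently falling back to defaults derived from the schema. A minimal sketch of the updated usage, mirroring the new examples/ipc_file_write.rs below (the import paths are assumed from the arrow2 crate layout at this commit and are not part of this diff):

use std::fs::File;

use arrow2::datatypes::Schema;
use arrow2::error::Result;
use arrow2::io::ipc::write;
use arrow2::record_batch::RecordBatch;

// Write a set of batches to an Arrow IPC file with the post-#713 writer API.
// Passing `None` for the IPC fields presumably lets the writer derive default
// IPC metadata (including dictionary ids) from the schema.
fn write_batches(path: &str, schema: &Schema, batches: &[RecordBatch]) -> Result<()> {
    let file = File::create(path)?;
    let options = write::WriteOptions { compression: None };
    let mut writer = write::FileWriter::try_new(file, schema, None, options)?;
    for batch in batches {
        writer.write(batch, None)?;
    }
    writer.finish()
}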
16 changes: 9 additions & 7 deletions arrow-parquet-integration-testing/src/main.rs
@@ -1,12 +1,14 @@
use std::fs::File;
use std::sync::Arc;
- use std::{collections::HashMap, convert::TryFrom, io::Read};
+ use std::{collections::HashMap, io::Read};

+ use arrow2::io::ipc::IpcField;
use arrow2::{
datatypes::{DataType, Schema},
error::Result,
io::{
- json_integration::{to_record_batch, ArrowJson},
+ json_integration::read,
+ json_integration::ArrowJson,
parquet::write::{
write_file, Compression, Encoding, RowGroupIterator, Version, WriteOptions,
},
@@ -19,7 +21,7 @@ use clap::{App, Arg};
use flate2::read::GzDecoder;

/// Read gzipped JSON file
- fn read_gzip_json(version: &str, file_name: &str) -> (Schema, Vec<RecordBatch>) {
+ fn read_gzip_json(version: &str, file_name: &str) -> (Schema, Vec<IpcField>, Vec<RecordBatch>) {
let path = format!(
"../testing/arrow-testing/data/arrow-ipc-stream/integration/{}/{}.json.gz",
version, file_name
@@ -32,7 +34,7 @@ fn read_gzip_json(version: &str, file_name: &str) -> (Schema, Vec<RecordBatch>)
let arrow_json: ArrowJson = serde_json::from_str(&s).unwrap();

let schema = serde_json::to_value(arrow_json.schema).unwrap();
- let schema = Schema::try_from(&schema).unwrap();
+ let (schema, ipc_fields) = read::deserialize_schema(&schema).unwrap();

// read dictionaries
let mut dictionaries = HashMap::new();
@@ -46,11 +48,11 @@ fn read_gzip_json(version: &str, file_name: &str) -> (Schema, Vec<RecordBatch>)
let batches = arrow_json
.batches
.iter()
- .map(|batch| to_record_batch(&schema, batch, &dictionaries))
+ .map(|batch| read::to_record_batch(&schema, &ipc_fields, batch, &dictionaries))
.collect::<Result<Vec<_>>>()
.unwrap();

- (schema, batches)
+ (schema, ipc_fields, batches)
}

fn main() -> Result<()> {
@@ -106,7 +108,7 @@ fn main() -> Result<()> {
.collect::<Vec<_>>()
});

- let (schema, batches) = read_gzip_json("1.0.0-littleendian", json_file);
+ let (schema, _, batches) = read_gzip_json("1.0.0-littleendian", json_file);

let schema = if let Some(projection) = &projection {
let fields = schema
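On the JSON-integration read side, deserializing the schema now also yields the IPC fields, and both are threaded into batch construction. A condensed, hypothetical helper based on the updated read_gzip_json above; the function name json_to_batches, the ArrowJsonDictionaryBatch import, and the i64 key of the dictionary map are illustrative assumptions, not part of this diff:

use std::collections::HashMap;

use arrow2::datatypes::Schema;
use arrow2::error::Result;
use arrow2::io::ipc::IpcField;
use arrow2::io::json_integration::{read, ArrowJson, ArrowJsonDictionaryBatch};
use arrow2::record_batch::RecordBatch;

// Condenses the read path from read_gzip_json above; the dictionaries map is
// assumed to be keyed by dictionary id, as built in that function.
fn json_to_batches(
    arrow_json: &ArrowJson,
    dictionaries: &HashMap<i64, ArrowJsonDictionaryBatch>,
) -> Result<(Schema, Vec<IpcField>, Vec<RecordBatch>)> {
    // Deserializing the schema now returns the IPC-specific field metadata too.
    let schema = serde_json::to_value(&arrow_json.schema).unwrap();
    let (schema, ipc_fields) = read::deserialize_schema(&schema)?;

    let batches = arrow_json
        .batches
        .iter()
        .map(|batch| read::to_record_batch(&schema, &ipc_fields, batch, dictionaries))
        .collect::<Result<Vec<_>>>()?;

    Ok((schema, ipc_fields, batches))
}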
4 changes: 2 additions & 2 deletions examples/extension.rs
@@ -38,11 +38,11 @@ fn write_ipc<W: Write + Seek>(writer: W, array: impl Array + 'static) -> Result<
let schema = Schema::new(vec![Field::new("a", array.data_type().clone(), false)]);

let options = write::WriteOptions { compression: None };
- let mut writer = write::FileWriter::try_new(writer, &schema, options)?;
+ let mut writer = write::FileWriter::try_new(writer, &schema, None, options)?;

let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?;

- writer.write(&batch)?;
+ writer.write(&batch, None)?;

Ok(writer.into_inner())
}
4 changes: 2 additions & 2 deletions examples/ipc_file_write.rs
@@ -11,10 +11,10 @@ fn write_batches(path: &str, schema: &Schema, batches: &[RecordBatch]) -> Result
let file = File::create(path)?;

let options = write::WriteOptions { compression: None };
- let mut writer = write::FileWriter::try_new(file, schema, options)?;
+ let mut writer = write::FileWriter::try_new(file, schema, None, options)?;

for batch in batches {
- writer.write(batch)?
+ writer.write(batch, None)?
}
writer.finish()
}
2 changes: 1 addition & 1 deletion examples/json_read.rs
@@ -31,7 +31,7 @@ fn read_path(path: &str, projection: Option<Vec<&str>>) -> Result<RecordBatch> {

// deserialize `rows` into a `RecordBatch`. This is CPU-intensive, has no IO,
// and can be performed on a different thread pool via a channel.
- read::deserialize(&rows, fields)
+ read::deserialize(rows, fields)
}

fn main() -> Result<()> {
9 changes: 5 additions & 4 deletions integration-testing/src/bin/arrow-file-to-stream.rs
@@ -27,15 +27,16 @@ fn main() -> Result<()> {
let filename = &args[1];
let mut f = File::open(filename)?;
let metadata = read::read_file_metadata(&mut f)?;
- let mut reader = read::FileReader::new(f, metadata, None);
- let schema = reader.schema();
+ let mut reader = read::FileReader::new(f, metadata.clone(), None);

let options = write::WriteOptions { compression: None };
- let mut writer = write::StreamWriter::try_new(std::io::stdout(), schema, options)?;
+ let mut writer = write::StreamWriter::new(std::io::stdout(), options);

+ writer.start(&metadata.schema, &metadata.ipc_schema.fields)?;

reader.try_for_each(|batch| {
let batch = batch?;
- writer.write(&batch)
+ writer.write(&batch, &metadata.ipc_schema.fields)
})?;
writer.finish()?;

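The stream writer also changes shape: it is now constructed without a schema and started explicitly with the schema and IPC fields, which are passed again for every written batch. A minimal sketch of that flow with a generic sink, following the updated arrow-file-to-stream.rs above; the function name and the &[IpcField] parameter types are assumptions (the binary above passes &metadata.ipc_schema.fields directly):

use std::io::Write;

use arrow2::datatypes::Schema;
use arrow2::error::Result;
use arrow2::io::ipc::write;
use arrow2::io::ipc::IpcField;
use arrow2::record_batch::RecordBatch;

// Sketch of the post-#713 StreamWriter flow: construct with only the sink and
// options, call `start` with schema + IPC fields, then hand the fields to each
// `write` call.
fn write_stream<W: Write>(
    sink: W,
    schema: &Schema,
    ipc_fields: &[IpcField],
    batches: &[RecordBatch],
) -> Result<()> {
    let options = write::WriteOptions { compression: None };
    let mut writer = write::StreamWriter::new(sink, options);

    writer.start(schema, ipc_fields)?;
    for batch in batches {
        writer.write(batch, ipc_fields)?;
    }
    writer.finish()?;

    Ok(())
}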
42 changes: 12 additions & 30 deletions integration-testing/src/bin/arrow-json-integration-test.rs
@@ -1,29 +1,13 @@
- // Licensed to the Apache Software Foundation (ASF) under one
- // or more contributor license agreements. See the NOTICE file
- // distributed with this work for additional information
- // regarding copyright ownership. The ASF licenses this file
- // to you under the Apache License, Version 2.0 (the
- // "License"); you may not use this file except in compliance
- // with the License. You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing,
- // software distributed under the License is distributed on an
- // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- // KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations
- // under the License.

use std::fs::File;

+ use arrow2::io::json_integration::ArrowJson;
use clap::{App, Arg};

use arrow2::io::ipc::read;
use arrow2::io::ipc::write;
use arrow2::{
error::{ArrowError, Result},
- io::json_integration::*,
+ io::json_integration::write as json_write,
};
use arrow_integration_testing::read_json_file;

@@ -82,10 +66,15 @@ fn json_to_arrow(json_name: &str, arrow_name: &str, verbose: bool) -> Result<()>

let arrow_file = File::create(arrow_name)?;
let options = write::WriteOptions { compression: None };
- let mut writer = write::FileWriter::try_new(arrow_file, &json_file.schema, options)?;
+ let mut writer = write::FileWriter::try_new(
+ arrow_file,
+ &json_file.schema,
+ Some(json_file.fields),
+ options,
+ )?;

for b in json_file.batches {
- writer.write(&b)?;
+ writer.write(&b, None)?;
}

writer.finish()?;
@@ -100,19 +89,12 @@ fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()>

let mut arrow_file = File::open(arrow_name)?;
let metadata = read::read_file_metadata(&mut arrow_file)?;
- let reader = read::FileReader::new(arrow_file, metadata, None);
+ let reader = read::FileReader::new(arrow_file, metadata.clone(), None);

- let mut fields: Vec<ArrowJsonField> = vec![];
- for f in reader.schema().fields() {
- fields.push(ArrowJsonField::from(f));
- }
- let schema = ArrowJsonSchema {
- fields,
- metadata: None,
- };
+ let schema = json_write::serialize_schema(&metadata.schema, &metadata.ipc_schema.fields);

let batches = reader
- .map(|batch| Ok(from_record_batch(&batch?)))
+ .map(|batch| Ok(json_write::from_record_batch(&batch?)))
.collect::<Result<Vec<_>>>()?;

let arrow_json = ArrowJson {
12 changes: 8 additions & 4 deletions integration-testing/src/bin/arrow-stream-to-file.rs
@@ -24,15 +24,19 @@ use arrow2::io::ipc::write
fn main() -> Result<()> {
let mut reader = io::stdin();
let metadata = read::read_stream_metadata(&mut reader)?;
- let mut arrow_stream_reader = read::StreamReader::new(reader, metadata);
- let schema = arrow_stream_reader.schema();
+ let mut arrow_stream_reader = read::StreamReader::new(reader, metadata.clone());

let writer = io::stdout();

let options = write::WriteOptions { compression: None };
- let mut writer = write::FileWriter::try_new(writer, schema, options)?;
+ let mut writer = write::FileWriter::try_new(
+ writer,
+ &metadata.schema,
+ Some(metadata.ipc_schema.fields),
+ options,
+ )?;

- arrow_stream_reader.try_for_each(|batch| writer.write(&batch?.unwrap()))?;
+ arrow_stream_reader.try_for_each(|batch| writer.write(&batch?.unwrap(), None))?;
writer.finish()?;

Ok(())
(Diffs for the remaining 44 changed files are not shown here.)
