diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index f1a37910c65..2a1d92de9a1 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -49,6 +49,6 @@ jobs:
         uses: peaceiris/actions-gh-pages@v3.7.3
         with:
           personal_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: ./target/doc/arrow2
+          publish_dir: ./target/doc
           destination_dir: ./${{github.ref_name}}/docs
           keep_files: false
diff --git a/README.md b/README.md
index 51818066b45..0b9dba8e51e 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ implementation.
 
 Check out [the guide](https://jorgecarleitao.github.io/arrow2/main/guide) for a
 general introduction on how to use this crate, and
-[API docs](https://jorgecarleitao.github.io/arrow2/main/docs)
+[API docs](https://jorgecarleitao.github.io/arrow2/main/docs/arrow2)
 for a detailed documentation of each of its APIs.
 
 ## Features
diff --git a/examples/ipc_file_mmap.rs b/examples/ipc_file_mmap.rs
index 51dd2a27477..c58e55657a4 100644
--- a/examples/ipc_file_mmap.rs
+++ b/examples/ipc_file_mmap.rs
@@ -1,13 +1,13 @@
 //! Example showing how to memory map an Arrow IPC file into a [`Chunk`].
 use std::sync::Arc;
 
-use arrow2::error::Result;
-use arrow2::io::ipc::read;
-use arrow2::mmap::{mmap_dictionaries_unchecked, mmap_unchecked};
+use arrow2::array::{Array, BooleanArray};
+use arrow2::chunk::Chunk;
+use arrow2::datatypes::{Field, Schema};
+use arrow2::error::Error;
 
-// Arrow2 requires a struct that implements `Clone + AsRef<[u8]>`, which
-// usually `Arc` supports. Here we mock it
-#[derive(Clone)]
+// Arrow2 requires something that implements `AsRef<[u8]>`, which
+// `Mmap` supports. Here we mock it
 struct Mmap(Vec<u8>);
 
 impl AsRef<[u8]> for Mmap {
@@ -17,19 +17,49 @@ impl AsRef<[u8]> for Mmap {
     }
 }
 
-fn main() -> Result<()> {
+// Auxiliary function to write an arrow file
+// This function is guaranteed to produce a valid arrow file
+fn write(
+    chunks: &[Chunk<Box<dyn Array>>],
+    schema: &Schema,
+    ipc_fields: Option<Vec<arrow2::io::ipc::IpcField>>,
+    compression: Option<arrow2::io::ipc::write::Compression>,
+) -> Result<Vec<u8>, Error> {
+    let result = vec![];
+    let options = arrow2::io::ipc::write::WriteOptions { compression };
+    let mut writer =
+        arrow2::io::ipc::write::FileWriter::try_new(result, schema, ipc_fields.clone(), options)?;
+    for chunk in chunks {
+        writer.write(chunk, ipc_fields.as_ref().map(|x| x.as_ref()))?;
+    }
+    writer.finish()?;
+    Ok(writer.into_inner())
+}
+
+fn check_round_trip(array: Box<dyn Array>) -> Result<(), Error> {
+    let schema = Schema::from(vec![Field::new("a", array.data_type().clone(), true)]);
+    let columns = Chunk::try_new(vec![array.clone()])?;
+
     // given a mmap
-    let mmap = Arc::new(Mmap(vec![]));
+    let data = Arc::new(write(&[columns], &schema, None, None)?);
 
-    // read the metadata
-    let metadata = read::read_file_metadata(&mut std::io::Cursor::new(mmap.as_ref()))?;
+    // we first read the file's metadata
+    let metadata =
+        arrow2::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(data.as_ref()))?;
 
-    // mmap the dictionaries
-    let dictionaries = unsafe { mmap_dictionaries_unchecked(&metadata, mmap.clone())? };
+    // next we mmap the dictionaries
+    // Safety: `write` above guarantees that this is a valid Arrow IPC file
+    let dictionaries =
+        unsafe { arrow2::mmap::mmap_dictionaries_unchecked(&metadata, data.clone())? };
 
     // and finally mmap a chunk (0 in this case).
-    let chunk = unsafe { mmap_unchecked(&metadata, &dictionaries, mmap, 0) }?;
-
-    println!("{chunk:?}");
+    // Safety: `write` above guarantees that this is a valid Arrow IPC file
+    let new_array = unsafe { arrow2::mmap::mmap_unchecked(&metadata, &dictionaries, data, 0)? };
+
+    assert_eq!(new_array.into_arrays()[0], array);
     Ok(())
 }
+
+fn main() -> Result<(), Error> {
+    let array = BooleanArray::from([None, None, Some(true)]).boxed();
+    check_round_trip(array)
+}
diff --git a/guide/src/README.md b/guide/src/README.md
index de2cd418bea..6256537501b 100644
--- a/guide/src/README.md
+++ b/guide/src/README.md
@@ -5,10 +5,29 @@ interoperability with the arrow format.
 
 The typical use-case for this library is to perform CPU and memory-intensive analytics in a format that supports heterogeneous data structures, null values, and IPC and FFI interfaces across languages.
 
-Arrow2 is divided into 5 main parts:
+Arrow2 is divided into 5 main APIs:
 
-* a [low-level API](./low_level.md) to efficiently operate with contiguous memory regions;
-* a [high-level API](./high_level.md) to operate with arrow arrays;
-* a [metadata API](./metadata.md) to declare and operate with logical types and metadata;
-* a [compute API](./compute.md) with operators to operate over arrays;
-* an [IO API](./io/README.md) with interfaces to read from, and write to, other formats.
+* a [low-level API](./low_level.md) to efficiently operate with contiguous memory regions
+* a [high-level API](./high_level.md) to operate with arrow arrays
+* a [metadata API](./metadata.md) to declare and operate with logical types and metadata
+* a [compute API](./compute.md) with operators to operate over arrays
+* an IO API with interfaces to read from, and write to, other formats
+  * Arrow
+    * [Read files](./io/ipc_read.md)
+    * [Read streams](./io/ipc_stream_read.md)
+    * [Memory map files](./io/ipc_mmap.md)
+    * [Write](./io/ipc_write.md)
+  * CSV
+    * [Read](./io/csv_read.md)
+    * [Write](./io/csv_write.md)
+  * Parquet
+    * [Read](./io/parquet_read.md)
+    * [Write](./io/parquet_write.md)
+  * JSON and NDJSON
+    * [Read](./io/json_read.md)
+    * [Write](./io/json_write.md)
+  * Avro
+    * [Read](./io/avro_read.md)
+    * [Write](./io/avro_write.md)
+  * ODBC
+    * [Read and write](./io/odbc.md)
diff --git a/guide/src/high_level.md b/guide/src/high_level.md
index b460e44f5c2..78d71215db4 100644
--- a/guide/src/high_level.md
+++ b/guide/src/high_level.md
@@ -279,5 +279,5 @@ Below is a complete example of how to operate on a `Box<dyn Array>` without
 extra allocations.
 
 ```rust,ignore
-{{#include ../examples/cow.rs}}
+{{#include ../../examples/cow.rs}}
 ```
diff --git a/guide/src/io/README.md b/guide/src/io/README.md
index 5a8cd477949..1b3628b3b01 100644
--- a/guide/src/io/README.md
+++ b/guide/src/io/README.md
@@ -1,12 +1 @@
 # IO
-
-This crate offers optional features that enable interoperability with different formats:
-
-* Arrow (`io_ipc`)
-* CSV (`io_csv`)
-* Parquet (`io_parquet`)
-* JSON and NDJSON (`io_json`)
-* Avro (`io_avro` and `io_avro_async`)
-* ODBC-compliant databases (`io_odbc`)
-
-In this section you can find a guide and examples for each one of them.
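The example above mocks the memory-mapped region with a `Vec<u8>` so that it stays self-contained. Below is a minimal sketch of the same round trip against an on-disk file, assuming the external `memmap2` crate (not a dependency of this example); the `MmapBytes` wrapper and `mmap_ipc_file` helper are hypothetical names, and only `Mmap`'s `Deref<Target = [u8]>` is relied upon:

```rust
use std::fs::File;
use std::sync::Arc;

use arrow2::error::Error;

// Hypothetical wrapper: exposes the mapped region as `AsRef<[u8]>`,
// the bound required by `arrow2::mmap::*`, relying only on `Deref`
struct MmapBytes(memmap2::Mmap);

impl AsRef<[u8]> for MmapBytes {
    #[inline]
    fn as_ref(&self) -> &[u8] {
        &self.0[..]
    }
}

// Hypothetical helper: memory map an existing, *trusted* Arrow IPC file
fn mmap_ipc_file(path: &str) -> Result<(), Error> {
    let file = File::open(path)?;
    // Safety: the file must not be truncated or modified while mapped
    let data = Arc::new(MmapBytes(unsafe { memmap2::Mmap::map(&file)? }));

    // read the metadata from the mapped bytes
    let metadata =
        arrow2::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(data.as_ref()))?;

    // Safety: sound only for valid (i.e. trusted) Arrow IPC files, since
    // the `*_unchecked` functions skip validation
    let dictionaries =
        unsafe { arrow2::mmap::mmap_dictionaries_unchecked(&metadata, data.clone())? };
    let chunk = unsafe { arrow2::mmap::mmap_unchecked(&metadata, &dictionaries, data, 0)? };

    println!("{chunk:?}");
    Ok(())
}
```

The `Mmap(Vec<u8>)` mock in the example remains the dependency-free version; this sketch only illustrates where a real memory map would slot in.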
diff --git a/guide/src/io/odbc.md b/guide/src/io/odbc.md
index 7e362daf7c6..627cda7501a 100644
--- a/guide/src/io/odbc.md
+++ b/guide/src/io/odbc.md
@@ -4,5 +4,5 @@ When compiled with feature `io_odbc`, this crate can be used to read from, and write to
 any [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity) interface:
 
 ```rust
-{{#include ../../../examples/odbc.rs}}
+{{#include ../../../examples/io_odbc.rs}}
 ```
diff --git a/guide/src/io/parquet_read.md b/guide/src/io/parquet_read.md
index 367721d39c6..ddcb5d453a6 100644
--- a/guide/src/io/parquet_read.md
+++ b/guide/src/io/parquet_read.md
@@ -19,13 +19,6 @@ Here is how to read a single column chunk from a single row group:
 The example above minimizes memory usage at the expense of mixing IO and CPU tasks
 on the same thread, which may hurt performance if one of them is a bottleneck.
 
-For single-threaded reading, buffers used to read and decompress pages can be re-used.
-This create offers an API that encapsulates the above logic:
-
-```rust
-{{#include ../../../examples/parquet_read_record.rs}}
-```
-
 ### Parallelism decoupling of CPU from IO
 
 One important aspect of the pages created by the iterator above is that they can cross
@@ -38,7 +31,7 @@ and that it is advantageous to have a single thread performing all IO-intensive
 by delegating all CPU-intensive tasks to separate threads.
 
 ```rust
-{{#include ../../../examples/parquet_read_parallel.rs}}
+{{#include ../../../examples/parquet_read_parallel/src/main.rs}}
 ```
 
 This can of course be reversed; in configurations where IO is bounded (e.g. when a
diff --git a/src/io/ipc/write/common.rs b/src/io/ipc/write/common.rs
index 8b1a2a03c56..1672d69bc01 100644
--- a/src/io/ipc/write/common.rs
+++ b/src/io/ipc/write/common.rs
@@ -172,14 +172,14 @@ fn encode_dictionary(
 }
 
 pub fn encode_chunk(
-    columns: &Chunk<Box<dyn Array>>,
+    chunk: &Chunk<Box<dyn Array>>,
     fields: &[IpcField],
     dictionary_tracker: &mut DictionaryTracker,
     options: &WriteOptions,
 ) -> Result<(Vec<EncodedData>, EncodedData)> {
     let mut encoded_dictionaries = vec![];
 
-    for (field, array) in fields.iter().zip(columns.as_ref()) {
+    for (field, array) in fields.iter().zip(chunk.as_ref()) {
         encode_dictionary(
             field,
             array.as_ref(),
@@ -189,7 +189,7 @@ pub fn encode_chunk(
         )?;
     }
 
-    let encoded_message = columns_to_bytes(columns, options);
+    let encoded_message = chunk_to_bytes(chunk, options);
 
     Ok((encoded_dictionaries, encoded_message))
 }
@@ -213,12 +213,12 @@ fn serialize_compression(
 
 /// Write [`Chunk`] into two sets of bytes, one for the header (ipc::Schema::Message) and the
 /// other for the batch's data
-fn columns_to_bytes(columns: &Chunk<Box<dyn Array>>, options: &WriteOptions) -> EncodedData {
+fn chunk_to_bytes(chunk: &Chunk<Box<dyn Array>>, options: &WriteOptions) -> EncodedData {
     let mut nodes: Vec<arrow_format::ipc::FieldNode> = vec![];
     let mut buffers: Vec<arrow_format::ipc::Buffer> = vec![];
     let mut arrow_data: Vec<u8> = vec![];
     let mut offset = 0;
-    for array in columns.arrays() {
+    for array in chunk.arrays() {
         write(
             array.as_ref(),
             &mut buffers,
@@ -236,7 +236,7 @@ fn columns_to_bytes(columns: &Chunk<Box<dyn Array>>, options: &WriteOptions) ->
             version: arrow_format::ipc::MetadataVersion::V5,
             header: Some(arrow_format::ipc::MessageHeader::RecordBatch(Box::new(
                 arrow_format::ipc::RecordBatch {
-                    length: columns.len() as i64,
+                    length: chunk.len() as i64,
                     nodes: Some(nodes),
                     buffers: Some(buffers),
                     compression,
diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs
index 65354aca83e..600dd92cb67 100644
--- a/src/io/ipc/write/writer.rs
+++ b/src/io/ipc/write/writer.rs
@@ -118,7 +118,7 @@ impl<W: Write> FileWriter<W> {
     /// Writes [`Chunk`] to the file
     pub fn write(
         &mut self,
-        columns: &Chunk<Box<dyn Array>>,
+        chunk: &Chunk<Box<dyn Array>>,
         ipc_fields: Option<&[IpcField]>,
     ) -> Result<()> {
         if self.state != State::Started {
@@ -134,7 +134,7 @@ impl<W: Write> FileWriter<W> {
         };
 
         let (encoded_dictionaries, encoded_message) = encode_chunk(
-            columns,
+            chunk,
             ipc_fields,
             &mut self.dictionary_tracker,
             &self.options,
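For reference, this is how the renamed `write` reads from the caller's side — a minimal sketch reusing the `FileWriter` calls already shown in `examples/ipc_file_mmap.rs` above; the schema and values are illustrative:

```rust
use arrow2::array::Int32Array;
use arrow2::chunk::Chunk;
use arrow2::datatypes::{DataType, Field, Schema};
use arrow2::error::Error;
use arrow2::io::ipc::write::{FileWriter, WriteOptions};

fn main() -> Result<(), Error> {
    let schema = Schema::from(vec![Field::new("a", DataType::Int32, false)]);
    // a `Chunk` is the unit passed to `write` — hence the parameter rename
    let chunk = Chunk::try_new(vec![Int32Array::from_slice([1, 2, 3]).boxed()])?;

    let options = WriteOptions { compression: None };
    let mut writer = FileWriter::try_new(Vec::<u8>::new(), &schema, None, options)?;
    writer.write(&chunk, None)?;
    writer.finish()?;

    assert!(!writer.into_inner().is_empty());
    Ok(())
}
```

The rename is purely cosmetic: `encode_chunk`, `chunk_to_bytes`, and `FileWriter::write` now name the parameter after the `Chunk` type they receive, with no behavioral change.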