Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added support to mmap IPC format #1197

Merged
merged 8 commits into from
Aug 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,25 @@ jobs:
- name: Run
run: MIRIFLAGS="-Zmiri-disable-isolation" cargo miri test --tests --features io_ipc,io_json_integration io::ipc::write::write_sliced_list

# CI job: runs the tests matching `io::ipc::mmap` under Miri with the
# `io_ipc` feature enabled.
# NOTE(review): unlike the Miri job just above, this one does not set
# MIRIFLAGS="-Zmiri-disable-isolation" — presumably these tests do not touch
# the host file system; confirm.
miri-checks-mmap:
name: MIRI on IO IPC mmaping
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
# Pinned nightly toolchain, made the default for the following steps.
- uses: actions-rs/toolchain@v1
with:
toolchain: nightly-2022-07-12
override: true
# Rust build cache, keyed with `key1`.
- uses: Swatinem/rust-cache@v1
with:
key: key1
# Add the Miri component to the toolchain and run its setup step.
- name: Install Miri
run: |
rustup component add miri
cargo miri setup
# `io::ipc::mmap` is the test-name filter passed to `cargo miri test`.
- name: Run
run: cargo miri test --tests --features io_ipc io::ipc::mmap

feature-compilation:
name: Feature coverage
runs-on: ubuntu-latest
Expand Down
35 changes: 35 additions & 0 deletions examples/ipc_file_mmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//! Example showing how to memory map an Arrow IPC file into a [`Chunk`].
use std::sync::Arc;

use arrow2::error::Result;
use arrow2::io::ipc::read;
use arrow2::mmap::{mmap_dictionaries_unchecked, mmap_unchecked};

/// Mock of a memory-mapped file.
///
/// Arrow2 needs an owner of the bytes that implements `Clone + AsRef<[u8]>`,
/// which an `Arc`-wrapped memory map usually provides. This example stands in
/// for one with a reference-counted, in-memory byte buffer.
#[derive(Clone)]
struct Mmap(Arc<Vec<u8>>);

impl AsRef<[u8]> for Mmap {
    /// Exposes the wrapped buffer as a plain byte slice.
    #[inline]
    fn as_ref(&self) -> &[u8] {
        self.0.as_slice()
    }
}

/// Walks through memory mapping an Arrow IPC file: read the file metadata,
/// map the dictionaries, then map chunk 0 and print it.
fn main() -> Result<()> {
    // Stand-in for a real memory map.
    // NOTE(review): the mocked buffer is empty, so reading the metadata
    // presumably returns an error at runtime — substitute the bytes of an
    // actual IPC file to see a chunk printed.
    let data = Mmap(Arc::new(Vec::new()));

    // The metadata is read through an ordinary cursor over the mapped bytes.
    let mut reader = std::io::Cursor::new(data.as_ref());
    let file_metadata = read::read_file_metadata(&mut reader)?;

    // Dictionaries come first: the chunk call below takes them as an input.
    // NOTE(review): both `_unchecked` calls are `unsafe`; presumably the caller
    // vouches that the bytes form a valid IPC file — confirm the exact
    // contract in the function docs.
    let dicts = unsafe { mmap_dictionaries_unchecked(&file_metadata, data.clone()) }?;

    // Finally map a chunk (index 0 here); `data` is moved into this call.
    let chunk = unsafe { mmap_unchecked(&file_metadata, &dicts, data, 0) }?;

    println!("{chunk:?}");
    Ok(())
}
1 change: 1 addition & 0 deletions guide/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- [Read Parquet](./io/parquet_read.md)
- [Write Parquet](./io/parquet_write.md)
- [Read Arrow](./io/ipc_read.md)
- [Memory map Arrow](./io/ipc_mmap.md)
- [Read Arrow stream](./io/ipc_stream_read.md)
- [Write Arrow](./io/ipc_write.md)
- [Read Avro](./io/avro_read.md)
Expand Down
10 changes: 10 additions & 0 deletions guide/src/io/ipc_mmap.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Memory map Arrow

When compiled with feature `io_ipc`, this crate can be used to memory map IPC Arrow files
into arrays.

The example below shows how to memory map an IPC Arrow file into `Chunk`s:

```rust
{{#include ../../../examples/ipc_file_mmap.rs}}
```
2 changes: 1 addition & 1 deletion src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ impl<O: Offset> ListArray<O> {
/// Returns the inner [`Field`]
/// # Errors
/// Errors iff the logical type is not consistent with this struct.
fn try_get_child(data_type: &DataType) -> Result<&Field, Error> {
pub fn try_get_child(data_type: &DataType) -> Result<&Field, Error> {
if O::IS_LARGE {
match data_type.to_logical_type() {
DataType::LargeList(child) => Ok(child.as_ref()),
Expand Down
2 changes: 1 addition & 1 deletion src/ffi/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ impl InternalArrowArray {
}
}

impl ArrowArrayRef for Box<InternalArrowArray> {
impl ArrowArrayRef for InternalArrowArray {
/// the data_type as declared in the schema
fn data_type(&self) -> &DataType {
&self.data_type
Expand Down
Loading