diff --git a/Cargo.toml b/Cargo.toml
index c823cb058ba..869fe4fe00a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -80,7 +80,7 @@ crc = { version = "2", optional = true }
 async-stream = { version = "0.3.2", optional = true }
 
 # ORC support
-orc-format = { git = "https://github.com/DataEngineeringLabs/orc-format.git", branch = "iters", optional = true }
+orc-format = { version = "0.3.0", optional = true }
 
 # Arrow integration tests support
 serde = { version = "^1.0", features = ["rc"], optional = true }
diff --git a/examples/orc_read.rs b/examples/orc_read.rs
new file mode 100644
index 00000000000..f1a5acee4dd
--- /dev/null
+++ b/examples/orc_read.rs
@@ -0,0 +1,68 @@
+//! Example: read a single column of an ORC file into an Arrow array.
+//!
+//! Usage: `orc_read <file_path> <column_name>`
+use arrow2::array::*;
+use arrow2::error::Error;
+use arrow2::io::orc::{format, read};
+
+/// Reads the column named `column_name` from the first stripe of the ORC
+/// file at `path` and deserializes it into an Arrow array.
+///
+/// # Errors
+/// Returns an error if the file cannot be opened or read, if no field of the
+/// inferred schema is named `column_name`, or if deserialization fails.
+fn deserialize_column(path: &str, column_name: &str) -> Result<Box<dyn Array>, Error> {
+    // open the file
+    let mut reader = std::fs::File::open(path)?;
+
+    // read its metadata (IO-bounded)
+    let metadata = format::read::read_metadata(&mut reader)?;
+
+    // infer its (Arrow) [`Schema`]
+    let schema = read::infer_schema(&metadata.footer)?;
+
+    // find the position of the column in the schema
+    let (pos, field) = schema
+        .fields
+        .iter()
+        .enumerate()
+        .find(|(_, field)| field.name == column_name)
+        .ok_or_else(|| {
+            Error::InvalidArgumentError(format!("column \"{column_name}\" not found in schema"))
+        })?;
+
+    // pick a stripe (basically a set of rows)
+    let stripe = 0;
+
+    // read the stripe's footer (IO-bounded)
+    let footer = format::read::read_stripe_footer(&mut reader, &metadata, stripe, &mut vec![])?;
+
+    // read the column's data from the stripe (IO-bounded)
+    let data_type = field.data_type.clone();
+    let column = format::read::read_stripe_column(
+        &mut reader,
+        &metadata,
+        stripe,
+        footer,
+        // 1 because ORC schemas always start with a struct, which we ignore
+        1 + pos as u32,
+        vec![],
+    )?;
+
+    // finally, deserialize to Arrow (CPU-bounded)
+    read::deserialize(data_type, &column)
+}
+
+/// Entry point: expects the ORC file path and the column name as the first
+/// and second command-line arguments, and prints the deserialized column.
+fn main() -> Result<(), Error> {
+    use std::env;
+    let args: Vec<String> = env::args().collect();
+
+    let file_path = &args[1];
+    let column = &args[2];
+
+    let array = deserialize_column(file_path, column)?;
+    println!("{array:?}");
+    Ok(())
+}
diff --git a/src/io/mod.rs b/src/io/mod.rs
index bc7d218ad36..69e4657fd75 100644
--- a/src/io/mod.rs
+++ b/src/io/mod.rs
@@ -6,6 +6,7 @@
 pub mod odbc;
 
 #[cfg(feature = "io_orc")]
+#[cfg_attr(docsrs, doc(cfg(feature = "io_orc")))]
 pub mod orc;
 
 #[cfg(any(
diff --git a/tests/it/io/orc/read.rs b/tests/it/io/orc/read.rs
index 9d6ccaee4ff..a35c54d34e6 100644
--- a/tests/it/io/orc/read.rs
+++ b/tests/it/io/orc/read.rs
@@ -1,5 +1,4 @@
 use arrow2::array::*;
-use arrow2::datatypes::DataType;
 use arrow2::error::Error;
 use arrow2::io::orc::{format, read};
 