Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added example
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Jul 30, 2022
1 parent e0a9b5e commit 25d40f2
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 3 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/test.yml
Expand Up @@ -42,7 +42,9 @@ jobs:
- uses: Swatinem/rust-cache@v1
- name: Run
shell: bash
run: ARROW2_IGNORE_PARQUET= cargo test --features full
run: |
cargo check --features full
cargo test --tests
clippy:
name: Clippy
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Expand Up @@ -80,7 +80,7 @@ crc = { version = "2", optional = true }
async-stream = { version = "0.3.2", optional = true }

# ORC support
orc-format = { git = "https://github.com/DataEngineeringLabs/orc-format.git", branch = "iters", optional = true }
orc-format = { version = "0.3.0", optional = true }

# Arrow integration tests support
serde = { version = "^1.0", features = ["rc"], optional = true }
Expand Down
55 changes: 55 additions & 0 deletions examples/orc_read.rs
@@ -0,0 +1,55 @@
use arrow2::array::*;
use arrow2::error::Error;
use arrow2::io::orc::{format, read};

fn deserialize_column(path: &str, column_name: &str) -> Result<Box<dyn Array>, Error> {
// open the file
let mut reader = std::fs::File::open(path).unwrap();

// read its metadata (IO-bounded)
let metadata = format::read::read_metadata(&mut reader)?;

// infer its (Arrow) [`Schema`]
let schema = read::infer_schema(&metadata.footer)?;

// find the position of the column in the schema
let (pos, field) = schema
.fields
.iter()
.enumerate()
.find(|f| f.1.name == column_name)
.unwrap();

// pick a stripe (basically a set of rows)
let stripe = 0;

// read the stripe's footer (IO-bounded)
let footer = format::read::read_stripe_footer(&mut reader, &metadata, stripe, &mut vec![])?;

// read the column's data from the stripe (IO-bounded)
let data_type = field.data_type.clone();
let column = format::read::read_stripe_column(
&mut reader,
&metadata,
0,
footer,
// 1 because ORC schemas always start with a struct, which we ignore
1 + pos as u32,
vec![],
)?;

// finally, deserialize to Arrow (CPU-bounded)
read::deserialize(data_type, &column)
}

fn main() -> Result<(), Error> {
use std::env;
let args: Vec<String> = env::args().collect();

let file_path = &args[1];
let column = &args[2];

let array = deserialize_column(file_path, column)?;
println!("{array:?}");
Ok(())
}
1 change: 1 addition & 0 deletions src/io/mod.rs
Expand Up @@ -6,6 +6,7 @@
pub mod odbc;

#[cfg(feature = "io_orc")]
#[cfg_attr(docsrs, doc(cfg(feature = "io_orc")))]
pub mod orc;

#[cfg(any(
Expand Down
1 change: 0 additions & 1 deletion tests/it/io/orc/read.rs
@@ -1,5 +1,4 @@
use arrow2::array::*;
use arrow2::datatypes::DataType;
use arrow2::error::Error;
use arrow2::io::orc::{format, read};

Expand Down

0 comments on commit 25d40f2

Please sign in to comment.