This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 221
/
parquet_read.rs
46 lines (35 loc) · 1.71 KB
/
parquet_read.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
use std::fs::File;
use std::io::BufReader;
use arrow2::io::parquet::read;
use arrow2::{array::Array, error::Result};
/// Reads one column chunk (`field`) of one `row_group` from the parquet file
/// at `path` and deserializes it into an arrow [`Array`].
fn read_field(path: &str, row_group: usize, field: usize) -> Result<Box<dyn Array>> {
    // Buffered handle over the file; reading the metadata below seeks to
    // the parquet footer at the end of the file.
    let mut reader = BufReader::new(File::open(path)?);

    // Parse the footer: row groups, column chunks, statistics. Small IO cost.
    let metadata = read::read_metadata(&mut reader)?;

    // Derive the arrow schema — either by parsing the thrift-embedded arrow
    // schema (when present as a metadata key) or by inferring it from the
    // parquet physical / converted / logical types. CPU-only.
    let schema = read::get_schema(&metadata)?;

    // Lazily iterate the compressed pages of the requested column chunk;
    // almost no CPU work happens while merely iterating.
    let pages =
        read::get_column_iterator(&mut reader, &metadata, row_group, field, None, vec![]);

    // Arrow field descriptor for the requested column index.
    let data_field = &schema.fields[field];

    // Drive the iterator: read, decompress, decode and deserialize the pages
    // into a single arrow array — a mix of IO and CPU work.
    let (array, _, _) = read::column_iter_to_array(pages, data_field, vec![])?;
    Ok(array)
}
/// CLI entry point: `parquet_read <file> <field-index> <row-group-index>`.
///
/// Reads the requested column chunk and prints the decoded arrow array.
fn main() -> Result<()> {
    use std::env;
    let args: Vec<String> = env::args().collect();

    // Validate the argument count up front: indexing `args[1..3]` directly
    // would panic with an opaque out-of-bounds message when args are missing.
    if args.len() < 4 {
        eprintln!("usage: {} <file> <field-index> <row-group-index>", args[0]);
        std::process::exit(1);
    }

    let file_path = &args[1];
    // `expect` with a message naming the bad argument instead of a bare
    // `unwrap()` panic on user-supplied input.
    let field = args[2]
        .parse::<usize>()
        .expect("<field-index> must be a non-negative integer");
    let row_group = args[3]
        .parse::<usize>()
        .expect("<row-group-index> must be a non-negative integer");

    let array = read_field(file_path, row_group, field)?;
    println!("{:?}", array);
    Ok(())
}