- Author: Benjamin Du
- Date: 2023-01-01 18:52:22
- Modified: 2023-01-03 10:48:35
- Title: Read Parquet Files Using Polars in Rust
- Slug: read-parquet-files-using-polars-in-rust
- Category: Computer Science
- Tags: Computer Science, programming, Rust, Polars, Parquet, scan_parquet, DataFrame, IO

**Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!**

In [None]:
:timing
:sccache 1
:dep polars = { version = "0.26.1", features = ["lazy", "parquet"] }

In [None]:
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;

In [8]:
// Lazily scan a single Parquet file, then `collect()` to execute the query
// and materialize the result as an in-memory DataFrame.
// Each `?` propagates a scan/collect error out of the cell instead of panicking.
// The binding is immutable because nothing in this cell modifies the frame
// after it has been collected.
let frame = LazyFrame::scan_parquet(
        "part-000.parquet",
        ScanArgsParquet::default(),
    )?
    .collect()?;
// Trailing expression without `;` so the notebook renders the DataFrame.
frame

shape: (4002557, 4)
┌──────┬──────────────────┬──────────────────┬─────────────┐
│ id0  ┆ id1              ┆ id2              ┆ score_r4_it │
│ ---  ┆ ---              ┆ ---              ┆ ---         │
│ u64  ┆ u64              ┆ u64              ┆ f64         │
╞══════╪══════════════════╪══════════════════╪═════════════╡
│ 2    ┆ 16796161         ┆ 4503599635760400 ┆ 1.07059     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16   ┆ 8293             ┆ 3288727552       ┆ -0.085568   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2    ┆ 4503599660926465 ┆ 28684            ┆ 4.779815    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4503599627386947 ┆ 274911465616     ┆ 4.650999    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...  ┆ ...              ┆ ...              ┆ ...         │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 68727866378      ┆ 4224323673915392 ┆ 11.800134   │
├╌╌╌

## Count the Total Number of Rows of All Parquet Files

In [46]:
LazyFrame::scan_parquet(
        "data/test/**/*.parquet",
        ScanArgsParquet::default(),
    ).unwrap().select(
        &[count().cast(DataType::UInt64).alias("n")]
    ).collect().unwrap()["n"].u64().unwrap().get(0).unwrap()

382761717

In [37]:
// Same row count as a one-column DataFrame instead of a bare scalar.
// First build the lazy scan over all Parquet files matched by the glob.
let frame = LazyFrame::scan_parquet("data/test/**/*.parquet", ScanArgsParquet::default())
    .unwrap();
// `select` consumes the lazy frame; `collect` runs the query and yields a
// DataFrame whose only column "n" holds the count cast to u64.
let df = frame
    .select(&[count().cast(DataType::UInt64).alias("n")])
    .collect()
    .unwrap();
// Trailing expression without `;` so the notebook renders the DataFrame.
df

shape: (1, 1)
┌───────────┐
│ n         │
│ ---       │
│ u64       │
╞═══════════╡
│ 382761717 │
└───────────┘