- Author: Benjamin Du
- Date: 2023-01-01 18:52:22
- Modified: 2023-01-03 10:48:35
- Title: Read Parquet Files Using Polars in Rust
- Slug: read-parquet-files-using-polars-in-rust
- Category: Computer Science
- Tags: Computer Science, programming, Rust, Polars, Parquet, scan_parquet, DataFrame, IO

**Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!**

In [2]:
:timing
:sccache 1
:dep polars = { version = "0.42.0", features = ["lazy", "parquet"] }

Timing: true


In [5]:
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;

In [6]:
// Lazily scan the Parquet file with the default scan options, then materialize
// it as an in-memory DataFrame. `?` propagates a failure from either step.
let mut frame = LazyFrame::scan_parquet("part-000.parquet", ScanArgsParquet::default())?
    .collect()?;
// Trailing expression without `;` so that the notebook displays the frame.
frame

shape: (10_498_456, 4)
┌──────────────────┬───────────────────┬──────────────────┬─────────────┐
│ id0              ┆ id1               ┆ id2              ┆ score_r4_it │
│ ---              ┆ ---               ┆ ---              ┆ ---         │
│ u64              ┆ u64               ┆ u64              ┆ f64         │
╞══════════════════╪═══════════════════╪══════════════════╪═════════════╡
│ 33701888         ┆ 13510798882120448 ┆ 4101             ┆ -0.954137   │
│ 28               ┆ 4505798920142848  ┆ 2097282          ┆ -11.599546  │
│ 41943042         ┆ 275951782400      ┆ 336              ┆ -3.984118   │
│ 18939904         ┆ 4503599637331969  ┆ 26               ┆ -1.175188   │
│ 416              ┆ 4503599628682241  ┆ 74               ┆ -0.97172    │
│ …                ┆ …                 ┆ …                ┆ …           │
│ 2195456          ┆ 9007199523700769  ┆ 4503599627370502 ┆ 1.033871    │
│ 14               ┆ 557840            ┆ 224              ┆ -0.068309   │
│ 4503599627374

## Count the Total Number of Rows of All Parquet Files

In [15]:
// Produce one count per column of the file (see the output below).
// `collect` returns a `Result`, which is displayed as-is -- hence the `Ok(...)`
// wrapper around the printed frame.
LazyFrame::scan_parquet("part-000.parquet", ScanArgsParquet::default())
    .unwrap()
    .count()
    .collect()

Ok(shape: (1, 4)
┌──────────┬──────────┬──────────┬─────────────┐
│ id0      ┆ id1      ┆ id2      ┆ score_r4_it │
│ ---      ┆ ---      ┆ ---      ┆ ---         │
│ u32      ┆ u32      ┆ u32      ┆ u32         │
╞══════════╪══════════╪══════════╪═════════════╡
│ 10498456 ┆ 10498456 ┆ 10498456 ┆ 10498456    │
└──────────┴──────────┴──────────┴─────────────┘)

In [12]:
// Per-column counts expressed through `select` on the `*` wildcard, with each
// count cast to `u64` (the output below shows `u64` columns).
LazyFrame::scan_parquet("part-000.parquet", ScanArgsParquet::default())
    .unwrap()
    .select(&[col("*").count().cast(DataType::UInt64)])
    .collect()

Ok(shape: (1, 4)
┌──────────┬──────────┬──────────┬─────────────┐
│ id0      ┆ id1      ┆ id2      ┆ score_r4_it │
│ ---      ┆ ---      ┆ ---      ┆ ---         │
│ u64      ┆ u64      ┆ u64      ┆ u64         │
╞══════════╪══════════╪══════════╪═════════════╡
│ 10498456 ┆ 10498456 ┆ 10498456 ┆ 10498456    │
└──────────┴──────────┴──────────┴─────────────┘)

In [16]:
// Extract the total number of rows as a plain `u64`.
//
// `len()` counts the rows of the frame itself. Counting a specific column
// (e.g. `col("id0").count()`) only counts that column's non-null values, so it
// would under-report the number of rows if the column contained nulls.
LazyFrame::scan_parquet("part-000.parquet", ScanArgsParquet::default())
    .unwrap()
    .select(&[len().cast(DataType::UInt64).alias("n")])
    .collect()
    .unwrap()["n"] // the single-row result column named by `alias("n")`
    .u64()
    .unwrap()
    .get(0) // first (and only) value of the column
    .unwrap()

10498456

In [17]:
// NOTE: as the output below shows, this evaluates to 1 rather than the number
// of rows -- presumably the literal is counted as a single value instead of
// being broadcast to the height of the frame (TODO confirm). Kept as a record
// of the pitfall.
LazyFrame::scan_parquet("part-000.parquet", ScanArgsParquet::default())
    .unwrap()
    .select(&[lit(1).count().cast(DataType::UInt64).alias("n")])
    .collect()
    .unwrap()["n"]
    .u64()
    .unwrap()
    .get(0)
    .unwrap()

1