- Author: Benjamin Du
- Date: 2021-12-04 20:31:38
- Modified: 2021-12-04 20:31:38
- Title: Hands on the polars Crate in Rust
- Slug: hands-on-polars-in-rust
- Category: Computer Science
- Tags: Computer Science, programming, Rust, Polars, DataFrame

**Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!**

## [polars](http://www.legendu.net/misc/blog/tips-on-polars)   

Polars is a blazingly fast DataFrame library implemented in Rust that uses Apache Arrow as its memory model.

1. Polars supports multithreading and lazy computation.
2. Polars CANNOT handle data larger than memory at this time
    (even though this might change in the future).

In [4]:
:dep polars = { version = "0.21.1", features = ["lazy", "parquet"] }

In [26]:
use polars::prelude::*;
use polars::df;

In [31]:
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
use polars::prelude::*;
use polars::datatypes::DataType;

In [26]:
// Scan the Parquet file through the lazy API, then materialize it into `df` with `collect()`.
// Each `?` propagates a failure from the scan or from the collect step.
// `df` is declared `mut` because later cells modify it in place (e.g. `df.apply`).
let mut df = LazyFrame::scan_parquet(
        "test_data_11_dedup/part-00000-da5cd2f7-7f35-41ba-8660-0441ae9e0d8d-c000.snappy.parquet".into(),
        ScanArgsParquet::default(),
    )?
    .collect()?;
// Trailing expression: the notebook displays the frame.
df

shape: (4002557, 4)
┌──────┬──────────────────┬──────────────────┬─────────────┐
│ id0  ┆ id1              ┆ id2              ┆ score_r4_it │
│ ---  ┆ ---              ┆ ---              ┆ ---         │
│ i64  ┆ i64              ┆ i64              ┆ f64         │
╞══════╪══════════════════╪══════════════════╪═════════════╡
│ 2    ┆ 16796161         ┆ 4503599635760400 ┆ 1.0705      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16   ┆ 8293             ┆ 3288727552       ┆ -0.085568   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2    ┆ 4503599660926465 ┆ 28684            ┆ 4.779815    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4503599627386947 ┆ 274911465616     ┆ 4.650999    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...  ┆ ...              ┆ ...              ┆ ...         │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 68727866378      ┆ 4224323673915392 ┆ 11.800134   │
├╌╌╌

## DataFrame.apply

In [27]:
/// Cast a column to the `UInt64` data type and return the converted column.
///
/// Used in this notebook as the callback handed to `DataFrame::apply`.
///
/// # Panics
///
/// Panics if the cast returns an error.
fn as_u64(id: &Series) -> Series {
    let target = DataType::UInt64;
    let converted = id.cast(&target);
    converted.unwrap()
}

In [29]:
// Replace each id column in place with the result of `as_u64`.
// `apply` returns a `Result`; propagate a failure (e.g. an unknown column name)
// with `?` instead of silently discarding it.
df.apply("id0", as_u64)?;
df.apply("id1", as_u64)?;
df.apply("id2", as_u64)?;
// Trailing expression: the notebook displays the updated frame.
df

shape: (4002557, 4)
┌──────┬──────────────────┬──────────────────┬─────────────┐
│ id0  ┆ id1              ┆ id2              ┆ score_r4_it │
│ ---  ┆ ---              ┆ ---              ┆ ---         │
│ u64  ┆ u64              ┆ u64              ┆ f64         │
╞══════╪══════════════════╪══════════════════╪═════════════╡
│ 2    ┆ 16796161         ┆ 4503599635760400 ┆ 1.0705      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16   ┆ 8293             ┆ 3288727552       ┆ -0.085568   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2    ┆ 4503599660926465 ┆ 28684            ┆ 4.779815    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4503599627386947 ┆ 274911465616     ┆ 4.650999    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...  ┆ ...              ┆ ...              ┆ ...         │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 68727866378      ┆ 4224323673915392 ┆ 11.800134   │
├╌╌╌

In [36]:
// Create (or truncate) the output file; panics with the given message if that fails.
let f = File::create("j.parquet").expect("Unable to create file");
// Buffer writes to the file. The buffered writer is moved straight into
// `ParquetWriter::new`, so the binding does not need to be `mut`.
let bfw = BufWriter::new(f);
// Configure a Parquet writer that compresses its output with Snappy.
let pw = ParquetWriter::new(bfw).with_compression(ParquetCompression::Snappy);

In [38]:
// Write `df` out through the Parquet writer configured above.
// `finish` returns a `Result`; propagate a write failure with `?` rather than ignoring it.
pw.finish(&mut df)?;

## Loop Through Rows

In [19]:
{
    // Borrow the frame's columns; the surrounding block keeps this borrow local to the cell.
    let columns = df.get_columns();
    // Print at most the first 5 rows. Clamp to the frame's height so a frame
    // with fewer than 5 rows is never indexed past its end.
    let n_rows = df.height().min(5);
    for i in 0..n_rows {
        print!("{i}: ");
        // Emit the i-th value of every column, in column order, on one line.
        for s in columns {
            print!("{:?} ", s.get(i));
        }
        println!();
    }
}

0: Int64(2) Int64(16796161) Int64(4503599635760400) Float64(1.0705899035734592) 
1: Int64(16) Int64(8293) Int64(3288727552) Float64(-0.08556843043513492) 
2: Int64(2) Int64(4503599660926465) Int64(28684) Float64(4.779815249979719) 
3: Int64(4) Int64(4503599627386947) Int64(274911465616) Float64(4.650999108662172) 
4: Int64(1) Int64(4194968) Int64(549822931264) Float64(0.5104124463171542) 


()

## References 

- [polars](https://github.com/pola-rs/polars)

- [Polars Eager cookbook](https://docs.rs/polars/latest/polars/docs/eager/index.html)