In [1]:
import io
import polars as pl

# Eager DataFrame holding the same three rows as the CSV sample returned by
# get_csv_text(); used to demonstrate DataFrame.lazy().
data = {
    "col1": [1, 2, 3],
    "col2": ["x", "y", "z"],
}
df = pl.DataFrame(data)


def get_csv_text():
    """Return a new in-memory text stream containing the sample CSV.

    The stream holds a ``col1,col2`` header followed by three data rows.
    A fresh ``io.StringIO`` is built on every call, so each caller gets its
    own stream positioned at the start.
    """
    return io.StringIO("col1,col2\n1,x\n2,y\n3,z\n")

## 1. How to construct a LazyFrame

### scan_*()

In [2]:
# scan_csv() yields a LazyFrame; printing it shows the (naive) query plan
# rather than the data.
print(pl.scan_csv(get_csv_text()))

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

Csv SCAN [22 in-mem bytes]
PROJECT */2 COLUMNS


### pl.DataFrame.lazy()

In [3]:
# An existing eager DataFrame can be turned into a LazyFrame with .lazy();
# printing it again shows a query plan, with the DataFrame as the source.
print(df.lazy())

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

DF ["col1", "col2"]; PROJECT */2 COLUMNS


## 2. Materialize

In [4]:
# .collect() executes the lazy query and returns the resulting DataFrame.
print(pl.scan_csv(get_csv_text()).collect())

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ str  │
╞══════╪══════╡
│ 1    ┆ x    │
│ 2    ┆ y    │
│ 3    ┆ z    │
└──────┴──────┘


## 3. Query plan + optimizations + type check

In [5]:
# Lazy query: upper-case col2, then keep only the rows where col1 <= 2.
q1 = (
    pl.scan_csv(get_csv_text())
    .with_columns(pl.col("col2").str.to_uppercase())
    .filter(pl.col("col1") <= 2)
)

# explain() with no arguments renders the optimized plan as text.
print(q1.explain())

 WITH_COLUMNS:
 [col("col2").str.uppercase()] 
  Csv SCAN [22 in-mem bytes]
  PROJECT */2 COLUMNS
  SELECTION: [(col("col1")) <= (2)]


In [6]:
# Run q1: only the rows with col1 <= 2 remain, with col2 upper-cased.
print(q1.collect())

shape: (2, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ str  │
╞══════╪══════╡
│ 1    ┆ X    │
│ 2    ┆ Y    │
└──────┴──────┘


In [7]:
# optimized=False prints the plan as written: the FILTER sits on top of
# WITH_COLUMNS. Compare with the default explain() output, where the same
# predicate appears as a SELECTION attached to the Csv SCAN.
print(q1.explain(optimized=False))

FILTER [(col("col1")) <= (2)]
FROM
   WITH_COLUMNS:
   [col("col2").str.uppercase()] 
    Csv SCAN [22 in-mem bytes]
    PROJECT */2 COLUMNS


In [8]:
# ❌ Raises when uncommented:
# SchemaError: invalid series dtype: expected `String`, got `i64` for series with name `col1`
#
# `.str.to_uppercase()` is applied to `col1`, which is an integer (i64)
# column, so the query fails the type check. The code is left commented out,
# presumably so the notebook still runs from top to bottom.

# (
#     pl.scan_csv(get_csv_text())
#     .with_columns(pl.col("col1").str.to_uppercase())
#     .collect()
# )

## 4. pl.collect_all()

In [9]:
# Two lazy queries that branch off the same scan.
lf1 = pl.scan_csv(get_csv_text())
lf1_a = lf1.filter(pl.col("col1") <= 2)  # rows with col1 <= 2
lf1_b = lf1.select("col2")  # only the col2 column

# collect_all() materializes both queries and returns a list of DataFrames
# in the same order as the input list.
print(pl.collect_all([lf1_a, lf1_b]))

[shape: (2, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ str  │
╞══════╪══════╡
│ 1    ┆ x    │
│ 2    ┆ y    │
└──────┴──────┘, shape: (3, 1)
┌──────┐
│ col2 │
│ ---  │
│ str  │
╞══════╡
│ x    │
│ y    │
│ z    │
└──────┘]


## 5. Limitations

In [10]:
# pivot() is called on an eager DataFrame here: the lazy query is first
# materialized with .collect(), pivoted, and then wrapped back into a
# LazyFrame with .lazy() so that the final filter is part of a lazy query again.
# NOTE(review): this round trip assumes pivot is not offered on LazyFrame in
# the Polars version in use — confirm against the installed version.
print(
    pl.LazyFrame({"id": ["a", "b", "c"], "month": ["jan", "feb", "mar"]})
    .with_columns(values=pl.Series([0, 1, 2]))
    .collect()
    .pivot(
        index="id", on="month", values="values", aggregate_function="first"
    )
    .lazy()
    .filter(pl.col("jan").is_null())
    .collect()
)

shape: (2, 4)
┌─────┬──────┬──────┬──────┐
│ id  ┆ jan  ┆ feb  ┆ mar  │
│ --- ┆ ---  ┆ ---  ┆ ---  │
│ str ┆ i64  ┆ i64  ┆ i64  │
╞═════╪══════╪══════╪══════╡
│ b   ┆ null ┆ 1    ┆ null │
│ c   ┆ null ┆ null ┆ 2    │
└─────┴──────┴──────┴──────┘


## 6. Use case

In [11]:
# Eager version: read_csv() loads the CSV into a DataFrame first, and only
# then does head(1) keep the first row.
print(pl.read_csv(get_csv_text()).head(1))

shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ str  │
╞══════╪══════╡
│ 1    ┆ x    │
└──────┴──────┘


In [12]:
# Better: head(1) is part of the lazy plan, so the row limit is known before
# collect() runs. Presumably this lets Polars avoid reading the whole CSV
# (see the Polars docs on slice pushdown); the result is the same either way.
print(pl.scan_csv(get_csv_text()).head(1).collect())

shape: (1, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ str  │
╞══════╪══════╡
│ 1    ┆ x    │
└──────┴──────┘
