# Chapter 18: Polars Internals

In [1]:
import polars as pl
# Print the versions in use so they can be checked against the book's build.
pl.show_versions()  # The book is built with Polars version 1.0.0

## Arrow

## Multi-Threaded Computations and SIMD Operations

## The String Data Type in Memory

## ChunkedArrays in Series

## Query Optimization

### LazyFrame Scan Level Optimizations

In [8]:
# Lazily reference every taxi Parquet file that matches the glob pattern.
lf = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
# Draw the query plan for a query that selects a single column.
lf.select("trip_distance").show_graph()

In [9]:
# Draw the query plan for a query that keeps only trips longer than 10.
lf.filter(pl.col("trip_distance").gt(10)).show_graph()

In [10]:
# Materialize just the first two rows of the lazy scan.
lf.head(n=2).collect()

### Other Optimizations

In [12]:
# Small two-column LazyFrame: "foo" holds 1..5 and "bar" holds 6..10.
lazy_df = pl.LazyFrame(
    {
        "foo": list(range(1, 6)),
        "bar": list(range(6, 11)),
    }
)

In [13]:
# Shared upstream step: replace "foo" with its doubled value.
common_subplan = lazy_df.with_columns(pl.col("foo").mul(2))

# Two separate queries branch off the same subplan
expr1 = common_subplan.filter(pl.col("foo").mul(2).gt(4))
expr2 = common_subplan.filter(pl.col("foo").mul(2).lt(8))

# Stack both branches back into a single query.
result = pl.concat([expr1, expr2])

# Draw the plan twice for comparison: first with optimization switched off,
# then with show_graph()'s default settings.
result.show_graph(optimized=False)
result.show_graph()

In [14]:
# Adds the derived columns one with_columns() call at a time.
# NOTE(review): `df` is not defined in the cells visible here — presumably it
# is created earlier with "weight_kg" and "length_cm" columns; confirm.
df = (
    df.lazy()
    .with_columns(weight_per_cm=pl.col("weight_kg") / pl.col("length_cm"))
    # Placeholder literal; overwritten by the mean two steps further down.
    .with_columns(weight_kg_average=pl.lit(0))
    .with_columns(length_m=pl.col("length_cm") / 100)
    .with_columns(weight_kg_average=pl.col("weight_kg").mean())
)

In [15]:
# Declare all three derived columns in a single with_columns() call.
# NOTE(review): `df` is not defined in the cells visible here — confirm it
# exists with "weight_kg" and "length_cm" columns before this cell runs.
df = df.lazy().with_columns(
    weight_per_cm=pl.col("weight_kg").truediv(pl.col("length_cm")),
    weight_kg_average=pl.col("weight_kg").mean(),
    length_m=pl.col("length_cm").truediv(100),
)

## Profiling Polars

In [17]:
# Build the lazy query as one chain: keep trips longer than 10, narrow to
# three columns, total distance and amount per vendor, then sort by distance.
lf = (
    pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
    .filter(pl.col("trip_distance").gt(10))
    .select("VendorID", "trip_distance", "total_amount")
    .group_by("VendorID")
    .agg(
        total_distance=pl.col("trip_distance").sum(),
        total_amount=pl.col("total_amount").sum(),
    )
    .sort("total_distance", descending=True)
)

lf.show_graph()

In [18]:
# Profile the query built above; the cell displays whatever profile() returns.
lf.profile()

In [19]:
# Profile again, this time requesting a plot with a 15x5 figure size.
lf.profile(
    show_plot=True,
    figsize=(15, 5),
)

## Tests in Polars

### Comparing DataFrames and Series

In [22]:
import polars as pl
from polars.testing import (
  assert_series_equal,
  assert_frame_equal,
  assert_series_not_equal,
  assert_frame_not_equal
)

In [23]:
# Two single-column float frames whose first value differs by 0.001.
df1 = pl.DataFrame({
    'a': [1.0, 2.0, 3.0, 4.0],
})

df2 = pl.DataFrame({
    'a': [1.001, 2.0, 3.0, 4.0],
})

# This comparison, made without any explicit tolerance, is the failing case
# being demonstrated. Catch the AssertionError and print its message so the
# mismatch is still shown without aborting a "Restart Kernel -> Run All".
try:
    assert_frame_equal(df1, df2)
except AssertionError as exc:
    print(exc)

In [24]:
# Repeat the comparison, this time with a relative tolerance of 1%.
assert_frame_equal(left=df1, right=df2, rtol=1e-2)
print("The frames are equal.")

In [25]:
# Actual frame: two integer columns, both cast to Int8.
result = pl.DataFrame({"a": [1, 3], "b": [2, 4]}).cast(
    {"a": pl.Int8, "b": pl.Int8}
)

# Expected frame, parsed from the printed table representation.
expected = pl.from_repr(
    """
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i8  ┆ i8  │
╞═════╪═════╡
│ 1   ┆ 2   │
│ 3   ┆ 4   │
└─────┴─────┘
    """
)

assert_frame_equal(result, expected)
print("Frames are equal!")

## Common Anti-patterns

### Using Brackets for Column Selection

### Misusing `collect()`

In [29]:
%%time
lf = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
vendor0 = lf.filter(pl.col("VendorID") == 0).collect()
vendor1 = lf.filter(pl.col("VendorID") == 1).collect()

In [30]:
%%time
lf = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
df = lf.filter(pl.col("VendorID").is_in([0,1])).collect()
vendor0 = df.filter(pl.col("VendorID") == 0)
vendor1 = df.filter(pl.col("VendorID") == 1)

### Using Python Code in Your Polars Queries

## Takeaways