# Chapter 18: Polars Internals

In [None]:
import polars as pl
pl.__version__  # The book is built with Polars version 1.20.0

## Polars' Architecture

## Arrow

## Multithreaded Computations and SIMD Operations

## The String Data Type in Memory

## ChunkedArrays in Series

## Query Optimization

### LazyFrame Scan-Level Optimizations

In [None]:
# Build a LazyFrame over every Parquet file matching the glob.
taxis = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")

# Draw the query plan for a query that selects only the trip_distance column.
(
    taxis
    .select(pl.col("trip_distance"))
    .show_graph()
)

In [None]:
# Draw the query plan for a query that keeps only rows with trip_distance > 10.
(
    taxis
    .filter(pl.col("trip_distance") > 10)
    .show_graph()
)

In [None]:
# Execute the lazy query, asking only for the first two rows of the taxi data.
taxis.head(2).collect()

### Other Optimizations

In [None]:
# One integer column holding 10, 20, ..., 60.
values = pl.LazyFrame({"value": list(range(10, 70, 10))})

# Shared upstream step: both branches below are derived from this same plan.
common_subplan = values.with_columns(pl.col("value") * 2)

# Two different projections on top of the shared step.
branch1 = common_subplan.select((pl.col("value") * 4).alias("value2"))
branch2 = common_subplan.select((pl.col("value") * 2).alias("value3"))

# Concatenate the branches and draw the plan as written, without optimization.
combined = pl.concat([branch1, branch2])
combined.show_graph(optimized=False)

In [None]:
# Draw the plan for `combined` again, this time with show_graph's default
# settings (no optimized=False), for comparison with the graph above.
combined.show_graph()

In [None]:
# Small lazy table with one weight and one length measurement per row.
bmi = pl.LazyFrame(
    {
        "weight_kg": [70, 80, 60, 90],
        "length_cm": [175, 180, 160, 190],
    }
)

In [None]:
# Four separate with_columns calls, kept separate on purpose. Note that
# weight_kg_average is first set to the literal 0 and then overwritten with
# the mean of weight_kg by the last call.
bmi = (
    bmi
    .with_columns(
        (pl.col("weight_kg") / pl.col("length_cm")).alias("weight_per_cm")
    )
    .with_columns(pl.lit(0).alias("weight_kg_average"))
    .with_columns((pl.col("length_cm") / 100).alias("length_m"))
    .with_columns(pl.col("weight_kg").mean().alias("weight_kg_average"))
)

In [None]:
# The same three derived columns, expressed in a single with_columns call.
bmi = bmi.with_columns(
    (pl.col("weight_kg") / pl.col("length_cm")).alias("weight_per_cm"),
    pl.col("weight_kg").mean().alias("weight_kg_average"),
    (pl.col("length_cm") / 100).alias("length_m"),
)

## Checking Your Expressions

### meta Namespace Overview

### meta Namespace Examples

In [None]:
# One column reference and one literal to inspect with the meta namespace.
expr1 = pl.col("name")
expr2 = pl.lit("constant")

# Ask each expression whether it is a plain column reference.
for candidate in (expr1, expr2):
    print(f"Is {candidate} a column: {candidate.meta.is_column()}")

In [None]:
# Ask the same two expressions whether they are literals.
for candidate in (expr1, expr2):
    print(f"Is {candidate} a literal: {candidate.meta.is_literal()}")

In [None]:
# An arithmetic expression without an alias, and a column with an alias.
expr1 = pl.col("age") * 2
expr2 = pl.col("name").alias("username")

# Print the name each expression's output column would get.
for candidate in (expr1, expr2):
    print(f"{candidate} output name: {candidate.meta.output_name()}")

In [None]:
# An aliased arithmetic expression: the "age" column times two, named "double_age".
expr = (pl.col("age") * 2).alias("double_age")

# Draw this single expression as a graph via the meta namespace.
expr.meta.show_graph()

In [None]:
# Column "original_name" given the alias "new_name".
expr = pl.col("original_name").alias("new_name")

# Ask the meta namespace for a version of the expression with aliases undone.
original_expr = expr.meta.undo_aliases()

# Show the output name of the un-aliased expression.
original_expr.meta.output_name()

In [None]:
# Column "origin" given the alias "destination".
expr = pl.col("origin").alias("destination")

# Show the root column names the expression is built from.
expr.meta.root_names()

## Profiling Polars

In [None]:
# Lazy pipeline over the taxi Parquet files:
#   1. keep trips with trip_distance > 10,
#   2. keep only the three columns needed,
#   3. sum distance and amount per VendorID,
#   4. sort vendors by total distance, largest first.
long_distance_taxis_per_vendor_sorted = (
    pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
    .filter(pl.col("trip_distance") > 10)
    .select(
        pl.col("VendorID"),
        pl.col("trip_distance"),
        pl.col("total_amount"),
    )
    .group_by("VendorID")
    .agg(
        pl.col("trip_distance").sum().alias("total_distance"),
        pl.col("total_amount").sum().alias("total_amount"),
    )
    .sort("total_distance", descending=True)
)

# Draw the query plan of the pipeline before running it.
long_distance_taxis_per_vendor_sorted.show_graph()

In [None]:
# profile() returns a pair, unpacked here into the query result and the
# accompanying profiling information.
result, profiling_info = long_distance_taxis_per_vendor_sorted.profile()

In [None]:
# Display the first element returned by profile(): the query result.
result

In [None]:
# Display the second element returned by profile(): the profiling information.
profiling_info

In [None]:
# Profile the query again, this time requesting a plot of the profile
# (show_plot=True) drawn at a 15 x 5 figure size.
long_distance_taxis_per_vendor_sorted.profile(show_plot=True, figsize=(15, 5))

## Tests in Polars

### Comparing DataFrames and Series

In [None]:
from polars.testing import (
    assert_series_equal,
    assert_frame_equal,
    assert_series_not_equal,
    assert_frame_not_equal,
)

In [None]:
# Reference frame: a single float column "a".
floats = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})

# Same data except for the first value, which is 1.001 instead of 1.0.
different_floats = pl.DataFrame({"a": [1.001, 2.0, 3.0, 4.0]})

In [None]:
# This raises an AssertionError:
# assert_frame_equal(floats, different_floats)

In [None]:
# Compare with a 1% relative tolerance so the 0.001 difference in the first
# value of column "a" is accepted.
# The keyword is `rtol` in the Polars version this notebook targets (1.20.0);
# `rel_tol` is not a parameter there and raises a TypeError. Newer releases
# that introduced `rel_tol` are expected to still accept `rtol` as a
# deprecated alias, so this spelling is meant to work on both.
assert_frame_equal(floats, different_floats, rtol=0.01)
print("The DataFrames are equal.")

In [None]:
# Actual frame: two integer columns, both cast down to 8-bit integers.
result = pl.DataFrame({"a": [1, 3], "b": [2, 4]}).cast(
    {"a": pl.Int8, "b": pl.Int8}
)

# Expected frame, rebuilt from its printed table representation.
expected = pl.from_repr(
    """
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i8  ┆ i8  │
╞═════╪═════╡
│ 1   ┆ 2   │
│ 3   ┆ 4   │
└─────┴─────┘
    """
)

# Raises AssertionError if the two frames differ; otherwise report success.
assert_frame_equal(result, expected)
print("DataFrames are equal")

## Common Antipatterns

### Using Brackets for Column Selection

### Misusing Collect

In [None]:
%%time
# Timed example for the "Misusing Collect" section: collect() is called twice,
# once per vendor, so two separate queries are executed against the files.
taxis = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
vendor0 = taxis.filter(pl.col("VendorID") == 0).collect()
vendor1 = taxis.filter(pl.col("VendorID") == 1).collect()

In [None]:
%%time
# Timed alternative: collect() is called once for both vendors, and the
# per-vendor split is then done on the already-collected result.
taxis = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
vendors = taxis.filter(pl.col("VendorID").is_in([0, 1])).collect()
vendor0 = vendors.filter(pl.col("VendorID") == 0)
vendor1 = vendors.filter(pl.col("VendorID") == 1)

### Using Python Code in Your Polars Queries

## Takeaways