# Eager and Lazy APIs

In [20]:
import polars as pl

In [21]:
# %%time
# trips = pl.read_parquet("data/taxi/yellow_tripdata_*.parquet")
# sum_per_vendor=trips.group_by("VendorID").sum()

# income_per_distance_per_vendor = sum_per_vendor.select(
#     "VendorID",
#     income_per_distance = pl.col("total_amount")/pl.col("trip_distance"),
#     )

# top_three = income_per_distance_per_vendor.sort(
#     by = "income_per_distance", descending=True
# ).head(3)

# top_three

haha looks like it was too much data and it killed the kernal in the dev container :D

In [22]:
%%time
trips = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
sum_per_vendor=trips.group_by("VendorID").sum()

income_per_distance_per_vendor = sum_per_vendor.select(
    "VendorID",
    income_per_distance = pl.col("total_amount")/pl.col("trip_distance"),
    )

top_three = income_per_distance_per_vendor.sort(
    by = "income_per_distance", descending=True
).head(3)

top_three.collect()

CPU times: user 1.52 s, sys: 391 ms, total: 1.91 s
Wall time: 596 ms


VendorID,income_per_distance
i64,f64
1,6.434789
6,5.296493
5,4.731557


very cool the cell above can actually run in the dinky little container, not true for the eager api.
there are 12 parquet files, each one ~ around 30-50mb. so ltos of data. lets actually see how many rows that is...

In [23]:
schema = trips.collect_schema()
schema

Schema([('VendorID', Int64),
        ('tpep_pickup_datetime', Datetime(time_unit='ns', time_zone=None)),
        ('tpep_dropoff_datetime', Datetime(time_unit='ns', time_zone=None)),
        ('passenger_count', Float64),
        ('trip_distance', Float64),
        ('RatecodeID', Float64),
        ('store_and_fwd_flag', String),
        ('PULocationID', Int64),
        ('DOLocationID', Int64),
        ('payment_type', Int64),
        ('fare_amount', Float64),
        ('extra', Float64),
        ('mta_tax', Float64),
        ('tip_amount', Float64),
        ('tolls_amount', Float64),
        ('improvement_surcharge', Float64),
        ('total_amount', Float64),
        ('congestion_surcharge', Float64),
        ('airport_fee', Float64)])

In [24]:

len(schema)

19

In [25]:

trips.select(pl.len()).collect().item()

39656098

goodness thats a pretty big data set -  39million rows x 19 columns

## Tips and Trick

lazy to dataframe and back again

In [26]:
# nl = trips.collect()
# nl

# back again
# nl = nl.lazy()
# nl

In [27]:
# trips.head(3).sink_parquet("sink.parquet")


# readthis = pl.read_parquet("sink.parquet")
# readthis

# readthis = pl.scan_parquet("sink.parquet")
# readthis

joing a dataframe with a lazyframe

In [28]:
big_sales_data = pl.LazyFrame(
    {"sale_id": [101, 102, 103], "amount": [250, 150, 300]}
)

sales_metadata = pl.DataFrame(
    {"sale_id": [101, 102, 103], "category": ["A", "B", "A"]}
)

# expect error
# big_sales_data.join(sales_metadata, on="sale_id").collect()

# instead you can make the lazyframe a df and then join or make the df lazy and append. opt for converting to lazy

# works
big_sales_data.join(sales_metadata.lazy(), on="sale_id").collect()

sale_id,amount,category
i64,i64,str
101,250,"""A"""
102,150,"""B"""
103,300,"""A"""


caching intermittent results

In [None]:
lf = pl.LazyFrame(
    {
        "col1": [1, 2, 3],
        "col2": [4, 5, 6],
    }
)

# pretend there is some heavy calculations

# now cache the results

lf = lf.collect().lazy()
print(lf.collect())

# the line below this would've recalc lf from scratch had it not been cached
print(lf.with_columns(pl.col("col1"), + 1).collect())

shape: (3, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ 4    │
│ 2    ┆ 5    │
│ 3    ┆ 6    │
└──────┴──────┘
shape: (3, 3)
┌──────┬──────┬─────────┐
│ col1 ┆ col2 ┆ literal │
│ ---  ┆ ---  ┆ ---     │
│ i64  ┆ i64  ┆ i32     │
╞══════╪══════╪═════════╡
│ 1    ┆ 4    ┆ 1       │
│ 2    ┆ 5    ┆ 1       │
│ 3    ┆ 6    ┆ 1       │
└──────┴──────┴─────────┘
