In [None]:
import marimo as mo


# Polars vs. Pandas: A Fast, Multi-Core Alternative for DataFrames

## Setup

In [None]:
import numpy as np
import pandas as pd

# Create a large synthetic dataset: a low-cardinality string column and a
# float column with values in [0, 1000).
# A seeded Generator makes the data (and every result derived from it)
# identical across "Restart & Run All" executions.
SEED = 42
rng = np.random.default_rng(SEED)

n_rows = 10_000_000
data = {
    "category": rng.choice(["A", "B", "C", "D"], size=n_rows),
    "value": rng.random(n_rows) * 1000,
}
pandas_df = pd.DataFrame(data)
pandas_df.head(10)

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>category</th>
      <th>value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>A</td>
      <td>677.265640</td>
    </tr>
    <tr>
      <th>1</th>
      <td>D</td>
      <td>504.105839</td>
    </tr>
    <tr>
      <th>2</th>
      <td>C</td>
      <td>744.468596</td>
    </tr>
    <tr>
      <th>3</th>
      <td>C</td>
      <td>463.784413</td>
    </tr>
    <tr>
      <th>4</th>
      <td>B</td>
      <td>536.412236</td>
    </tr>
    <tr>
      <th>5</th>
      <td>D</td>
      <td>226.222343</td>
    </tr>
    <tr>
      <th>6</th>
      <td>C</td>
      <td>151.146594</td>
    </tr>
    <tr>
      <th>7</th>
      <td>B</td>
      <td>606.978756</td>
    </tr>
    <tr>
      <th>8</th>
      <td>A</td>
      <td>122.935896</td>
    </tr>
    <tr>
      <th>9</th>
      <td>C</td>
      <td>797.514319</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
# Persist the frame as CSV (without the index column); the read benchmarks
# below load this same file with both libraries.
pandas_df.to_csv(path_or_buf="large_file.csv", index=False)

## 1. Reading Data Faster

### Pandas

In [None]:
import time

# Time an eager CSV load with pandas.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can jump.
start_read_pd = time.perf_counter()
df_pd = pd.read_csv("large_file.csv")
end_read_pd = time.perf_counter()
print(f"Pandas read_csv took {end_read_pd - start_read_pd:.2f} seconds")

Pandas read_csv took 1.28 seconds


### Polars

In [None]:
import polars as pl

# Time an eager CSV load with Polars.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can jump.
start_read_pl = time.perf_counter()
polars_df = pl.read_csv("large_file.csv")
end_read_pl = time.perf_counter()
print(f"Polars read_csv took {end_read_pl - start_read_pl:.2f} seconds")

Polars read_csv took 0.15 seconds


## 2. Lazy Evaluation (Only in Polars)

In [None]:
# Build the query lazily: filter, group and aggregate are only recorded as a
# query plan and are executed when .collect() is called.
lazy_polars_df = polars_df.lazy()
result = (
    lazy_polars_df.filter(pl.col("value") > 100)
    .group_by("category")
    .agg(pl.col("value").mean().alias("avg_value"))
    # group_by does not guarantee the order of groups, so sort to make the
    # displayed table identical from run to run.
    .sort("category")
    .collect()
)
result.head(10)

<div><style>
.dataframe > thead > tr,
.dataframe > tbody > tr {
  text-align: right;
  white-space: pre-wrap;
}
</style>
<small>shape: (4, 2)</small><table border="1" class="dataframe"><thead><tr><th>category</th><th>avg_value</th></tr><tr><td>str</td><td>f64</td></tr></thead><tbody><tr><td>&quot;A&quot;</td><td>549.864712</td></tr><tr><td>&quot;C&quot;</td><td>550.31467</td></tr><tr><td>&quot;D&quot;</td><td>550.046616</td></tr><tr><td>&quot;B&quot;</td><td>549.950649</td></tr></tbody></table></div>

## 3. Multi-Core Performance

In [None]:
# Build one frame per library from the same in-memory dict so the benchmarks
# below operate on identical data.
pandas_groupby_df = pd.DataFrame(data=data)
polars_groupby_df = pl.DataFrame(data=data)

### Groupby Mean

In [None]:
# Time a group-by mean in pandas; the result is discarded, only the duration matters.
# perf_counter() is the monotonic, high-resolution clock intended for benchmarking.
start_groupby_pd = time.perf_counter()
pandas_groupby_df.groupby("category")["value"].mean()
end_groupby_pd = time.perf_counter()
print(f"Pandas groupby took {end_groupby_pd - start_groupby_pd:.2f} seconds")

Pandas groupby took 0.51 seconds


In [None]:
# Time a group-by mean in Polars; the result is discarded, only the duration matters.
# perf_counter() is the monotonic, high-resolution clock intended for benchmarking.
start_groupby_pl = time.perf_counter()
polars_groupby_df.group_by("category").agg(pl.col("value").mean())
end_groupby_pl = time.perf_counter()
print(f"Polars groupby took {end_groupby_pl - start_groupby_pl:.2f} seconds")

Polars groupby took 0.12 seconds


### Filter Rows

In [None]:
# Time a boolean-mask row filter in pandas.
# The operation is short, so use perf_counter() (monotonic, high resolution)
# rather than the coarser wall clock.
start_filter_pd = time.perf_counter()
pandas_filtered_df = pandas_groupby_df[pandas_groupby_df["value"] > 500]
end_filter_pd = time.perf_counter()
print(f"Pandas filter took {end_filter_pd - start_filter_pd:.2f} seconds")

Pandas filter took 0.08 seconds


In [None]:
# Time an expression-based row filter in Polars.
# The operation is short, so use perf_counter() (monotonic, high resolution)
# rather than the coarser wall clock.
start_filter_pl = time.perf_counter()
polars_filtered_df = polars_groupby_df.filter(pl.col("value") > 500)
end_filter_pl = time.perf_counter()
print(f"Polars filter took {end_filter_pl - start_filter_pl:.2f} seconds")

Polars filter took 0.02 seconds


### Sort by Column

In [None]:
# Time sorting the full frame by the float column in pandas.
# perf_counter() is the monotonic, high-resolution clock intended for benchmarking.
start_sort_pd = time.perf_counter()
pandas_sorted_df = pandas_groupby_df.sort_values("value")
end_sort_pd = time.perf_counter()
print(f"Pandas sort took {end_sort_pd - start_sort_pd:.2f} seconds")

Pandas sort took 2.14 seconds


In [None]:
# Time sorting the full frame by the float column in Polars.
# perf_counter() is the monotonic, high-resolution clock intended for benchmarking.
start_sort_pl = time.perf_counter()
polars_sorted_df = polars_groupby_df.sort("value")
end_sort_pl = time.perf_counter()
print(f"Polars sort took {end_sort_pl - start_sort_pl:.2f} seconds")

Polars sort took 0.52 seconds


### Join on Key

In [None]:
# Two frames sharing a unique integer key; they are built outside the timed
# region so only the join itself is measured.
pandas_df1 = pd.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
pandas_df2 = pd.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
# perf_counter() is the monotonic, high-resolution clock intended for benchmarking.
start_join_pd = time.perf_counter()
# how="inner" is the pandas default; it is spelled out to mirror the Polars join.
pandas_joined_df = pd.merge(pandas_df1, pandas_df2, on="key", how="inner")
end_join_pd = time.perf_counter()
print(f"Pandas join took {end_join_pd - start_join_pd:.2f} seconds")

Pandas join took 0.06 seconds


In [None]:
# Two frames sharing a unique integer key; they are built outside the timed
# region so only the join itself is measured.
polars_df1 = pl.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
polars_df2 = pl.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
# perf_counter() is the monotonic, high-resolution clock intended for benchmarking.
start_join_pl = time.perf_counter()
polars_joined_df = polars_df1.join(polars_df2, on="key", how="inner")
end_join_pl = time.perf_counter()
print(f"Polars join took {end_join_pl - start_join_pl:.2f} seconds")

Polars join took 0.06 seconds


## 4. Syntax Comparison

### Filtering rows

In [None]:
# pandas: keep the rows where the boolean mask is True.
pandas_filtered_rows_df = pandas_groupby_df.loc[pandas_groupby_df["value"] > 100]

In [None]:
# Polars: the predicate is an expression on the column, evaluated by filter().
polars_filtered_rows_df = polars_groupby_df.filter(pl.col("value").gt(100))

### Selecting columns

In [None]:
# pandas: select a subset of columns by label (all rows kept).
pandas_selected_columns_df = pandas_groupby_df.loc[:, ["category", "value"]]

In [None]:
# Polars: select a subset of columns by name.
polars_selected_columns_df = polars_groupby_df.select("category", "value")

### Chained operations

In [None]:
# Filter -> group -> aggregate written as a single method chain.
# The threshold must lie inside the data range: `value` is generated as
# rand * 1000, i.e. in [0, 1000), so a cutoff of 1000 would always produce an
# empty result. 100 matches the filter used in the lazy-evaluation example.
pandas_chained_operations_df = (
    pandas_groupby_df.loc[pandas_groupby_df["value"] > 100]
    .groupby("category")["value"]
    .mean()
    .reset_index()
)

In [None]:
# Filter -> group -> aggregate written as a single method chain.
# The threshold must lie inside the data range: `value` is generated as
# rand * 1000, i.e. in [0, 1000), so a cutoff of 1000 would always produce an
# empty result. 100 matches the filter used in the lazy-evaluation example.
polars_chained_operations_df = (
    polars_groupby_df.filter(pl.col("value") > 100)
    .group_by("category")
    .agg(pl.col("value").mean().alias("avg_value"))
)

## 5. Memory Efficiency

In [None]:
# Report the in-memory footprint of the same data in each library, in
# decimal megabytes (bytes / 1e6), rounded to two decimals.
# deep=True asks pandas to also count the memory held by object-dtype values
# (e.g. Python strings) instead of only the references to them.
print(
    f"Pandas DataFrame memory usage: {pandas_groupby_df.memory_usage(deep=True).sum() / 1e6:.2f} MB"
)
print(
    f"Polars DataFrame estimated size: {polars_groupby_df.estimated_size() / 1e6:.2f} MB"
)

Pandas DataFrame memory usage: 660.000132 MB
Polars DataFrame estimated size: 90.0 MB
