# Chapter 17: Extending Polars

In [None]:
import polars as pl
pl.__version__  # The book is built with Polars version 1.20.0

## User-Defined Functions in Python

### Applying a Function to Elements

In [None]:
from textblob import TextBlob


def analyze_sentiment(review):
    """Return the TextBlob sentiment polarity of a single review text."""
    blob = TextBlob(review)
    return blob.sentiment.polarity


# Four short example reviews, one per row.
reviews = pl.DataFrame(
    dict(
        reviews=[
            "This product is great!",
            "Terrible service.",
            "Okay, but not what I expected.",
            "Excellent! I love it.",
        ]
    )
)

# Score each review with the Python function and attach the result as a
# new Float64 column.
reviews.with_columns(
    pl.col("reviews")
    .map_elements(analyze_sentiment, return_dtype=pl.Float64)
    .alias("sentiment_score")
)

In [None]:
# Small integer frame used to demonstrate map_elements with a trivial function.
ints = pl.DataFrame({"x": [1, 2, 3, 4]})


def add_one(x):
    """Return ``x + 1``."""
    return x + 1


# Apply add_one to the values of "x" and store the result as "x + 1".
# NOTE(review): this appears to be the call that the
# PolarsInefficientMapWarning remark after this cell refers to; keep the
# function body as-is so the demonstration still triggers it.
ints.with_columns(
    pl.col("x")
    .map_elements(
        add_one,
        return_dtype=pl.Int64,
    )
    .alias("x + 1")
)

# Raises a PolarsInefficientMapWarning

### Applying a Function to a Series

In [None]:
import polars.selectors as cs
from scipy.special import softmax

# Toy dataset: two feature columns plus a 0/1 label column.
ml_dataset = pl.DataFrame(
    dict(
        feature1=[0.3, 0.2, 0.4, 0.1, 0.2, 0.3, 0.5],
        feature2=[32, 50, 70, 65, 0, 10, 15],
        label=[1, 0, 1, 0, 1, 0, 0],
    )
)

# Keep the label as-is and pass each column whose name starts with
# "feature" through scipy's softmax as a whole Series.
ml_dataset.select(
    pl.col("label"),
    cs.starts_with("feature").map_batches(
        lambda series: softmax(series.to_numpy()),
        return_dtype=pl.Float64,
    ),
)

### Applying a Function to Groups

In [None]:
from sklearn.preprocessing import StandardScaler


def scale_temperature(group):
    """Add a ``scaled_feature`` column with the group's standardized temperatures.

    A fresh ``StandardScaler`` is fitted on the ``temperature`` column of
    ``group`` alone, and the flattened result is appended as a new column.
    """
    raw_values = group[["temperature"]].to_numpy()
    standardized = StandardScaler().fit_transform(raw_values)
    scaled_column = pl.Series(
        name="scaled_feature", values=standardized.flatten()
    )
    return group.with_columns(scaled_column)


# Temperature readings for two countries; scaling is applied per country.
temperatures = pl.DataFrame(
    {
        "country": ["USA", "USA", "USA", "USA", "NL", "NL", "NL"],
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# maintain_order=True keeps the groups in order of first appearance, so the
# displayed result is the same on every run; without it the order of the
# groups in the output is not guaranteed.
temperatures.group_by("country", maintain_order=True).map_groups(
    scale_temperature
)

In [None]:
# Temperature readings for two countries.
temperatures = pl.DataFrame(
    {
        "country": ["USA", "USA", "USA", "USA", "NL", "NL", "NL"],
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Iterating a GroupBy yields (key tuple, sub-frame) pairs.
# maintain_order=True makes the groups come out in order of first appearance,
# so the printed output is reproducible between runs.
for group, df in temperatures.group_by("country", maintain_order=True):
    print(f"{group[0]}:\n{df}\n")

In [None]:
from functools import lru_cache

from textblob import TextBlob


@lru_cache(maxsize=256)
def analyze_sentiment(review):
    """Return the TextBlob sentiment polarity of ``review``.

    Results are memoized by ``lru_cache`` (at most 256 entries), so a
    repeated review text is not analyzed again.
    """
    blob = TextBlob(review)
    return blob.sentiment.polarity


# Same four example reviews as before, one per row.
reviews = pl.DataFrame(
    dict(
        reviews=[
            "This product is great!",
            "Terrible service.",
            "Okay, but not what I expected.",
            "Excellent! I love it.",
        ]
    )
)

# Score each review through the cached function and attach the result as a
# new Float64 column.
reviews.with_columns(
    pl.col("reviews")
    .map_elements(analyze_sentiment, return_dtype=pl.Float64)
    .alias("sentiment_score")
)

### Applying a Function to an Expression

In [None]:
# Three example street addresses, each ending in a house number.
addresses = pl.DataFrame(
    dict(
        address=[
            "Nieuwezijds Voorburgwal 147",
            "Museumstraat 1",
            "Oosterdok 2",
        ]
    )
)


def extract_house_number(input_expr: pl.Expr) -> pl.Expr:
    """Build an expression that pulls the first run of digits out of a String
    expression and casts it to Int64."""
    first_digits = input_expr.str.extract(r"\d+", 0)
    return first_digits.cast(pl.Int64)


# Expr.pipe hands the column expression to the helper, which returns the
# extended expression.
addresses.with_columns(
    pl.col("address").pipe(extract_house_number).alias("house_numbers")
)

### Applying a Function to a DataFrame or LazyFrame

In [None]:
# Small frame with one integer and one float column.
small_numbers = pl.DataFrame({"ints": [2, 4, 6], "floats": [10.0, 20.0, 30.0]})


def scale_the_input(
    df: pl.DataFrame | pl.LazyFrame, scale_factor: int
) -> pl.DataFrame | pl.LazyFrame:
    """Multiply every column of the input by ``scale_factor``.

    Args:
        df: Eager or lazy frame whose columns should be scaled.
        scale_factor: Value every column is multiplied by.

    Returns:
        A frame of the same kind as ``df`` (DataFrame in, DataFrame out;
        LazyFrame in, LazyFrame out) with all columns scaled.
    """
    if isinstance(df, pl.LazyFrame):
        # LazyFrame has no arithmetic operators, so `df * scale_factor`
        # would raise; express the scaling as an expression instead.
        return df.select(pl.all() * scale_factor)
    # Eager path is unchanged from the original implementation.
    return df * scale_factor


small_numbers.pipe(scale_the_input, 5)

## Registering Your Own Namespace

In [None]:
@pl.api.register_expr_namespace("celsius")
class Celsius:
    """Expression namespace registered as ``celsius`` with temperature
    conversions for the wrapped expression."""

    def __init__(self, expr: pl.Expr):
        # The expression this namespace instance operates on.
        self._expr = expr

    def to_fahrenheit(self) -> pl.Expr:
        """Return the expression multiplied by 9, divided by 5, plus 32."""
        nine_fifths = self._expr * 9 / 5
        return nine_fifths + 32

    def to_kelvin(self) -> pl.Expr:
        """Return the expression shifted by 273.15."""
        kelvin = self._expr + 273.15
        return kelvin

In [None]:
# Celsius readings to convert through the custom `celsius` namespace.
temperatures = pl.DataFrame(dict(celsius=[0, 10, 20, 30, 40]))

temperatures.with_columns(
    pl.col("celsius").celsius.to_fahrenheit().alias("fahrenheit")
)

## Polars Plugins in Rust

### Prerequisites

In [None]:
# Print the version of the Rust compiler found on the PATH.
! rustc --version

### The Anatomy of a Plugin Project

### The Plugin

### Compiling the Plugin

In [None]:
# Run maturin (via uv) with the --release flag from the plugin's project directory.
! cd plugins/hello_world_plugin && uv run maturin develop --release

In [None]:
# Restart the kernel so that the newly built plugin becomes available.

# When run in IPython, the call below shuts the kernel down and restarts it
# automatically; all variables defined so far are lost.
get_ipython().kernel.do_shutdown(restart=True)

### Performance Benchmark

In [None]:
import polars as pl
from hello_world_func import hello_world  
import time

# 400,000 short strings to run both replacements on.
lots_of_strings = pl.DataFrame(
    {
        "a": ["1", "2", "3", "4"] * 100_000,
    }
)

# Baseline: Polars' native regex replacement, averaged over 10 runs.
# time.perf_counter() is used instead of time.time() because it is monotonic
# and has the highest available resolution, which matters for short timings.
times = []
for _ in range(10):
    t0 = time.perf_counter()
    out = lots_of_strings.with_columns(
        pl.col("a").str.replace_all(r".*", "Hello, world!")
    )
    t1 = time.perf_counter()
    times.append(t1 - t0)
print(
    f"Polars native string replace:        {sum(times) / len(times):.5f}"
)


# Same measurement for the custom plugin expression.
times = []
for _ in range(10):
    t0 = time.perf_counter()
    out = lots_of_strings.with_columns(hello_world("a"))
    t1 = time.perf_counter()
    times.append(t1 - t0)
print(f"Our custom made Hello world replace: {sum(times) / len(times):.5f}")

### Register Arguments

#### Working with multiple arguments as input

#### Other register arguments

### Using a Rust Crate

### Use Case: geo

#### Adding the geo crate

#### The Rust code

In [None]:
# Run maturin (via uv) with the --release flag from the plugin's project directory.
! cd plugins/polars_geo && uv run maturin develop --release

In [None]:
# Restart the kernel so that the newly built plugin becomes available.

# When run in IPython, the call below shuts the kernel down and restarts it
# automatically; all variables defined so far are lost.
get_ipython().kernel.do_shutdown(restart=True)

#### The Python code

#### Making the custom namespace

In [None]:
import polars as pl
# Three point/polygon pairs. Each point is a two-element list of floats
# (presumably [x, y] — confirm against the plugin); each polygon is a list of
# such pairs.
points_and_polygons = pl.DataFrame(
    {
        "point": [[5.0, 5.0], [20.0, 20.0], [20.0, 20.0]],
        "polygon": [
            # Four vertices; the first vertex is not repeated at the end.
            [[0.0, 0.0], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0]],
            # Only three vertices.
            [
                [0.0, 0.0],
                [10.0, 0.0],
                [10.0, 10.0],
            ],
            # First vertex contains a null coordinate (presumably to exercise
            # null handling in the plugin); the list ends with [0.0, 0.0].
            [[0.0, None], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0], [0.0, 0.0]],
        ],
    }
)

In [None]:
# The imported name is not used directly; NOTE(review): the import presumably
# registers the `geo` expression namespace used below — confirm in the plugin.
from plugins.polars_geo import polars_geo

# Call point_in_polygon on the "point" column, passing the "polygon" column,
# and store the result as "in_polygon".
points_and_polygons.with_columns(
    in_polygon=pl.col("point").geo.point_in_polygon(pl.col("polygon"))
)

## Takeaways