# Chapter 17: Extending Polars

In [1]:
import polars as pl
pl.show_versions()  # Print the installed versions for reproducibility; the book is built with Polars version 1.0.0

## User Defined Functions in Python

In [3]:
from textblob import TextBlob

def analyze_sentiment(review):
    """Return the TextBlob sentiment polarity of a single review string."""
    blob = TextBlob(review)
    polarity = blob.sentiment.polarity
    return polarity

# A handful of free-text reviews to score.
df = pl.DataFrame(
    {
        "reviews": [
            "This product is great!",
            "Terrible service.",
            "Okay, but not what I expected.",
            "Excellent! I love it.",
        ]
    }
)

# Apply the Python UDF to every review; the keyword name becomes the
# name of the new column.
df = df.with_columns(
    sentiment_score=pl.col("reviews").map_elements(
        analyze_sentiment,
        return_dtype=pl.Float64,
    )
)
df

In [4]:
# Single integer column used to demonstrate an element-wise UDF.
df = pl.DataFrame({"x": [1, 2, 3, 4]})

def add_one(x):
    """Return ``x`` incremented by one."""
    incremented = x + 1
    return incremented

# Call add_one on each value of "x" and show the result next to the input.
df.with_columns(
    pl.col("x").map_elements(add_one, return_dtype=pl.Int64).alias("x + 1")
)

In [5]:
from functools import lru_cache


# Same shape as before, but with repeated values in "x".
df = pl.DataFrame({"x": [1, 1, 3, 3]})

@lru_cache(maxsize=None)
def add_one(x):
    """Return ``x`` incremented by one, memoizing results per distinct ``x``.

    The cache is unbounded (``maxsize=None``), so ``x`` must be hashable.
    """
    incremented = x + 1
    return incremented

# "x" contains repeated values, so the cached add_one can reuse the
# results it has already computed for a given input.
df.with_columns(
    pl.col("x").map_elements(add_one, return_dtype=pl.Int64).alias("x + 1")
)

In [6]:
import polars.selectors as cs
import numpy as np
from scipy.special import softmax

# Two feature columns plus a label column.
df = pl.DataFrame(
    {
        "feature1": [0.3, 0.2, 0.4, 0.1, 0.2, 0.3, 0.5],
        "feature2": [32, 50, 70, 65, 0, 10, 15],
        "label": [1, 0, 1, 0, 1, 0, 0],
    }
)

# Keep "label" untouched and pass every column whose name starts with
# "feature" through SciPy's softmax as a whole NumPy array.
result = df.select(
    "label",
    cs.starts_with("feature").map_batches(
        lambda column: softmax(column.to_numpy()),
    ),
)
result

In [7]:
from sklearn.preprocessing import StandardScaler

def scale_temperature(group):
    """Add a ``scaled_feature`` column with the standardized temperatures.

    ``group`` is a DataFrame with a ``temperature`` column. A fresh
    ``StandardScaler`` is fitted on that column alone, so the scaling is
    computed from the rows of this frame only.
    """
    temperatures = group[["temperature"]].to_numpy()
    standardized = StandardScaler().fit_transform(temperatures)
    # fit_transform returns a 2-D array; flatten it to build a single column.
    scaled_column = pl.Series(name="scaled_feature", values=standardized.flatten())
    return group.with_columns(scaled_column)

# Temperature readings for two groups.
df = pl.DataFrame(
    {
        "group": ["USA", "USA", "USA", "USA", "NL", "NL", "NL"],
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Hand each group's rows to scale_temperature as a separate DataFrame.
grouped = df.group_by("group")
result = grouped.map_groups(scale_temperature)
result

In [8]:
# Same readings as in the previous example.
df = pl.DataFrame(
    {
        "group": ["USA", "USA", "USA", "USA", "NL", "NL", "NL"],
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Print each item produced by iterating over the group-by object.
for group in df.group_by(["group"]):
    print(group)

## Registering Your Own Namespace

In [10]:
@pl.api.register_expr_namespace("celsius")
class Celsius:
    """Expression namespace ``celsius`` with temperature conversions.

    The methods treat the wrapped expression as degrees Celsius.
    """

    def __init__(self, expr: pl.Expr):
        # Keep the expression the namespace was accessed on.
        self._expr = expr

    def to_fahrenheit(self) -> pl.Expr:
        """Return an expression for the value in degrees Fahrenheit."""
        scaled = self._expr * 9 / 5
        return scaled + 32

    def to_kelvin(self) -> pl.Expr:
        """Return an expression for the value in kelvin."""
        kelvin_offset = 273.15
        return self._expr + kelvin_offset

In [11]:
import polars as pl

# Temperatures in degrees Celsius.
df = pl.DataFrame({"celsius": [0, 10, 20, 30, 40]})

# Use the custom "celsius" namespace; the keyword name becomes the
# name of the new column.
df.with_columns(fahrenheit=pl.col("celsius").celsius.to_fahrenheit())

## Polars Plug-Ins in Rust

### Prerequisites

### The Anatomy of a Plug-in Project

### The Plug-in

In [16]:
from pathlib import Path

import polars as pl
from polars.plugins import register_plugin_function
from polars.type_aliases import IntoExpr


def hello_world(expr: IntoExpr) -> pl.Expr:
    """Build an expression that calls the plug-in function ``hello_world``.

    The plug-in is looked up in the directory containing this file, and
    the function is registered as elementwise.
    """
    plugin_dir = Path(__file__).parent
    return register_plugin_function(
        args=expr,
        plugin_path=plugin_dir,
        function_name="hello_world",
        is_elementwise=True,
    )

### Compiling the Plug-in

### Performance Benchmark

In [19]:
import polars as pl
from hello_world_func import hello_world  
import time

# Benchmark input: the strings "1".."4" repeated 100,000 times (400,000 rows).
df = pl.DataFrame(
    {
        "a": ["1", "2", "3", "4"] * 100_000,
    }
)

# Durations are measured with time.perf_counter(): it is monotonic and uses
# the highest-resolution clock available, whereas time.time() follows the
# wall clock and can jump if the system time is adjusted mid-run.

# Baseline: Polars' built-in regex replacement, averaged over 10 runs.
times = []
for i in range(10):
    t0 = time.perf_counter()
    out = df.with_columns(pl.col("a").str.replace_all(r".*", "Hello, world!"))
    t1 = time.perf_counter()
    times.append(t1 - t0)
print("Polars native string replace:        ", sum(times) / len(times))


# Candidate: the custom plug-in function, averaged over 10 runs.
times = []
for i in range(10):
    t0 = time.perf_counter()
    out = df.with_columns(hello_world("a"))
    t1 = time.perf_counter()
    times.append(t1 - t0)
print("Our custom made Hello world replace: ", sum(times) / len(times))

### Register Arguments

#### Working with Multiple `args` and `kwargs` as Input

In [22]:
def args_func(arg1: IntoExpr, arg2: IntoExpr) -> pl.Expr:
    """Build an expression that calls the plug-in function ``args_func``.

    Both inputs are forwarded, in order, as the plug-in's arguments. The
    plug-in is looked up in the directory containing this file.
    """
    plugin_dir = Path(__file__).parent
    plugin_inputs = [arg1, arg2]
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="args_func",
        args=plugin_inputs,
    )

In [23]:
def kwargs_func(
    expr: IntoExpr,
    float_arg: float,
    integer_arg: int,
    string_arg: str,
    boolean_arg: bool,
) -> pl.Expr:
    """Build an expression that calls the plug-in function ``kwargs_func``.

    ``expr`` is passed as the plug-in's input; the four remaining
    parameters are forwarded to the plug-in as keyword arguments under
    their own names. The plug-in is looked up in the directory containing
    this file.
    """
    plugin_dir = Path(__file__).parent
    plugin_kwargs = {
        "float_arg": float_arg,
        "integer_arg": integer_arg,
        "string_arg": string_arg,
        "boolean_arg": boolean_arg,
    }
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="kwargs_func",
        args=expr,
        kwargs=plugin_kwargs,
    )

#### Other Register Arguments

### Using a Rust Crate

### Use Case: Geo

#### Adding the `geo` Crate

#### The Rust Code: `expressions.rs`

#### The Python Code: `__init__.py`

In [30]:
from pathlib import Path

import polars as pl
from polars.plugins import register_plugin_function
from polars.type_aliases import IntoExpr


def point_in_polygon(point: IntoExpr, polygon: IntoExpr) -> pl.Expr:
    """Build an expression that calls the plug-in function ``point_in_polygon``.

    ``point`` and ``polygon`` are forwarded, in that order, as the plug-in's
    inputs, and the function is registered as elementwise. The plug-in is
    looked up in the directory containing this file.
    """
    plugin_dir = Path(__file__).parent
    plugin_inputs = [point, polygon]
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="point_in_polygon",
        args=plugin_inputs,
        is_elementwise=True,
    )

#### The Python Code: Using the Custom Namespace

In [32]:
import polars as pl
import coordinates_plugin_py as coord

@pl.api.register_expr_namespace("point")
class Point:
    """Expression namespace ``point`` wrapping the coordinates plug-in."""

    def __init__(self, input_expression: pl.Expr) -> None:
        # Keep the expression the namespace was accessed on; it is used
        # as the point argument of the plug-in call.
        self._input_expression = input_expression

    def is_in_polygon(self, polygon: pl.Expr) -> pl.Expr:
        """Return an expression testing the point against ``polygon``.

        ``polygon`` is an expression (for example ``pl.col("polygon")``)
        that is forwarded unchanged to ``coord.point_in_polygon`` together
        with the wrapped point expression.
        """
        return coord.point_in_polygon(self._input_expression, polygon)

In [33]:
# Sample data: one point and one polygon (a list of [x, y] vertices) per row.
df = pl.DataFrame(
    {
        "point": [
            [5.0, 5.0],
            [20.0, 20.0],
            [20.0, 20.0],
        ],
        "polygon": [
            # Four vertices.
            [[0.0, 0.0], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0]],
            # Only three vertices.
            [[0.0, 0.0], [10.0, 0.0], [10.0, 10.0]],
            # Five vertices, the first of which has a missing coordinate.
            [[0.0, None], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0], [0.0, 0.0]],
        ],
    }
)

In [34]:
# Check each point against the polygon in the same row via the custom
# "point" namespace; the keyword name becomes the name of the new column.
df.with_columns(
    is_in_polygon=pl.col("point").point.is_in_polygon(pl.col("polygon"))
)

## Conclusion