# Chapter 12: Working with Textual, Temporal, and Nested Data Types

In [None]:
import polars as pl
pl.__version__  # The book is built with Polars version 1.20.0

## String

### String Methods

#### String methods for conversion

#### String methods for describing and querying

#### String methods for manipulation

### String Examples

In [None]:
# Three deliberately messy sample strings (stray whitespace, underscores,
# punctuation) that the string examples below clean up and inspect.
raw_samples = [
    "  Data Science is amazing ",
    "Data_analysis > Data entry",
    " Python&Polars; Fast",
]
corpus = pl.DataFrame({"raw_text": raw_samples})

corpus

In [None]:
# Normalise the raw text one step at a time: trim both ends, lowercase,
# then turn every underscore into a space.
trimmed = pl.col("raw_text").str.strip_chars()
lowered = trimmed.str.to_lowercase()
cleaned = lowered.str.replace_all("_", " ")

corpus = corpus.with_columns(processed_text=cleaned)
corpus

In [None]:
# Split on single spaces once and reuse the list expression for both
# word columns; the slice takes the first five characters of the text.
words = pl.col("processed_text").str.split(" ")

corpus.with_columns(
    first_5_chars=pl.col("processed_text").str.slice(0, 5),
    first_word=words.list.get(0),
    second_word=words.list.get(1),
)

In [None]:
# Describe each processed string: length in characters, length in bytes,
# and how many times the letter "a" occurs.
processed = pl.col("processed_text")

corpus.with_columns(
    len_chars=processed.str.len_chars(),
    len_bytes=processed.str.len_bytes(),
    count_a=processed.str.count_matches("a"),
)

In [None]:
# Pattern for a hashtag: a "#" followed by one or more word characters.
hashtag_regex = r"#(\w+)"

post_texts = ["Loving #python and #polars!", "A boomer post without a hashtag"]
posts = pl.DataFrame({"post": post_texts})

# Collect every hashtag match per post into a list column.
posts.with_columns(hashtags=pl.col("post").str.extract_all(hashtag_regex))

## Categorical

In [None]:
# A Categorical column with one repeated value ("Lynx").
cat_names = ["Persian cat", "Siamese Cat", "Lynx", "Lynx"]
cats = pl.DataFrame({"name": cat_names}, schema={"name": pl.Categorical})

# Show the physical (integer) representation next to each category.
cats.with_columns(name_physical=pl.col("name").to_physical())

### Categorical Methods

### Categorical Examples

In [None]:
# A second Categorical frame whose values overlap with `cats` but appear
# in a different order.
more_cat_names = ["Maine Coon Cat", "Lynx", "Lynx", "Siamese Cat"]
more_cats = pl.DataFrame(
    {"name": more_cat_names}, schema={"name": pl.Categorical}
)

# Show the physical (integer) representation next to each category.
more_cats.with_columns(name_physical=pl.col("name").to_physical())

In [None]:
# Join the two frames on their Categorical `name` column.
# NOTE(review): `cats` and `more_cats` were created without a shared string
# cache — presumably this cell exists to show what Polars does in that
# case; confirm against the accompanying text.
cats.join(more_cats, on="name")

In [None]:
# Build both frames inside a single StringCache context so that their
# Categorical columns are created under the same cache.
with pl.StringCache():
    join_schema = {"categorical_column": pl.Categorical, "other": pl.String}
    left = pl.DataFrame(
        {
            "categorical_column": ["value3", "value2", "value1"],
            "other": ["a", "b", "c"],
        },
        schema=join_schema,
    )
    right = pl.DataFrame(
        {
            "categorical_column": ["value2", "value3", "value4"],
            "other": ["d", "e", "f"],
        },
        schema=join_schema,
    )

In [None]:
# Join the two frames that were built inside the same `pl.StringCache()`
# block on their Categorical key column.
left.join(right, on="categorical_column")

In [None]:
# Switch on Polars' global string cache (the alternative to wrapping code
# in a `pl.StringCache()` context manager).
pl.enable_string_cache()

In [None]:
# List the categories known to `right`'s Categorical column.
right.select(pl.col("categorical_column").cat.get_categories())

In [None]:
# Keep the Categorical column under a new name, then add its physical
# (integer) representation alongside it for comparison.
lexical_only = cats.select(cat_lexical=pl.col("name"))
sorting_comparison_df = lexical_only.with_columns(
    cat_physical=pl.col("cat_lexical").to_physical()
)

sorting_comparison_df

In [None]:
# sorting_comparison_df.with_columns(
#     pl.col("cat_lexical").cast(pl.Categorical("physical"))
# ).sort(by="cat_lexical")

# Physical ordering for Categoricals has since been deprecated; sorting is now always lexical.

In [None]:
# Cast to a Categorical that explicitly requests lexical ordering, then
# sort on that column.
lexical_dtype = pl.Categorical("lexical")
recast = sorting_comparison_df.with_columns(
    pl.col("cat_lexical").cast(lexical_dtype)
)
recast.sort(by="cat_lexical")

## Enum

In [None]:
# The same five values stored twice: once as an Enum with a fixed,
# predeclared set of categories, and once as a Categorical.
bear_names = ["Polar", "Panda", "Brown", "Brown", "Polar"]

bear_enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])
bear_enum_series = pl.Series(bear_names, dtype=bear_enum_dtype)

bear_cat_series = pl.Series(bear_names, dtype=pl.Categorical)

## Temporal

### Temporal Methods

#### Temporal methods for conversion

#### Temporal methods for describing and querying

#### Temporal methods for manipulation

### Temporal Examples

#### Loading from a CSV file

In [None]:
# Read the stocks CSV and ask Polars to try parsing date-like columns
# automatically instead of leaving them as strings.
pl.read_csv("data/all_stocks.csv", try_parse_dates=True)

#### Converting to and from a String

In [None]:
# Start from ISO-formatted date strings and parse them into a Date
# column using an explicit format.
date_strings = pl.DataFrame({"date_str": ["2023-12-31", "2024-02-29"]})
dates = date_strings.with_columns(
    date=pl.col("date_str").str.to_date("%Y-%m-%d")
)

dates

In [None]:
# Convert the Date column back to a string, this time in day-month-year order.
dates.with_columns(formatted_date=pl.col("date").dt.to_string("%d-%m-%Y"))

#### Generating date ranges

In [None]:
# Weekly dates from 2024-10-28 up to 2024-12-01; eager=True evaluates the
# range immediately so it can be used directly as column data.
mondays = pl.date_range(
    start=pl.date(2024, 10, 28),
    end=pl.date(2024, 12, 1),
    interval="1w",
    eager=True,
)

pl.DataFrame({"monday": mondays})

#### Time zones

In [None]:
# Timestamps whose UTC offset changes from +01:00 to +02:00 partway
# through the list.
offset_strings = [
    "2021-03-27T00:00:00+0100",
    "2021-03-28T00:00:00+0100",
    "2021-03-29T00:00:00+0200",
    "2021-03-30T00:00:00+0200",
]

# First parse the strings (the format's %z consumes the offset), then
# convert the parsed datetimes to the Europe/Amsterdam time zone.
parse_expr = pl.col("utc_mixed_offset").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
convert_expr = pl.col("parsed").dt.convert_time_zone("Europe/Amsterdam")

(
    pl.DataFrame({"utc_mixed_offset": offset_strings})
    .with_columns(parsed=parse_expr)
    .with_columns(converted=convert_expr)
)

## List

### List Methods

### List Examples

In [None]:
# Lists of booleans of varying length.
bool_lists = [[True, True], [False, False, True], [False]]
bools = pl.DataFrame({"values": bool_lists})

# Per row: are all elements true, and is at least one element true?
values = pl.col("values")
bools.with_columns(
    all_true=values.list.all(),
    any_true=values.list.any(),
)

In [None]:
groups = pl.DataFrame({"ages": [[18, 21], [30, 40, 50], [42, 69]]})

# Expression applied to every element inside each list.
is_over_forty = pl.element() > 40

# Step 1 builds a boolean list per row; step 2 checks whether every
# element of that list is true.
groups.with_columns(
    over_forty=pl.col("ages").list.eval(is_over_forty, parallel=True)
).with_columns(all_over_forty=pl.col("over_forty").list.all())

In [None]:
# Sort the values inside each list from largest to smallest.
descending_ages = pl.col("ages").list.sort(descending=True)
groups.with_columns(ages_sorted_descending=descending_ages)

In [None]:
# DataFrame-level explode: every element of each `ages` list gets its own row.
groups.explode("ages")

In [None]:
# Expression-level counterpart: explode the list column inside a select().
groups.select(ages=pl.col("ages").list.explode())

## Array

### Array Methods

### Array Examples

In [None]:
# Seven temperature readings per location, stored as a fixed-width Array
# column (width 7) rather than a variable-length List.
weekly_temperatures = [
    [23, 27, 21, 22, 24, 23, 22],
    [17, 19, 15, 22, 18, 20, 21],
    [30, 32, 28, 29, 34, 33, 31],
]

location_series = pl.Series(
    "location", ["Paris", "Amsterdam", "Barcelona"], dtype=pl.String
)
temperature_series = pl.Series(
    "temperatures",
    weekly_temperatures,
    dtype=pl.Array(pl.Int64, shape=7),
)

events = pl.DataFrame([location_series, temperature_series])

events

In [None]:
# Per-row summaries of the Array column: median, maximum, and the index
# at which the maximum occurs.
temps = pl.col("temperatures")

events.with_columns(
    median=temps.arr.median(),
    max=temps.arr.max(),
    warmest_dow=temps.arr.arg_max(),
)

## Struct

### Struct Methods

### Struct Examples

In [None]:
from datetime import date

# One dict per order; passing a list of dicts as column data yields a
# Struct column whose fields are amount, date and items.
detail_records = [
    {"amount": 250.00, "date": date(2024, 1, 3), "items": 5},
    {"amount": 150.00, "date": date(2024, 1, 5), "items": 1},
    {"amount": 100.00, "date": date(2024, 1, 2), "items": 3},
]

orders = pl.DataFrame(
    {
        "customer_id": [2781, 6139, 5392],
        "order_details": detail_records,
    },
)

orders

In [None]:
# Pull just the `amount` field out of the `order_details` struct column.
orders.select(pl.col("order_details").struct.field("amount"))

In [None]:
# Unnest the struct so that each of its fields becomes a top-level column.
order_details_df = orders.unnest("order_details")

order_details_df

In [None]:
# Keep the three flat columns and also pack them back together into a
# single struct column named `order_details`.
detail_fields = ["amount", "date", "items"]

order_details_df.select(
    *detail_fields,
    order_details=pl.struct(*[pl.col(field) for field in detail_fields]),
)

In [None]:
# A single column of fruit names with repeats, used for the counting
# examples that follow.
fruits = ["cherry", "apple", "banana", "banana", "apple", "banana"]
basket = pl.DataFrame({"fruit": fruits})

basket

In [None]:
# Count how often each fruit occurs, sorted by count; the result comes
# back as a single struct column.
basket.select(pl.col("fruit").value_counts(sort=True))

In [None]:
# Same counts, with the struct's fields spread out into separate columns.
basket.select(pl.col("fruit").value_counts(sort=True).struct.unnest())

## Takeaways