# Chapter 12: Working with Special Data Types

In [1]:
import polars as pl
# Report the versions in use so results can be compared with the book's setup.
pl.show_versions()  # The book is built with Polars version 1.0.0

## Strings

### Methods

#### Conversion

#### Descriptive and Query Methods

#### Manipulation

### Examples

In [8]:
import polars as pl

# Sample strings with stray surrounding whitespace, mixed case, and an
# underscore; the following cells clean these up.
df = pl.Series(
    "raw_text",
    [
        "  Data Science is amazing ",
        "Data_analysis > Data entry",
        " Python&Polars; Fast",
    ],
).to_frame()
print(df)

In [9]:
# Clean `raw_text` into `processed_text`:
#   1. strip_chars()         - with no argument, trims leading/trailing whitespace
#   2. to_lowercase()        - normalise case
#   3. replace_all("_", " ") - turn every underscore into a space
df = df.with_columns(
    processed_text=(
        pl.col("raw_text")
        .str.strip_chars()
        .str.to_lowercase()
        .str.replace_all("_", " ")
    )
)
print(df)

In [10]:
# Derive three columns from the cleaned text: the first five characters, and
# the first and second items obtained by splitting on a single space.
print(
    df.with_columns(
        first_5_chars=pl.col("processed_text").str.slice(0, 5),
        first_word=pl.col("processed_text").str.split(" ").list.get(0),
        second_word=pl.col("processed_text").str.split(" ").list.get(1),
    )
)

In [11]:
# Descriptive string metrics: character count, byte count, and the number of
# matches of the pattern "a".
print(
    df.with_columns(
        amount_of_chars=pl.col("processed_text").str.len_chars(),
        amount_of_bytes=pl.col("processed_text").str.len_bytes(),
        count_a=pl.col("processed_text").str.count_matches("a"),
    )
)

In [12]:
df = pl.DataFrame(
    {"post": ["Loving #python and #polars!", "A boomer post without a hashtag"]}
)

# A literal "#" followed by one or more word characters.
hashtag_regex = r"#(\w+)"

# Collect every match per post into a list column named `hashtags`.
df.with_columns(hashtags=pl.col("post").str.extract_all(hashtag_regex))

## Categoricals

In [14]:
# A single Categorical column; no string cache is active when it is built.
df1 = pl.DataFrame(
    data={"categorical_column": ["value1", "value2", "value3"]},
    schema={"categorical_column": pl.Categorical},
)

# Show the physical (integer) representation next to the category labels.
print(
    df1.with_columns(
        categorical_column_physical=pl.col("categorical_column").to_physical()
    )
)

### Methods

### Examples

In [17]:
# A second, independently built Categorical column whose values only partly
# overlap with those of df1.
df2 = pl.DataFrame(
    data={"categorical_column": ["value4", "value3", "value2"]},
    schema={"categorical_column": pl.Categorical},
)

# Show the physical (integer) representation next to the category labels.
print(
    df2.with_columns(
        categorical_column_physical=pl.col("categorical_column").to_physical()
    )
)

In [18]:
# Join two frames whose Categorical columns were each built on their own,
# without a shared string cache.
# NOTE(review): on Polars 1.0 this presumably emits a categorical remapping
# warning rather than raising - confirm against the book's output.
df1.join(df2, on="categorical_column")

In [19]:
# Build both frames while one string cache is active. df1 is created before
# df2 on purpose: creation order is kept exactly as in the original cell.
with pl.StringCache():
    df1 = pl.DataFrame(
        data={
            "categorical_column": ["value3", "value2", "value1"],
            "other": ["a", "b", "c"],
        },
        schema={"categorical_column": pl.Categorical, "other": pl.String},
    )
    df2 = pl.DataFrame(
        data={
            "categorical_column": ["value2", "value3", "value4"],
            "other": ["d", "e", "f"],
        },
        schema={"categorical_column": pl.Categorical, "other": pl.String},
    )

# The `with` block has ended, but the two Categorical columns were created
# under the same cache, so they can be joined here.
df1.join(other=df2, on="categorical_column")

In [20]:
# Switch the string cache on globally instead of scoping it with
# `with pl.StringCache():`.
pl.enable_string_cache()

In [21]:
# List the categories of df2's Categorical column.
df2.select(pl.col("categorical_column").cat.get_categories())

In [22]:
# Keep the categorical column under a new name and put its physical
# (integer) representation beside it, ready for the sorting comparison.
sorting_comparison_df = df2.select(
    categorical_lexical=pl.col("categorical_column")
).with_columns(
    categorical_physical=pl.col("categorical_lexical").to_physical()
)
print(sorting_comparison_df)

In [23]:
# Sort with "physical" ordering (the default option for Categorical).
physical_sorted = None  # placeholder removed below; see expression in print
del physical_sorted
print(
    sorting_comparison_df.with_columns(
        pl.col("categorical_lexical").cast(pl.Categorical(ordering="physical"))
    ).sort("categorical_lexical")
)

In [24]:
# Sort with "lexical" ordering, i.e. by the category strings themselves.
print(
    sorting_comparison_df.with_columns(
        pl.col("categorical_lexical").cast(pl.Categorical(ordering="lexical"))
    ).sort("categorical_lexical")
)

### Enum

In [26]:
# An Enum declares its full set of categories up front.
enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])

# The same five values stored once as an Enum and once as a Categorical.
enum_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=enum_dtype,
)
cat_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=pl.Categorical,
)

## Temporal Data

### Methods

#### Conversion

#### Descriptive

#### Manipulation

### Examples

#### Loading from CSV

In [34]:
# Read the CSV (path is relative to the notebook's working directory) and ask
# Polars to try parsing date-like columns into temporal dtypes.
pl.read_csv("data/all_stocks.csv", try_parse_dates=True)

#### Converting to and from string

In [36]:
df = pl.DataFrame({"date_str": ["2023-12-31", "2024-02-29"]})

# Parse the ISO-style strings into a Date column using an explicit format.
df = df.with_columns(
    date=pl.col("date_str").str.strptime(pl.Date, format="%Y-%m-%d")
)
print(df)

In [37]:
# Render the Date column back to text in day-month-year order.
df = df.with_columns(
    formatted_date=pl.col("date").dt.to_string("%d-%m-%Y")
)

print(df)

#### Generating Ranges

In [39]:
from datetime import date

# Weekly dates from 2023-12-31 up to 2024-01-15. `eager=True` makes
# date_range return a Series rather than an expression, so it can be used
# directly as a column in the DataFrame constructor.
df = pl.DataFrame(
    {
        "date": pl.date_range(
            date(2023, 12, 31),
            date(2024, 1, 15),
            interval="1w",
            eager=True,
        )
    }
)
print(df)

#### Time Zones

In [41]:
# Timestamps as strings whose UTC offset changes from +0100 to +0200 partway
# through the column.
df = pl.DataFrame(
    {
        "utc_mixed_offset_data": [
            "2021-03-27T00:00:00+0100",
            "2021-03-28T00:00:00+0100",
            "2021-03-29T00:00:00+0200",
            "2021-03-30T00:00:00+0200",
        ]
    }
)

# Step 1: parse the strings (the trailing %z consumes the offset).
# Step 2: convert the parsed datetimes to the Europe/Amsterdam time zone.
df = df.with_columns(
    parsed_data=pl.col("utc_mixed_offset_data").str.to_datetime(
        "%Y-%m-%dT%H:%M:%S%z"
    )
).with_columns(
    converted_data=pl.col("parsed_data").dt.convert_time_zone("Europe/Amsterdam")
)
print(df)

## List

### Methods

### Examples

In [45]:
bool_df = pl.DataFrame(
    {"values": [[True, True], [False, False, True], [False]]}
)

# Per row: are all list elements true, and is at least one element true?
print(
    bool_df.with_columns(
        pl.col("values").list.all().alias("all values"),
        pl.col("values").list.any().alias("any values"),
    )
)

In [46]:
df = pl.DataFrame({"values": [[10, 20], [30, 40, 50], [60]]})

# First compare every list element with 40 (pl.element() refers to the
# individual elements inside list.eval), then reduce each resulting boolean
# list with list.all().
print(
    df.with_columns(
        pl.col("values")
        .list.eval(pl.element().gt(40), parallel=True)
        .alias("values > 40")
    ).with_columns(
        pl.col("values > 40").list.all().alias("all values > 40")
    )
)

In [47]:
# Unpack the list column so that each list element ends up on its own row.
df.explode("values")

## Array

### Methods

### Examples

In [51]:
# One row per city, each with a fixed-size Array of seven temperature
# readings.
df = pl.DataFrame([
    pl.Series(
        "location",
        ["Paris", "Amsterdam", "Barcelona"],
        dtype=pl.String
    ),
    pl.Series(
        "temperatures",
        [
            [23, 27, 21, 22, 24, 23, 22],
            [17, 19, 15, 22, 18, 20, 21],
            [30, 32, 28, 29, 34, 33, 31]
        ],
        # `shape` is the supported way to give the array size; the older
        # `width` keyword is deprecated since Polars 0.20.31 and would emit a
        # DeprecationWarning on the Polars 1.0.0 this book targets.
        dtype=pl.Array(pl.Int64, shape=7),
    ),
])
print(df)

In [52]:
# Per-row aggregations over the fixed-size array: the median, the maximum,
# and the (zero-based) position of the maximum.
print(
    df.with_columns(
        median=pl.col("temperatures").arr.median(),
        max=pl.col("temperatures").arr.max(),
        warmest_weekday=pl.col("temperatures").arr.arg_max(),
    )
)

## Structs

### Methods

### Examples

In [56]:
# A list of dicts with the same keys becomes a single Struct column with
# fields `a` and `b`.
df = pl.DataFrame(
    {
        "struct_column": [
            dict(a=1, b=2),
            dict(a=3, b=4),
            dict(a=5, b=6),
        ]
    }
)
print(df)

In [57]:
# Pull field `a` out of the struct column as a column of its own.
df.select(pl.col("struct_column").struct.field("a"))

In [58]:
# Replace the struct column with one column per struct field.
# NOTE(review): `df` is overwritten, so re-running this cell on its own would
# presumably fail because `struct_column` no longer exists - confirm.
df = df.unnest("struct_column")
print(df)

In [59]:
# Keep `a` and `b` and additionally pack them into a new struct column.
# Strings passed to pl.struct are parsed as column names.
df.select(
    pl.col("a"),
    pl.col("b"),
    struct_column=pl.struct("a", "b"),
)

In [60]:
# A single string column with repeated values, used for counting below.
df = pl.Series(
    "fruit",
    ["cherry", "apple", "banana", "banana", "apple", "banana"],
).to_frame()
print(df)

In [61]:
# value_counts yields a struct column (the value plus its count);
# sort=True orders the result by count, most frequent first.
fruit_counts_expr = pl.col("fruit").value_counts(sort=True)
print(df.select(fruit_counts_expr))
del fruit_counts_expr

In [62]:
# Same count as before, but unnest the resulting struct so the value and its
# count become two ordinary columns.
print(
    df.select(pl.col("fruit").value_counts(sort=True)).unnest("fruit")
)

## Conclusion