# Chapter 12: Working with Special Data Types

In [1]:
import polars as pl
# Print the Polars version and the versions of its optional dependencies
# that this notebook was run with.
pl.show_versions()

--------Version info---------
Polars:               0.20.31
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.9 (main, Apr  2 2024, 16:11:47) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.4
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.2
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
torch:                <not installed>
xlsx2csv:             0.8.2
xlsxwriter:           3.2.0


## Strings

### Methods

#### Conversion

#### Descriptive and Query Methods

#### Manipulation

### Examples

In [8]:
import polars as pl

# Three sample strings carrying padding whitespace, an underscore and
# mixed casing.
df = pl.Series(
    "raw_text",
    [
        "  Data Science is amazing ",
        "Data_analysis > Data entry",
        " Python&Polars; Fast",
    ],
).to_frame()
print(df)

shape: (3, 1)
┌────────────────────────────┐
│ raw_text                   │
│ ---                        │
│ str                        │
╞════════════════════════════╡
│   Data Science is amazing  │
│ Data_analysis > Data entry │
│  Python&Polars; Fast       │
└────────────────────────────┘


In [9]:
# Derive a cleaned copy of raw_text as a new column; raw_text itself is
# left untouched. `df` is rebound to the two-column result.
df = df.with_columns(
    pl.col("raw_text")
    .str.strip_chars()  # <1>
    .str.to_lowercase()  # <2>
    .str.replace_all("_", " ")  # <3>
    .alias("processed_text")  # <4>
)
print(df)

shape: (3, 2)
┌────────────────────────────┬────────────────────────────┐
│ raw_text                   ┆ processed_text             │
│ ---                        ┆ ---                        │
│ str                        ┆ str                        │
╞════════════════════════════╪════════════════════════════╡
│   Data Science is amazing  ┆ data science is amazing    │
│ Data_analysis > Data entry ┆ data analysis > data entry │
│  Python&Polars; Fast       ┆ python&polars; fast        │
└────────────────────────────┴────────────────────────────┘


In [10]:
# Add three derived columns: a fixed-width prefix, and the first and second
# space-separated tokens of the cleaned text.
print(
    df.with_columns(
        first_5_chars=pl.col("processed_text").str.slice(0, 5),  # <1>
        first_word=(
            pl.col("processed_text")
            .str.split(" ")  # <2>
            .list.get(0)  # <3>
        ),
        second_word=(
            pl.col("processed_text")
            .str.split(" ")
            .list.get(1)  # <4>
        ),
    )
)

shape: (3, 5)
┌─────────────────────────┬─────────────────────────┬───────────────┬────────────────┬─────────────┐
│ raw_text                ┆ processed_text          ┆ first_5_chars ┆ first_word     ┆ second_word │
│ ---                     ┆ ---                     ┆ ---           ┆ ---            ┆ ---         │
│ str                     ┆ str                     ┆ str           ┆ str            ┆ str         │
╞═════════════════════════╪═════════════════════════╪═══════════════╪════════════════╪═════════════╡
│ Data Science is amazing ┆ data science is amazing ┆ data          ┆ data           ┆ science     │
│ Data_analysis > Data    ┆ data analysis > data    ┆ data          ┆ data           ┆ analysis    │
│ entry                   ┆ entry                   ┆               ┆                ┆             │
│  Python&Polars; Fast    ┆ python&polars; fast     ┆ pytho         ┆ python&polars; ┆ fast        │
└─────────────────────────┴─────────────────────────┴───────────────┴────────────────┴─────────────┘

In [11]:
# Measure each cleaned string: character count, byte count, and the number
# of matches of the pattern "a".
print(
    df.with_columns(
        amount_of_chars=pl.col("processed_text").str.len_chars(),  # <1>
        amount_of_bytes=pl.col("processed_text").str.len_bytes(),  # <2>
        count_a=pl.col("processed_text").str.count_matches("a"),  # <3>
    )
)

shape: (3, 5)
┌──────────────────────────┬─────────────────────────┬─────────────────┬─────────────────┬─────────┐
│ raw_text                 ┆ processed_text          ┆ amount_of_chars ┆ amount_of_bytes ┆ count_a │
│ ---                      ┆ ---                     ┆ ---             ┆ ---             ┆ ---     │
│ str                      ┆ str                     ┆ u32             ┆ u32             ┆ u32     │
╞══════════════════════════╪═════════════════════════╪═════════════════╪═════════════════╪═════════╡
│ Data Science is amazing  ┆ data science is amazing ┆ 23              ┆ 23              ┆ 4       │
│ Data_analysis > Data     ┆ data analysis > data    ┆ 26              ┆ 26              ┆ 6       │
│ entry                    ┆ entry                   ┆                 ┆                 ┆         │
│  Python&Polars; Fast     ┆ python&polars; fast     ┆ 19              ┆ 19              ┆ 2       │
└──────────────────────────┴─────────────────────────┴─────────────────┴─────────────────┴─────────┘

In [12]:
# Two posts: one with hashtags, one without.
df = pl.DataFrame(
    {
        "post": [
            "Loving #python and #polars!",
            "A boomer post without a hashtag",
        ]
    }
)

hashtag_regex = r"#(\w+)"  # <1>

# Collect every regex match per post into a list column.
df.with_columns(
    hashtags=pl.col("post").str.extract_all(hashtag_regex)  # <2>
)

post,hashtags
str,list[str]
"""Loving #python and #polars!""","[""#python"", ""#polars""]"
"""A boomer post without a hashta…",[]


## Categoricals

In [14]:
# Build a one-column frame whose column is declared as Categorical.
df1 = pl.DataFrame(
    data={"categorical_column": ["value1", "value2", "value3"]},
    schema={"categorical_column": pl.Categorical},
)

# Show each category next to the integer code that backs it.
print(
    df1.with_columns(
        categorical_column_physical=pl.col("categorical_column").to_physical()
    )
)

shape: (3, 2)
┌────────────────────┬─────────────────────────────┐
│ categorical_column ┆ categorical_column_physical │
│ ---                ┆ ---                         │
│ cat                ┆ u32                         │
╞════════════════════╪═════════════════════════════╡
│ value1             ┆ 0                           │
│ value2             ┆ 1                           │
│ value3             ┆ 2                           │
└────────────────────┴─────────────────────────────┘


### Methods

### Examples

In [17]:
# A second Categorical frame with overlapping values in a different order.
df2 = pl.DataFrame(
    data={"categorical_column": ["value4", "value3", "value2"]},
    schema={"categorical_column": pl.Categorical},
)

# Show each category next to the integer code that backs it.
print(
    df2.with_columns(
        categorical_column_physical=pl.col("categorical_column").to_physical()
    )
)

shape: (3, 2)
┌────────────────────┬─────────────────────────────┐
│ categorical_column ┆ categorical_column_physical │
│ ---                ┆ ---                         │
│ cat                ┆ u32                         │
╞════════════════════╪═════════════════════════════╡
│ value4             ┆ 0                           │
│ value3             ┆ 1                           │
│ value2             ┆ 2                           │
└────────────────────┴─────────────────────────────┘


In [18]:
# Inner join (the default `how`) on the Categorical key of the two frames
# built in the cells above.
df1.join(df2, on="categorical_column", how="inner")

  df1.join(df2, on="categorical_column")


categorical_column
cat
"""value3"""
"""value2"""


In [19]:
# Rebuild both frames inside a single StringCache context. df1 is created
# first, then df2; keep that order, since the cells below print the physical
# codes of df2.
with pl.StringCache():
    df1 = pl.DataFrame(
        {
            "categorical_column": ["value3", "value2", "value1"],
            "other": ["a", "b", "c"],
        },
        schema={"categorical_column": pl.Categorical, "other": pl.String},
    )
    df2 = pl.DataFrame(
        {
            "categorical_column": ["value2", "value3", "value4"],
            "other": ["d", "e", "f"],
        },
        schema={"categorical_column": pl.Categorical, "other": pl.String},
    )

# Even outside the global string cache's scope, you can now join the
# two dataframes containing Categorical columns
# Both frames have an "other" column; the displayed result shows the one
# from df2 as "other_right".
df1.join(df2, on="categorical_column")

categorical_column,other,other_right
cat,str,str
"""value2""","""b""","""d"""
"""value3""","""a""","""e"""


In [20]:
# Enable Polars' global string cache from this point on, as an alternative
# to wrapping frame construction in `with pl.StringCache():`.
pl.enable_string_cache()

In [21]:
# List the categories stored in df2's Categorical column.
df2.select(
    pl.col("categorical_column").cat.get_categories()
)

categorical_column
str
"""value2"""
"""value3"""
"""value4"""


In [22]:
# Pair each category of df2 with its physical (integer) representation so
# that the two sort orders can be compared in the next cells.
sorting_comparison_df = df2.select(
    categorical_lexical=pl.col("categorical_column")
).with_columns(
    categorical_physical=pl.col("categorical_lexical").to_physical()
)
print(sorting_comparison_df)

shape: (3, 2)
┌─────────────────────┬──────────────────────┐
│ categorical_lexical ┆ categorical_physical │
│ ---                 ┆ ---                  │
│ cat                 ┆ u32                  │
╞═════════════════════╪══════════════════════╡
│ value2              ┆ 1                    │
│ value3              ┆ 0                    │
│ value4              ┆ 3                    │
└─────────────────────┴──────────────────────┘


In [23]:
# Sort with the "physical" ordering, which is the default option.
print(
    sorting_comparison_df.with_columns(
        pl.col("categorical_lexical").cast(
            pl.Categorical(ordering="physical")
        )
    ).sort("categorical_lexical")
)

shape: (3, 2)
┌─────────────────────┬──────────────────────┐
│ categorical_lexical ┆ categorical_physical │
│ ---                 ┆ ---                  │
│ cat                 ┆ u32                  │
╞═════════════════════╪══════════════════════╡
│ value3              ┆ 0                    │
│ value2              ┆ 1                    │
│ value4              ┆ 3                    │
└─────────────────────┴──────────────────────┘


In [24]:
# Sort with the "lexical" ordering on the Categorical column.
print(
    sorting_comparison_df.with_columns(
        pl.col("categorical_lexical").cast(
            pl.Categorical(ordering="lexical")
        )
    ).sort("categorical_lexical")
)

shape: (3, 2)
┌─────────────────────┬──────────────────────┐
│ categorical_lexical ┆ categorical_physical │
│ ---                 ┆ ---                  │
│ cat                 ┆ u32                  │
╞═════════════════════╪══════════════════════╡
│ value2              ┆ 1                    │
│ value3              ┆ 0                    │
│ value4              ┆ 3                    │
└─────────────────────┴──────────────────────┘


### Enum

In [26]:
# An Enum dtype is declared with its categories up front.
enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])

# The same five values, stored once as an Enum series ...
enum_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=enum_dtype,
)

# ... and once as a Categorical series.
cat_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=pl.Categorical,
)

## Temporal Data

### Methods

#### Conversion

#### Descriptive

#### Manipulation

### Examples

#### Loading from CSV

In [34]:
# Read the CSV and let Polars try to parse date-like columns.
# NOTE(review): the recorded run raised FileNotFoundError, so the relative
# path data/all_stocks.csv must exist under the working directory.
pl.read_csv(
    "data/all_stocks.csv",
    try_parse_dates=True,
)

FileNotFoundError: No such file or directory (os error 2): data/all_stocks.csv

#### Converting to and from string

In [36]:
# Two ISO-formatted date strings.
df = pl.DataFrame({"date_str": ["2023-12-31", "2024-02-29"]})

# Parse the strings into a Date column using an explicit format.
df = df.with_columns(
    date=pl.col("date_str").str.strptime(pl.Date, "%Y-%m-%d")
)
print(df)

shape: (2, 2)
┌────────────┬────────────┐
│ date_str   ┆ date       │
│ ---        ┆ ---        │
│ str        ┆ date       │
╞════════════╪════════════╡
│ 2023-12-31 ┆ 2023-12-31 │
│ 2024-02-29 ┆ 2024-02-29 │
└────────────┴────────────┘


In [37]:
# Render the Date column back to text in day-month-year order.
df = df.with_columns(
    formatted_date=pl.col("date").dt.to_string("%d-%m-%Y")
)

print(df)

shape: (2, 3)
┌────────────┬────────────┬────────────────┐
│ date_str   ┆ date       ┆ formatted_date │
│ ---        ┆ ---        ┆ ---            │
│ str        ┆ date       ┆ str            │
╞════════════╪════════════╪════════════════╡
│ 2023-12-31 ┆ 2023-12-31 ┆ 31-12-2023     │
│ 2024-02-29 ┆ 2024-02-29 ┆ 29-02-2024     │
└────────────┴────────────┴────────────────┘


#### Generating Ranges

In [39]:
from datetime import date

# Weekly dates from 2023-12-31 up to 2024-01-15, materialised eagerly so the
# result can be placed directly in the frame.
df = pl.DataFrame({
    "date": pl.date_range(
        start=date(2023, 12, 31),  # <1>
        end=date(2024, 1, 15),
        interval="1w",  # <2>
        eager=True,  # <3>
    ),
})
print(df)

shape: (3, 1)
┌────────────┐
│ date       │
│ ---        │
│ date       │
╞════════════╡
│ 2023-12-31 │
│ 2024-01-07 │
│ 2024-01-14 │
└────────────┘


#### Time Zones

In [41]:
df = pl.DataFrame(  # <1>
    {
        "utc_mixed_offset_data": [
            "2021-03-27T00:00:00+0100",
            "2021-03-28T00:00:00+0100",
            "2021-03-29T00:00:00+0200",
            "2021-03-30T00:00:00+0200",
        ]
    }
)

# First parse the offset-carrying strings, then, in a second step that reads
# the freshly parsed column, convert them to the Europe/Amsterdam time zone.
df = df.with_columns(
    parsed_data=pl.col("utc_mixed_offset_data").str.to_datetime(
        "%Y-%m-%dT%H:%M:%S%z"  # <2>
    )
).with_columns(
    converted_data=pl.col("parsed_data").dt.convert_time_zone(
        "Europe/Amsterdam"  # <3>
    )
)
print(df)

shape: (4, 3)
┌──────────────────────────┬─────────────────────────┬────────────────────────────────┐
│ utc_mixed_offset_data    ┆ parsed_data             ┆ converted_data                 │
│ ---                      ┆ ---                     ┆ ---                            │
│ str                      ┆ datetime[μs, UTC]       ┆ datetime[μs, Europe/Amsterdam] │
╞══════════════════════════╪═════════════════════════╪════════════════════════════════╡
│ 2021-03-27T00:00:00+0100 ┆ 2021-03-26 23:00:00 UTC ┆ 2021-03-27 00:00:00 CET        │
│ 2021-03-28T00:00:00+0100 ┆ 2021-03-27 23:00:00 UTC ┆ 2021-03-28 00:00:00 CET        │
│ 2021-03-29T00:00:00+0200 ┆ 2021-03-28 22:00:00 UTC ┆ 2021-03-29 00:00:00 CEST       │
│ 2021-03-30T00:00:00+0200 ┆ 2021-03-29 22:00:00 UTC ┆ 2021-03-30 00:00:00 CEST       │
└──────────────────────────┴─────────────────────────┴────────────────────────────────┘


## List

### Methods

### Examples

In [45]:
# Lists of booleans of differing lengths.
bool_df = pl.DataFrame(
    {"values": [[True, True], [False, False, True], [False]]}
)

# Reduce each list to a single boolean: are all / any of its items true?
print(
    bool_df.with_columns(
        pl.col("values").list.all().alias("all values"),
        pl.col("values").list.any().alias("any values"),
    )
)

shape: (3, 3)
┌──────────────────────┬────────────┬────────────┐
│ values               ┆ all values ┆ any values │
│ ---                  ┆ ---        ┆ ---        │
│ list[bool]           ┆ bool       ┆ bool       │
╞══════════════════════╪════════════╪════════════╡
│ [true, true]         ┆ true       ┆ true       │
│ [false, false, true] ┆ false      ┆ true       │
│ [false]              ┆ false      ┆ false      │
└──────────────────────┴────────────┴────────────┘


In [46]:
# Lists of integers of differing lengths.
df = pl.DataFrame({"values": [[10, 20], [30, 40, 50], [60]]})

# Evaluate a comparison on every list element, then collapse each resulting
# list of booleans into one value.
print(
    df.with_columns(
        pl.col("values")
        .list.eval(
            pl.element().gt(40),  # <1>
            parallel=True,  # <2>
        )
        .alias("values > 40")
    ).with_columns(  # <3>
        pl.col("values > 40").list.all().alias("all values > 40")  # <4>
    )
)

shape: (3, 3)
┌──────────────┬──────────────────────┬─────────────────┐
│ values       ┆ values > 40          ┆ all values > 40 │
│ ---          ┆ ---                  ┆ ---             │
│ list[i64]    ┆ list[bool]           ┆ bool            │
╞══════════════╪══════════════════════╪═════════════════╡
│ [10, 20]     ┆ [false, false]       ┆ false           │
│ [30, 40, 50] ┆ [false, false, true] ┆ false           │
│ [60]         ┆ [true]               ┆ true            │
└──────────────┴──────────────────────┴─────────────────┘


In [47]:
# Give every list element its own row.
df.explode(columns="values")

values
i64
10
20
30
40
50
60


## Array

### Methods

### Examples

In [51]:
# One row per city with a fixed-size Array column of seven temperature
# readings.
df = pl.DataFrame([
    pl.Series(
        "location",
        ["Paris", "Amsterdam", "Barcelona"],
        dtype=pl.String
    ),
    pl.Series(
        "temperatures",
        [
            [23, 27, 21, 22, 24, 23, 22],
            [17, 19, 15, 22, 18, 20, 21],
            [30, 32, 28, 29, 34, 33, 31]
        ],
        # `shape` replaces the `width` keyword, which is deprecated in the
        # Polars version this notebook runs on and emitted a warning here.
        dtype=pl.Array(pl.Int64, shape=7),
    ),
])
print(df)

shape: (3, 2)
┌───────────┬────────────────┐
│ location  ┆ temperatures   │
│ ---       ┆ ---            │
│ str       ┆ array[i64, 7]  │
╞═══════════╪════════════════╡
│ Paris     ┆ [23, 27, … 22] │
│ Amsterdam ┆ [17, 19, … 21] │
│ Barcelona ┆ [30, 32, … 31] │
└───────────┴────────────────┘


  dtype=pl.Array(pl.Int64, width=7),


In [52]:
# Per-row aggregations over the fixed-size array: the median, the maximum,
# and the position of the maximum.
print(
    df.with_columns(
        median=pl.col("temperatures").arr.median(),
        max=pl.col("temperatures").arr.max(),
        warmest_weekday=pl.col("temperatures").arr.arg_max(),
    )
)

shape: (3, 5)
┌───────────┬────────────────┬────────┬─────┬─────────────────┐
│ location  ┆ temperatures   ┆ median ┆ max ┆ warmest_weekday │
│ ---       ┆ ---            ┆ ---    ┆ --- ┆ ---             │
│ str       ┆ array[i64, 7]  ┆ f64    ┆ i64 ┆ u32             │
╞═══════════╪════════════════╪════════╪═════╪═════════════════╡
│ Paris     ┆ [23, 27, … 22] ┆ 23.0   ┆ 27  ┆ 1               │
│ Amsterdam ┆ [17, 19, … 21] ┆ 19.0   ┆ 22  ┆ 3               │
│ Barcelona ┆ [30, 32, … 31] ┆ 31.0   ┆ 34  ┆ 4               │
└───────────┴────────────────┴────────┴─────┴─────────────────┘


## Structs

### Methods

### Examples

In [56]:
# A list of dicts with the same keys becomes a Struct column with fields
# "a" and "b".
df = pl.DataFrame(
    {
        "struct_column": [
            dict(a=1, b=2),
            dict(a=3, b=4),
            dict(a=5, b=6),
        ]
    }
)
print(df)

shape: (3, 1)
┌───────────────┐
│ struct_column │
│ ---           │
│ struct[2]     │
╞═══════════════╡
│ {1,2}         │
│ {3,4}         │
│ {5,6}         │
└───────────────┘


In [57]:
# Pull the single field "a" out of the struct as its own column.
df.select(
    pl.col("struct_column")
    .struct.field("a")
)

a
i64
1
3
5


In [58]:
# Expand the struct into one column per field; `df` is rebound to the result.
df = df.unnest(columns="struct_column")
print(df)

shape: (3, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1   ┆ 2   │
│ 3   ┆ 4   │
│ 5   ┆ 6   │
└─────┴─────┘


In [59]:
# Keep "a" and "b", and also pack them together into a struct column.
df.select(
    "a",
    "b",
    struct_column=pl.struct("a", "b"),
)

a,b,struct_column
i64,i64,struct[2]
1,2,"{1,2}"
3,4,"{3,4}"
5,6,"{5,6}"


In [60]:
# A single string column with repeated values to count in the next cells.
df = pl.Series(
    "fruit",
    ["cherry", "apple", "banana", "banana", "apple", "banana"],
).to_frame()
print(df)

shape: (6, 1)
┌────────┐
│ fruit  │
│ ---    │
│ str    │
╞════════╡
│ cherry │
│ apple  │
│ banana │
│ banana │
│ apple  │
│ banana │
└────────┘


In [61]:
# Count occurrences per fruit, sorted by count; the result is a single
# struct column.
print(df.select(pl.col("fruit").value_counts(sort=True)))

shape: (3, 1)
┌──────────────┐
│ fruit        │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {"banana",3} │
│ {"apple",2}  │
│ {"cherry",1} │
└──────────────┘


In [62]:
# Same count as before, with the struct expanded into separate "fruit" and
# "count" columns.
print(
    df
    .select(pl.col("fruit").value_counts(sort=True))
    .unnest("fruit")
)

shape: (3, 2)
┌────────┬───────┐
│ fruit  ┆ count │
│ ---    ┆ ---   │
│ str    ┆ u32   │
╞════════╪═══════╡
│ banana ┆ 3     │
│ apple  ┆ 2     │
│ cherry ┆ 1     │
└────────┴───────┘


## Conclusion