# Chapter 5: Data Types and Data Structures

In [1]:
import polars as pl
# Print the Polars version together with the platform, Python build, and
# optional-dependency versions of the environment this notebook ran in.
pl.show_versions()

--------Version info---------
Polars:               0.20.18
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.8 (main, Feb 22 2024, 20:44:55) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.3
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.0
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
xlsx2csv:             0.8.2
xlsxwriter:           3.1.9


## Arrow Data Types

### Nested Data Types

In [4]:
import polars as pl

# Two columns of nested integers; the schema below pins each one to a
# fixed-width Array dtype (width 2 and width 3 respectively).
array_columns = [
    pl.Series("array_1", [[1, 3], [2, 5]]),
    pl.Series("array_2", [[1, 7, 3], [8, 1, 0]]),
]
array_schema = {
    "array_1": pl.Array(width=2, inner=pl.Int64),
    "array_2": pl.Array(width=3, inner=pl.Int64),
}

array_df = pl.DataFrame(array_columns, schema=array_schema)
array_df

array_1,array_2
"array[i64, 2]","array[i64, 3]"
"[1, 3]","[1, 7, 3]"
"[2, 5]","[8, 1, 0]"


In [5]:
# Nested Python lists given without a schema: the columns come out as
# list[i64] and list[f64].
list_columns = {
    "integer_lists": [[1, 2], [3, 4]],
    "float_lists": [[1.0, 2.0], [3.0, 4.0]],
}

list_df = pl.DataFrame(list_columns)
list_df

integer_lists,float_lists
list[i64],list[f64]
"[1, 2]","[1.0, 2.0]"
"[3, 4]","[3.0, 4.0]"


In [6]:
# Each dict is one row; the three shared keys become the fields of a
# struct[3] Series.
movie_ratings = [
    {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5},
    {"Movie": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9},
]

rating_series = pl.Series("ratings", movie_ratings)
rating_series

ratings
struct[3]
"{""Cars"",""NE"",4.5}"
"{""Toy Story"",""ME"",4.9}"


### Missing Values

In [8]:
# Single integer column with nulls at the start, in the middle, and at the
# end, used by the fill examples that follow.
values_with_nulls = [None, 2, 3, 4, None, None, 7, 8, 9, None]

df = pl.DataFrame({"value": values_with_nulls})
print(df)

shape: (10, 1)
┌───────┐
│ value │
│ ---   │
│ i64   │
╞═══════╡
│ null  │
│ 2     │
│ 3     │
│ 4     │
│ null  │
│ null  │
│ 7     │
│ 8     │
│ 9     │
│ null  │
└───────┘


In [9]:
# Replace every null with the literal -1, keeping the original column
# alongside the filled one for comparison.
filled_with_lit = pl.col("value").fill_null(-1).alias("filled_with_lit")

print(df.with_columns(filled_with_lit))

shape: (10, 2)
┌───────┬─────────────────┐
│ value ┆ filled_with_lit │
│ ---   ┆ ---             │
│ i64   ┆ i64             │
╞═══════╪═════════════════╡
│ null  ┆ -1              │
│ 2     ┆ 2               │
│ 3     ┆ 3               │
│ 4     ┆ 4               │
│ null  ┆ -1              │
│ null  ┆ -1              │
│ 7     ┆ 7               │
│ 8     ┆ 8               │
│ 9     ┆ 9               │
│ null  ┆ -1              │
└───────┴─────────────────┘


In [10]:
# Apply each fill strategy to the same column; every result column is named
# after the strategy that produced it, in this order.
fill_strategies = ["forward", "backward", "min", "max", "mean", "zero", "one"]

strategy_columns = [
    pl.col("value").fill_null(strategy=strategy).alias(strategy)
    for strategy in fill_strategies
]

print(df.with_columns(strategy_columns))

shape: (10, 8)
┌───────┬─────────┬──────────┬─────┬─────┬──────┬──────┬─────┐
│ value ┆ forward ┆ backward ┆ min ┆ max ┆ mean ┆ zero ┆ one │
│ ---   ┆ ---     ┆ ---      ┆ --- ┆ --- ┆ ---  ┆ ---  ┆ --- │
│ i64   ┆ i64     ┆ i64      ┆ i64 ┆ i64 ┆ i64  ┆ i64  ┆ i64 │
╞═══════╪═════════╪══════════╪═════╪═════╪══════╪══════╪═════╡
│ null  ┆ null    ┆ 2        ┆ 2   ┆ 9   ┆ 5    ┆ 0    ┆ 1   │
│ 2     ┆ 2       ┆ 2        ┆ 2   ┆ 2   ┆ 2    ┆ 2    ┆ 2   │
│ 3     ┆ 3       ┆ 3        ┆ 3   ┆ 3   ┆ 3    ┆ 3    ┆ 3   │
│ 4     ┆ 4       ┆ 4        ┆ 4   ┆ 4   ┆ 4    ┆ 4    ┆ 4   │
│ null  ┆ 4       ┆ 7        ┆ 2   ┆ 9   ┆ 5    ┆ 0    ┆ 1   │
│ null  ┆ 4       ┆ 7        ┆ 2   ┆ 9   ┆ 5    ┆ 0    ┆ 1   │
│ 7     ┆ 7       ┆ 7        ┆ 7   ┆ 7   ┆ 7    ┆ 7    ┆ 7   │
│ 8     ┆ 8       ┆ 8        ┆ 8   ┆ 8   ┆ 8    ┆ 8    ┆ 8   │
│ 9     ┆ 9       ┆ 9        ┆ 9   ┆ 9   ┆ 9    ┆ 9    ┆ 9   │
│ null  ┆ 9       ┆ null     ┆ 2   ┆ 9   ┆ 5    ┆ 0    ┆ 1   │
└───────┴─────────┴──────────┴─────┴─────┴──────┴──────┴─────┘

In [11]:
# Fill nulls with the column mean written as an expression rather than a
# named strategy.
column_mean = pl.col("value").mean()
expression_mean = pl.col("value").fill_null(column_mean).alias("expression_mean")

print(df.with_columns(expression_mean))

shape: (10, 2)
┌───────┬─────────────────┐
│ value ┆ expression_mean │
│ ---   ┆ ---             │
│ i64   ┆ f64             │
╞═══════╪═════════════════╡
│ null  ┆ 5.5             │
│ 2     ┆ 2.0             │
│ 3     ┆ 3.0             │
│ 4     ┆ 4.0             │
│ null  ┆ 5.5             │
│ null  ┆ 5.5             │
│ 7     ┆ 7.0             │
│ 8     ┆ 8.0             │
│ 9     ┆ 9.0             │
│ null  ┆ 5.5             │
└───────┴─────────────────┘


In [12]:
# Interpolate the nulls that sit between known values (linear, per the
# Polars documentation).
interpolated_df = df.interpolate()

print(interpolated_df)

shape: (10, 1)
┌───────┐
│ value │
│ ---   │
│ f64   │
╞═══════╡
│ null  │
│ 2.0   │
│ 3.0   │
│ 4.0   │
│ 5.0   │
│ 6.0   │
│ 7.0   │
│ 8.0   │
│ 9.0   │
│ null  │
└───────┘


## Series, DataFrames, and LazyFrames

## Data Type Conversion

In [15]:
# Numeric identifiers stored as strings; report the frame's estimated size
# in bytes.
id_strings = ["1", "2", "3"]

string_df = pl.DataFrame({"id": id_strings})
print(string_df)
print(f"Estimated size: {string_df.estimated_size('b')} bytes")

shape: (3, 1)
┌─────┐
│ id  │
│ --- │
│ str │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
└─────┘
Estimated size: 3 bytes


In [16]:
# Cast the string identifiers to unsigned 8-bit integers and report the
# estimated size of the resulting frame in bytes.
id_as_u8 = pl.col("id").cast(pl.UInt8)

int_df = string_df.select(id_as_u8)
print(int_df)
print(f"Estimated size: {int_df.estimated_size('b')} bytes")

shape: (3, 1)
┌─────┐
│ id  │
│ --- │
│ u8  │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
└─────┘
Estimated size: 3 bytes


## Conclusion