# Chapter 4: Data Types and Data Structures

In [1]:
import polars as pl
# Report the installed Polars version for reproducibility.
# The book is built with Polars version 1.0.0.
pl.show_versions()

## Data Types

### Nested Data Types

In [4]:
import polars as pl

# Build a frame with two fixed-size Array columns.
# Every row of "array_1" holds exactly 2 integers and every row of "array_2"
# holds exactly 3; the schema below states those sizes explicitly.
# `shape=` is used instead of `width=` because the `width` parameter of
# `pl.Array` is deprecated and emits a DeprecationWarning.
array_df = pl.DataFrame(
    [
        pl.Series("array_1", [[1, 3], [2, 5]]),
        pl.Series("array_2", [[1, 7, 3], [8, 1, 0]]),
    ],
    schema={
        "array_1": pl.Array(pl.Int64, shape=2),
        "array_2": pl.Array(pl.Int64, shape=3),
    },
)
array_df

In [5]:
# Two columns of nested Python lists and no explicit schema:
# Polars infers the column data types from the values.
list_df = pl.DataFrame(
    dict(
        integer_lists=[[1, 2], [3, 4]],
        float_lists=[[1.0, 2.0], [3.0, 4.0]],
    )
)
list_df

In [6]:
# A Series built from dicts: each dict is one row, and its keys
# ("Movie", "Theatre", "Avg_Rating") are shared by every row.
rating_series = pl.Series(
    name="ratings",
    values=[
        dict(Movie="Cars", Theatre="NE", Avg_Rating=4.5),
        dict(Movie="Toy Story", Theatre="ME", Avg_Rating=4.9),
    ],
)
rating_series

### Missing Values

In [8]:
# Ten values, four of which are missing (None): one at the start,
# two adjacent in the middle, and one at the end.
df = pl.DataFrame({"value": [None, 2, 3, 4, None, None, 7, 8, 9, None]})
print(df)

In [9]:
# Add a "filled_with_lit" column in which every null of "value" is replaced
# by the literal -1. The keyword name plays the role of `.alias(...)`.
print(
    df.with_columns(
        filled_with_lit=pl.col("value").fill_null(-1),
    )
)

In [10]:
# Add one column per fill strategy, each named after the strategy it uses,
# so the results can be compared side by side against the original "value".
print(
    df.with_columns(
        [
            pl.col("value").fill_null(strategy=strategy).alias(strategy)
            for strategy in (
                "forward",
                "backward",
                "min",
                "max",
                "mean",
                "zero",
                "one",
            )
        ]
    )
)

In [11]:
# The fill value can itself be an expression: here every null is replaced
# by the mean of the "value" column, stored as "expression_mean".
print(
    df.with_columns(
        expression_mean=pl.col("value").fill_null(pl.col("value").mean()),
    )
)

In [12]:
# Fill missing values by interpolating between the known neighbours
# (DataFrame.interpolate uses linear interpolation).
print(df.interpolate())

## Series, DataFrames, and LazyFrames

## Data Type Conversion

In [15]:
# Numeric identifiers stored as strings; print the frame followed by its
# estimated in-memory size in bytes as a baseline for the cast below.
string_df = pl.Series("id", ["10000", "20000", "30000"]).to_frame()
print(
    string_df,
    f"Estimated size: {string_df.estimated_size('b')} bytes",
    sep="\n",
)

In [16]:
# Parse the string ids into 16-bit unsigned integers (all three values are
# below the UInt16 maximum of 65535) and report the estimated size again.
int_df = string_df.select(
    pl.col("id").cast(pl.UInt16),
)
print(
    int_df,
    f"Estimated size: {int_df.estimated_size('b')} bytes",
    sep="\n",
)

In [17]:
# NOTE: this rebinds `df`, which earlier in the notebook held the
# missing-values example. The cast cells that follow use this new frame.
df = pl.DataFrame(
    dict(
        id=[10000, 20000, 30000],
        value=[1.0, 2.0, 3.0],
        value2=["1", "2", "3"],
    )
)
# Passing a single data type casts every column of the frame to it.
df.cast(pl.UInt16)

In [18]:
# Cast per column: map each column name to its own target data type.
df.cast(
    {
        "id": pl.UInt16,
        "value": pl.Float32,
        "value2": pl.UInt8,
    }
)

In [19]:
# Cast per data type: the keys are source data types, so every Float64
# column becomes Float32 and every String column becomes UInt8.
df.cast(
    {
        pl.Float64: pl.Float32,
        pl.String: pl.UInt8,
    }
)

In [20]:
import polars.selectors as cs
# Cast per selector: the key is a column selector, so every column matched
# by cs.numeric() is cast to UInt16.
df.cast(
    {
        cs.numeric(): pl.UInt16,
    }
)

## Takeaways