# Chapter 5: Data Types and Data Structures

In [1]:
# Import Polars under its conventional alias and print the installed versions
# of Polars and its optional dependencies, so readers can compare their
# environment with the one that produced the outputs shown in this chapter.
import polars as pl
pl.show_versions()

--------Version info---------
Polars:               0.20.7
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.7 (main, Jan 16 2024, 14:42:22) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.2
numpy:                1.26.3
openpyxl:             3.1.2
pandas:               2.1.4
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
xlsx2csv:             0.8.2
xlsxwriter:           3.1.9


## Arrow Data Types

### Nested Data Types

In [4]:
import polars as pl

# Fixed-width Array columns: every row of "array_1" holds exactly two Int64
# values and every row of "array_2" holds exactly three. The explicit schema
# is what requests the Array dtype for these nested Python lists.
array_df = pl.DataFrame(
    data=[
        pl.Series(name="array_1", values=[[1, 3], [2, 5]]),
        pl.Series(name="array_2", values=[[1, 7, 3], [8, 1, 0]]),
    ],
    schema=dict(
        array_1=pl.Array(inner=pl.Int64, width=2),
        array_2=pl.Array(inner=pl.Int64, width=3),
    ),
)
array_df

array_1,array_2
"array[i64, 2]","array[i64, 3]"
"[1, 3]","[1, 7, 3]"
"[2, 5]","[8, 1, 0]"


In [5]:
# List columns: no schema is supplied, so Polars infers the column dtypes
# (list of integers, list of floats) from the nested Python values.
list_df = pl.DataFrame(
    data=dict(
        integer_lists=[[1, 2], [3, 4]],
        float_lists=[[1.0, 2.0], [3.0, 4.0]],
    )
)
list_df

integer_lists,float_lists
list[i64],list[f64]
"[1, 2]","[1.0, 2.0]"
"[3, 4]","[3.0, 4.0]"


In [6]:
# Struct Series: each Python dict becomes one struct value whose fields are
# the dict keys (Movie, Theatre, Avg_Rating), in that order.
rating_series = pl.Series(
    name="ratings",
    values=[
        dict(Movie="Cars", Theatre="NE", Avg_Rating=4.5),
        dict(Movie="Toy Story", Theatre="ME", Avg_Rating=4.9),
    ],
)
rating_series

ratings
struct[3]
"{""Cars"",""NE"",4.5}"
"{""Toy Story"",""ME"",4.9}"


## Series, DataFrame, and LazyFrame

## Data Type Conversion

In [9]:
# Baseline for the conversion example: the ids are stored as strings.
# Print the frame and its estimated in-memory size in bytes.
string_df = pl.DataFrame(data={"id": ["1", "2", "3"]})
print(string_df)
print(f"Estimated size: {string_df.estimated_size(unit='b')} bytes")

shape: (3, 1)
┌─────┐
│ id  │
│ --- │
│ str │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
└─────┘
Estimated size: 48 bytes


In [10]:
# Cast the string ids to unsigned 8-bit integers and report the size again,
# for comparison with the string-typed frame.
int_df = string_df.select(
    pl.col("id").cast(dtype=pl.UInt8)
)
print(int_df)
print(f"Estimated size: {int_df.estimated_size(unit='b')} bytes")

shape: (3, 1)
┌─────┐
│ id  │
│ --- │
│ u8  │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
└─────┘
Estimated size: 3 bytes


## Conclusion