# Chapter 15: Reshaping

In [1]:
import polars as pl
# Print the installed Polars version together with the versions of its
# optional dependencies.
pl.show_versions()

--------Version info---------
Polars:               0.20.31
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.9 (main, Apr  2 2024, 16:11:47) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.4
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.2
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
torch:                <not installed>
xlsx2csv:             0.8.2
xlsxwriter:           3.2.0


## Wide Versus Long DataFrames

In [3]:
import polars as pl

# Wide layout: one row per student, one column per subject.
df = pl.DataFrame(
    data={
        "student": ["Alice", "Bob", "Charlie"],
        "math": [85, 78, 92],
        "science": [90, 82, 85],
        "history": [88, 80, 87],
    }
)
df

student,math,science,history
str,i64,i64,i64
"""Alice""",85,90,88
"""Bob""",78,82,80
"""Charlie""",92,85,87


In [4]:
# Long layout: one row per (student, subject) pair with a single grade column.
df = pl.DataFrame({
    "student": [name for name in ("Alice", "Bob", "Charlie") for _ in range(3)],
    "subject": ["Math", "Science", "History"] * 3,
    "grade": [85, 90, 88, 78, 82, 80, 92, 85, 87],
})
df

student,subject,grade
str,str,i64
"""Alice""","""Math""",85
"""Alice""","""Science""",90
"""Alice""","""History""",88
"""Bob""","""Math""",78
"""Bob""","""Science""",82
"""Bob""","""History""",80
"""Charlie""","""Math""",92
"""Charlie""","""Science""",85
"""Charlie""","""History""",87


## Pivot to Wider DataFrame

In [6]:
import polars as pl

# Long-format grades: every student appears once per subject.
df = pl.from_dict(
    {
        "student": ["Alice"] * 3 + ["Bob"] * 3 + ["Charlie"] * 3,
        "subject": ["Math", "Science", "History"] * 3,
        "grade": [
            85, 90, 88,  # Alice
            78, 82, 80,  # Bob
            92, 85, 87,  # Charlie
        ],
    }
)

df

student,subject,grade
str,str,i64
"""Alice""","""Math""",85
"""Alice""","""Science""",90
"""Alice""","""History""",88
"""Bob""","""Math""",78
"""Bob""","""Science""",82
"""Bob""","""History""",80
"""Charlie""","""Math""",92
"""Charlie""","""Science""",85
"""Charlie""","""History""",87


In [7]:
# Spread the subjects into columns: one row per student, one grade column
# per distinct subject.
df.pivot(
    values="grade",
    index="student",
    columns="subject",
)

student,Math,Science,History
str,i64,i64,i64
"""Alice""",85,90,88
"""Bob""",78,82,80
"""Charlie""",92,85,87


In [8]:
# Each student has three grades per subject, so (student, subject) pairs repeat.
df = pl.DataFrame(
    {
        "student": ["Alice"] * 6 + ["Bob"] * 6,
        "subject": (["Math"] * 3 + ["Science"] * 3) * 2,
        "grade": [
            85, 88, 85, 60, 66, 63,  # Alice: Math x3, then Science x3
            51, 79, 62, 82, 85, 82,  # Bob: Math x3, then Science x3
        ],
    }
)

df

student,subject,grade
str,str,i64
"""Alice""","""Math""",85
"""Alice""","""Math""",88
"""Alice""","""Math""",85
"""Alice""","""Science""",60
"""Alice""","""Science""",66
…,…,…
"""Bob""","""Math""",79
"""Bob""","""Math""",62
"""Bob""","""Science""",82
"""Bob""","""Science""",85


In [9]:
# Several grades share the same (student, subject) pair, so collapse them
# into their mean while pivoting.
df.pivot(
    values="grade",
    index="student",
    columns="subject",
    aggregate_function="mean",
)

student,Math,Science
str,f64,f64
"""Alice""",86.0,63.0
"""Bob""",64.0,83.0


In [10]:
# Custom aggregation: the spread (highest minus lowest grade) within each
# (student, subject) group, expressed on `pl.element()`.
df.pivot(
    values="grade",
    index="student",
    columns="subject",
    aggregate_function=(pl.element().max() - pl.element().min()),
)

student,Math,Science
str,i64,i64
"""Alice""",3,6
"""Bob""",28,3


## Melt to Longer DataFrame

In [12]:
# Wide input for melting: one grade column per subject.
df = pl.from_dict(
    {
        "student": ["Alice", "Bob", "Charlie"],
        "math": [85, 78, 92],
        "science": [90, 82, 85],
        "history": [88, 80, 87],
    }
)
df

student,math,science,history
str,i64,i64,i64
"""Alice""",85,90,88
"""Bob""",78,82,80
"""Charlie""",92,85,87


In [13]:
# Gather the three subject columns into (subject, grade) pairs, keeping
# `student` as the identifier column.
df.melt(
    "student",
    ["math", "science", "history"],
    variable_name="subject",
    value_name="grade",
)

student,subject,grade
str,str,i64
"""Alice""","""math""",85
"""Bob""","""math""",78
"""Charlie""","""math""",92
"""Alice""","""science""",90
"""Bob""","""science""",82
"""Charlie""","""science""",85
"""Alice""","""history""",88
"""Bob""","""history""",80
"""Charlie""","""history""",87


In [14]:
# Two semesters of grades for the same three students; the first four
# columns describe the student/class, the last three hold grades.
df = pl.DataFrame(
    {
        "student": ["Alice", "Bob", "Charlie"] * 2,
        "class": ["Math101"] * 3 + ["Math102"] * 3,
        "age": [20, 21, 22] * 2,
        "semester": ["Fall"] * 3 + ["Spring"] * 3,
        "math": [85, 78, 92, 88, 79, 95],
        "science": [90, 82, 85, 92, 81, 87],
        "history": [88, 80, 87, 85, 82, 89],
    }
)
df

student,class,age,semester,math,science,history
str,str,i64,str,i64,i64,i64
"""Alice""","""Math101""",20,"""Fall""",85,90,88
"""Bob""","""Math101""",21,"""Fall""",78,82,80
"""Charlie""","""Math101""",22,"""Fall""",92,85,87
"""Alice""","""Math102""",20,"""Spring""",88,92,85
"""Bob""","""Math102""",21,"""Spring""",79,81,82
"""Charlie""","""Math102""",22,"""Spring""",95,87,89


In [15]:
# Keep all four descriptive columns as identifiers and melt only the
# subject columns into (subject, grade) pairs.
df.melt(
    id_vars=[
        "student",
        "class",
        "age",
        "semester",
    ],
    value_vars=[
        "math",
        "science",
        "history",
    ],
    variable_name="subject",
    value_name="grade",
)

student,class,age,semester,subject,grade
str,str,i64,str,str,i64
"""Alice""","""Math101""",20,"""Fall""","""math""",85
"""Bob""","""Math101""",21,"""Fall""","""math""",78
"""Charlie""","""Math101""",22,"""Fall""","""math""",92
"""Alice""","""Math102""",20,"""Spring""","""math""",88
"""Bob""","""Math102""",21,"""Spring""","""math""",79
…,…,…,…,…,…
"""Bob""","""Math101""",21,"""Fall""","""history""",80
"""Charlie""","""Math101""",22,"""Fall""","""history""",87
"""Alice""","""Math102""",20,"""Spring""","""history""",85
"""Bob""","""Math102""",21,"""Spring""","""history""",82


## Transposing

In [17]:
# Wide input for transposing: three student rows, four columns.
df = pl.DataFrame(
    dict(
        student=["Alice", "Bob", "Charlie"],
        math=[85, 78, 92],
        science=[90, 82, 85],
        history=[88, 80, 87],
    )
)
df

student,math,science,history
str,i64,i64,i64
"""Alice""",85,90,88
"""Bob""",78,82,80
"""Charlie""",92,85,87


In [18]:
# Transposing turns every original row into a column, so the number of new
# column names needed is the number of rows (`df.height`), not the number of
# original columns. Sizing by `len(df.columns)` only worked because the extra
# name was never consumed.
df.transpose(
    include_header=True,
    header_name="original_headers",  # column that holds the former column names
    column_names=[f"report_{count}" for count in range(1, df.height + 1)],
)

original_headers,report_1,report_2,report_3
str,str,str,str
"""student""","""Alice""","""Bob""","""Charlie"""
"""math""","""85""","""78""","""92"""
"""science""","""90""","""82""","""85"""
"""history""","""88""","""80""","""87"""


## Exploding

In [20]:
# Each student's math grades are stored together in a single list cell.
df = pl.from_dict(
    {
        "student": ["Alice", "Bob", "Charlie"],
        "math": [
            [85, 90, 88],
            [78, 82, 80],
            [92, 85, 87],
        ],
    }
)
df

student,math
str,list[i64]
"""Alice""","[85, 90, 88]"
"""Bob""","[78, 82, 80]"
"""Charlie""","[92, 85, 87]"


In [21]:
# Unnest the list column: one output row per list element, with the
# student name repeated alongside each grade.
df.explode(["math"])

student,math
str,i64
"""Alice""",85
"""Alice""",90
"""Alice""",88
"""Bob""",78
"""Bob""",82
"""Bob""",80
"""Charlie""",92
"""Charlie""",85
"""Charlie""",87


In [22]:
# Three list columns; note that Bob's science and history lists hold two
# grades while his math list holds three.
df = pl.DataFrame(
    data={
        "student": ["Alice", "Bob", "Charlie"],
        "math": [[85, 90, 88], [78, 82, 80], [92, 85, 87]],
        "science": [[85, 90, 88], [78, 82], [92, 85, 87]],
        "history": [[85, 90, 88], [78, 82], [92, 85, 87]],
    }
)
df

student,math,science,history
str,list[i64],list[i64],list[i64]
"""Alice""","[85, 90, 88]","[85, 90, 88]","[85, 90, 88]"
"""Bob""","[78, 82, 80]","[78, 82]","[78, 82]"
"""Charlie""","[92, 85, 87]","[92, 85, 87]","[92, 85, 87]"


In [23]:
# Intentionally failing example: Bob's `science` and `history` lists hold two
# grades while his `math` list holds three, so exploding the three columns
# together raises a ShapeError.
df.explode("math", "science", "history")

ShapeError: exploded columns must have matching element counts

In [24]:
# Minimal reproduction of the same failure: for id 1, `value1` has two
# elements but `value2` has only one.
df = pl.DataFrame({
    "id": [1,2],
    "value1": [["a", "b"], ["c"]],
    "value2": [["a"], ["b"]],
})
# Intentionally raises ShapeError because the element counts do not match.
df.explode("value1", "value2")

ShapeError: exploded columns must have matching element counts

In [25]:
# The two rows are nested to different depths (a flat list of strings vs. a
# list of lists). With strict=False the column is displayed below as
# list[list[str]], the first row appearing as [["a"], ["b"]].
df = pl.DataFrame({
    "id": [1,2],
    "nested_value": [["a", "b"], [["c"], ["d", "e"]]],
}, strict=False)
df

id,nested_value
i64,list[list[str]]
1,"[[""a""], [""b""]]"
2,"[[""c""], [""d"", ""e""]]"


In [26]:
# A single explode removes only the outer list level; the cells that
# remain are still lists of strings.
df.explode(columns="nested_value")

id,nested_value
i64,list[str]
1,"[""a""]"
1,"[""b""]"
2,"[""c""]"
2,"[""d"", ""e""]"


In [27]:
# Explode once per nesting level to end up with plain string values.
(
    df
    .explode("nested_value")
    .explode("nested_value")
)

id,nested_value
i64,str
1,"""a"""
1,"""b"""
2,"""c"""
2,"""d"""
2,"""e"""


## Partition into Multiple DataFrames

In [29]:
# Six orders, written one record per line and loaded row-wise.
df = pl.DataFrame(
    [
        (1, "A", 10, "North"),
        (2, "B", 5, "South"),
        (3, "A", 8, "North"),
        (4, "C", 7, "West"),
        (5, "B", 3, "South"),
        (6, "A", 12, "West"),
    ],
    schema=["OrderID", "Product", "Quantity", "Region"],
    orient="row",
)

In [30]:
# Split the orders into a list of DataFrames, one per distinct Region.
df.partition_by(["Region"])

[shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 1       ┆ A       ┆ 10       ┆ North  │
 │ 3       ┆ A       ┆ 8        ┆ North  │
 └─────────┴─────────┴──────────┴────────┘,
 shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 2       ┆ B       ┆ 5        ┆ South  │
 │ 5       ┆ B       ┆ 3        ┆ South  │
 └─────────┴─────────┴──────────┴────────┘,
 shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 4       ┆ C       ┆ 7        ┆ West   │
 │ 6       ┆ A       ┆ 12       ┆ West   │
 └─────────┴─────────┴──────────┴────────┘]

In [31]:
# Same split by Region, but leave the Region column out of each partition.
df.partition_by(
    ["Region"],
    include_key=False,
)

[shape: (2, 3)
 ┌─────────┬─────────┬──────────┐
 │ OrderID ┆ Product ┆ Quantity │
 │ ---     ┆ ---     ┆ ---      │
 │ i64     ┆ str     ┆ i64      │
 ╞═════════╪═════════╪══════════╡
 │ 1       ┆ A       ┆ 10       │
 │ 3       ┆ A       ┆ 8        │
 └─────────┴─────────┴──────────┘,
 shape: (2, 3)
 ┌─────────┬─────────┬──────────┐
 │ OrderID ┆ Product ┆ Quantity │
 │ ---     ┆ ---     ┆ ---      │
 │ i64     ┆ str     ┆ i64      │
 ╞═════════╪═════════╪══════════╡
 │ 2       ┆ B       ┆ 5        │
 │ 5       ┆ B       ┆ 3        │
 └─────────┴─────────┴──────────┘,
 shape: (2, 3)
 ┌─────────┬─────────┬──────────┐
 │ OrderID ┆ Product ┆ Quantity │
 │ ---     ┆ ---     ┆ ---      │
 │ i64     ┆ str     ┆ i64      │
 ╞═════════╪═════════╪══════════╡
 │ 4       ┆ C       ┆ 7        │
 │ 6       ┆ A       ┆ 12       │
 └─────────┴─────────┴──────────┘]

In [32]:
# Return the partitions as a dict; because the key columns are given as a
# list, each dict key is a tuple such as ('North',).
dfs = df.partition_by(
    by=["Region"],
    as_dict=True,
)
dfs

{('North',): shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 1       ┆ A       ┆ 10       ┆ North  │
 │ 3       ┆ A       ┆ 8        ┆ North  │
 └─────────┴─────────┴──────────┴────────┘,
 ('South',): shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 2       ┆ B       ┆ 5        ┆ South  │
 │ 5       ┆ B       ┆ 3        ┆ South  │
 └─────────┴─────────┴──────────┴────────┘,
 ('West',): shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 4       ┆ C       ┆ 7        ┆ West   │
 │ 6       ┆ A       ┆ 12       ┆ West   │
 └─────────┴─────────┴──────────┴────────┘}

In [33]:
# The dict keys are one-element tuples, so the lookup key must be a tuple too.
dfs[("North",)]

OrderID,Product,Quantity,Region
i64,str,i64,str
1,"""A""",10,"""North"""
3,"""A""",8,"""North"""


## Conclusion