# Chapter 7: Beginning Expressions

In [1]:
import polars as pl
# Print the Polars version, platform details, and optional-dependency versions.
pl.show_versions()

--------Version info---------
Polars:               0.20.31
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.9 (main, Apr  2 2024, 16:11:47) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.4
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.2
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
torch:                <not installed>
xlsx2csv:             0.8.2
xlsxwriter:           3.2.0


## Methods and Namespaces

## Expressions by Example

In [4]:
# NOTE: polars is already imported in the first cell; repeating it is harmless.
import polars as pl

# Load the fruit dataset that the rest of the chapter works with, then display it.
fruit = pl.read_csv("data/fruit.csv")
fruit

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Avocado""",200,"""green""",False,"""South America"""
"""Banana""",120,"""yellow""",False,"""Asia"""
"""Blueberry""",1,"""blue""",False,"""North America"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Cranberry""",2,"""red""",False,"""North America"""
"""Elderberry""",1,"""black""",False,"""Europe"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


### Selecting Columns with Expressions

In [6]:
# Select columns four ways: by exact name, by regular expression, as a derived
# value (weight divided by 1000), and by a plain string.
fruit.select(
    pl.col("name"),  # <1>
    pl.col("^.*or.*$"),  # <2>
    pl.col("weight") / 1000,  # <3>
    "is_round"  # <4>
)

name,color,origin,weight,is_round
str,str,str,f64,bool
"""Avocado""","""green""","""South America""",0.2,False
"""Banana""","""yellow""","""Asia""",0.12,False
"""Blueberry""","""blue""","""North America""",0.001,False
"""Cantaloupe""","""orange""","""Africa""",2.5,True
"""Cranberry""","""red""","""North America""",0.002,False
"""Elderberry""","""black""","""Europe""",0.001,False
"""Orange""","""orange""","""Asia""",0.13,True
"""Papaya""","""orange""","""South America""",1.0,False
"""Peach""","""orange""","""Asia""",0.15,True
"""Watermelon""","""green""","""Africa""",5.0,True


### Creating New Columns with Expressions

In [8]:
# Append two boolean columns: a constant True literal, and a flag marking
# names that end in "berry".
fruit.with_columns(
    pl.lit(True).alias("is_fruit"),  # <1>
    pl.col("name").str.ends_with("berry").alias("is_berry")  # <2>
)

name,weight,color,is_round,origin,is_fruit,is_berry
str,i64,str,bool,str,bool,bool
"""Avocado""",200,"""green""",False,"""South America""",True,False
"""Banana""",120,"""yellow""",False,"""Asia""",True,False
"""Blueberry""",1,"""blue""",False,"""North America""",True,True
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",True,False
"""Cranberry""",2,"""red""",False,"""North America""",True,True
"""Elderberry""",1,"""black""",False,"""Europe""",True,True
"""Orange""",130,"""orange""",True,"""Asia""",True,False
"""Papaya""",1000,"""orange""",False,"""South America""",True,False
"""Peach""",150,"""orange""",True,"""Asia""",True,False
"""Watermelon""",5000,"""green""",True,"""Africa""",True,False


### Filtering Rows with Expressions

In [10]:
# Keep only rows that are round AND weigh more than 1000. The comparison is
# parenthesized because Python's & binds more tightly than >.
fruit.filter(
    pl.col("is_round") &  # <1>
    (pl.col("weight") > 1000)  # <2>
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


### Aggregating with Expressions

In [12]:
# Group on the last word of "origin" (so "North America" and "South America"
# share the "America" group), then count rows and average the weight per group.
# pl.len() replaces the deprecated pl.count(); the alias keeps the resulting
# column named "count" so the displayed table is unchanged.
fruit.group_by(
    pl.col("origin").str.split(" ").list.last()  # <1>
).agg(
    pl.len().alias("count"),  # <2>
    pl.col("weight").mean().alias("average_weight")  # <3>
)

  pl.count(),  # <2>


origin,count,average_weight
str,u32,f64
"""America""",4,300.75
"""Europe""",1,1.0
"""Asia""",3,133.333333
"""Africa""",2,3750.0


### Sorting Rows with Expressions

In [14]:
# Order the rows by the byte length of "name", longest names first.
fruit.sort(
    pl.col("name").str.len_bytes(),  # <1>
    descending=True  # <2>
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Elderberry""",1,"""black""",False,"""Europe"""
"""Watermelon""",5000,"""green""",True,"""Africa"""
"""Blueberry""",1,"""blue""",False,"""North America"""
"""Cranberry""",2,"""red""",False,"""North America"""
"""Avocado""",200,"""green""",False,"""South America"""
"""Banana""",120,"""yellow""",False,"""Asia"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""


## What Exactly Is an Expression?

In [16]:
# Multiply every column by 10 and append each result under "<name>_times_10".
numbers = pl.DataFrame({"a": [1, 2, 3], "b": [0.4, 0.5, 0.6]})
times_ten = (pl.all() * 10).name.suffix("_times_10")
numbers.with_columns(times_ten)

a,b,a_times_10,b_times_10
i64,f64,i64,f64
1,0.4,10,4.0
2,0.5,20,5.0
3,0.6,30,6.0


In [17]:
# Ask the expression's metadata whether it may expand to several output columns.
pl.all().mul(10).name.suffix("_times_10").meta.has_multiple_outputs()

True

### Properties of Expressions

In [19]:
# A reusable expression: True wherever the "color" column equals "orange".
is_orange = pl.col("color").eq("orange").alias("is_orange")

# Here the expression is used to add a new column.
fruit.with_columns(is_orange)

name,weight,color,is_round,origin,is_orange
str,i64,str,bool,str,bool
"""Avocado""",200,"""green""",False,"""South America""",False
"""Banana""",120,"""yellow""",False,"""Asia""",False
"""Blueberry""",1,"""blue""",False,"""North America""",False
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",True
"""Cranberry""",2,"""red""",False,"""North America""",False
"""Elderberry""",1,"""black""",False,"""Europe""",False
"""Orange""",130,"""orange""",True,"""Asia""",True
"""Papaya""",1000,"""orange""",False,"""South America""",True
"""Peach""",150,"""orange""",True,"""Asia""",True
"""Watermelon""",5000,"""green""",True,"""Africa""",False


In [20]:
# The same expression used as a row filter; no "is_orange" column is added.
fruit.filter(is_orange)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""


In [21]:
# The same expression used as a grouping key; the key column takes its alias.
fruit.group_by(is_orange).len()

is_orange,len
bool,u32
True,4
False,6


In [22]:
# A second, unrelated frame that also happens to have a "color" column.
flowers = pl.DataFrame({
    "name": ["Tiger lily", "Blue flag", "African marigold"],
    "latin": ["Lilium columbianum", "Iris versicolor", "Tagetes erecta"],
    "color": ["orange", "purple", "orange"]
})

# is_orange is not bound to the fruit frame, so it can filter this one too.
flowers.filter(is_orange)

name,latin,color
str,str,str
"""Tiger lily""","""Lilium columbianum""","""orange"""
"""African marigold""","""Tagetes erecta""","""orange"""


## Creating Expressions

### From Existing Columns

In [25]:
# Reference a single column by its exact name.
fruit.select(pl.col("color")).columns

['color']

In [26]:
# Intentional failure: "is_smelly" does not exist, so this raises ColumnNotFoundError.
fruit.select(pl.col("is_smelly")).columns

ColumnNotFoundError: is_smelly

In [27]:
# A name wrapped in ^...$ is treated as a regular expression; this one matches
# every column whose name contains "or".
fruit.select(pl.col("^.*or.*$")).columns

['color', 'origin']

In [28]:
# pl.all() references every column of the frame.
fruit.select(pl.all()).columns

['name', 'weight', 'color', 'is_round', 'origin']

In [29]:
# Passing a data type selects every column of that type.
fruit.select(pl.col(pl.String)).columns

['name', 'color', 'origin']

In [30]:
# Several data types can be given at once; the result keeps the frame's column
# order rather than the order of the arguments.
fruit.select(pl.col(pl.Boolean, pl.Int64)).columns

['weight', 'is_round']

In [31]:
# A list of names selects exactly those columns.
fruit.select(pl.col(["name", "color"])).columns

['name', 'color']

In [32]:
# Intentional failure: data types and column names cannot be mixed in one list,
# so this raises TypeError.
fruit.select(pl.col([pl.String, "is_round"])).columns

TypeError: argument 'dtypes': 'str' is not a Polars data type

In [33]:
# The set of data types Polars groups together as numeric.
# NOTE(review): newer Polars releases appear to deprecate this constant in
# favour of polars.selectors.numeric() - confirm before upgrading.
pl.NUMERIC_DTYPES

frozenset({Decimal,
           Float32,
           Float64,
           Int16,
           Int32,
           Int64,
           Int8,
           UInt16,
           UInt32,
           UInt64,
           UInt8})

In [34]:
# Add a derived float column, then keep only the numeric columns and preview them.
weight_kg = (pl.col("weight") / 1000).alias("weight_kg")
numeric_columns = pl.col(pl.NUMERIC_DTYPES)

fruit.with_columns(weight_kg).select(numeric_columns).head()

weight,weight_kg
i64,f64
200,0.2
120,0.12
1,0.001
2500,2.5
2,0.002


### From Literal Values

In [36]:
# An unnamed literal produces a column called "literal".
pl.select(pl.lit(42))

literal
i32
42


In [37]:
# alias() gives the literal column an explicit name.
pl.select(pl.lit(42).alias("answer"))

answer
i32
42


In [38]:
# A scalar literal is repeated on every row of the frame.
fruit.with_columns(pl.lit("Earth").alias("planet"))

name,weight,color,is_round,origin,planet
str,i64,str,bool,str,str
"""Avocado""",200,"""green""",False,"""South America""","""Earth"""
"""Banana""",120,"""yellow""",False,"""Asia""","""Earth"""
"""Blueberry""",1,"""blue""",False,"""North America""","""Earth"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","""Earth"""
"""Cranberry""",2,"""red""",False,"""North America""","""Earth"""
"""Elderberry""",1,"""black""",False,"""Europe""","""Earth"""
"""Orange""",130,"""orange""",True,"""Asia""","""Earth"""
"""Papaya""",1000,"""orange""",False,"""South America""","""Earth"""
"""Peach""",150,"""orange""",True,"""Asia""","""Earth"""
"""Watermelon""",5000,"""green""",True,"""Africa""","""Earth"""


In [39]:
# Intentional failure: a Series literal keeps its own length (2), which does not
# match the frame's height (10), so this raises ShapeError.
fruit.with_columns(pl.lit(pl.Series([False, True])).alias("row_is_even"))

ShapeError: unable to add a column of length 2 to a DataFrame of height 10

In [40]:
# A Python list literal becomes a single list[bool] value repeated on every
# row, not one element per row.
fruit.with_columns(pl.lit([False, True]).alias("row_is_even"))

name,weight,color,is_round,origin,row_is_even
str,i64,str,bool,str,list[bool]
"""Avocado""",200,"""green""",False,"""South America""","[false, true]"
"""Banana""",120,"""yellow""",False,"""Asia""","[false, true]"
"""Blueberry""",1,"""blue""",False,"""North America""","[false, true]"
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","[false, true]"
"""Cranberry""",2,"""red""",False,"""North America""","[false, true]"
"""Elderberry""",1,"""black""",False,"""Europe""","[false, true]"
"""Orange""",130,"""orange""",True,"""Asia""","[false, true]"
"""Papaya""",1000,"""orange""",False,"""South America""","[false, true]"
"""Peach""",150,"""orange""",True,"""Asia""","[false, true]"
"""Watermelon""",5000,"""green""",True,"""Africa""","[false, true]"


In [41]:
# Build three columns of length 3: a repeated string, zeros, and ones.
pl.select(
    pl.repeat("Ello", 3).alias("hello"),
    pl.zeros(3),
    pl.ones(3)
)

hello,zeros,ones
str,f64,f64
"""Ello""",0.0,1.0
"""Ello""",0.0,1.0
"""Ello""",0.0,1.0


In [42]:
# Intentional failure: 9 repeated values cannot fill a frame of height 10, so
# this raises ShapeError.
fruit.with_columns(pl.repeat("Earth", 9).alias("planet"))

ShapeError: unable to add a column of length 9 to a DataFrame of height 10

### From Ranges

In [44]:
# Two integer columns: 0..4, and the squares of 0, 2, 4, 6, 8.
starts = pl.int_range(0, 5).alias("start")
ends = pl.arange(0, 10, 2).pow(2).alias("end")

# Per row, the list of integers from "start" up to (but excluding) "end",
# followed by the length of that list.
ranges = pl.int_ranges("start", "end").alias("int_range")
range_lengths = pl.col("int_range").list.len().alias("range_length")

pl.select(starts, ends).with_columns(ranges).with_columns(range_lengths)

start,end,int_range,range_length
i64,i64,list[i64],u32
0,0,[],0
1,4,"[1, 2, 3]",3
2,16,"[2, 3, … 15]",14
3,36,"[3, 4, … 35]",33
4,64,"[4, 5, … 63]",60


In [45]:
from datetime import date

# Six consecutive start dates, each paired with the same end date.
start_dates = pl.date_range(date(1985, 10, 21), date(1985, 10, 26)).alias("start")
end_dates = pl.repeat(date(2021, 10, 21), 6).alias("end")

# Per row, an hourly list of datetimes running from "start" to "end".
hourly = pl.datetime_ranges("start", "end", interval="1h").alias("range")

pl.select(start_dates, end_dates).with_columns(hourly)

start,end,range
date,date,list[datetime[μs]]
1985-10-21,2021-10-21,"[1985-10-21 00:00:00, 1985-10-21 01:00:00, … 2021-10-21 00:00:00]"
1985-10-22,2021-10-21,"[1985-10-22 00:00:00, 1985-10-22 01:00:00, … 2021-10-21 00:00:00]"
1985-10-23,2021-10-21,"[1985-10-23 00:00:00, 1985-10-23 01:00:00, … 2021-10-21 00:00:00]"
1985-10-24,2021-10-21,"[1985-10-24 00:00:00, 1985-10-24 01:00:00, … 2021-10-21 00:00:00]"
1985-10-25,2021-10-21,"[1985-10-25 00:00:00, 1985-10-25 01:00:00, … 2021-10-21 00:00:00]"
1985-10-26,2021-10-21,"[1985-10-26 00:00:00, 1985-10-26 01:00:00, … 2021-10-21 00:00:00]"


### Other Functions to Create Expressions

## Renaming Expressions

In [48]:
# A one-row frame whose column names deliberately mix lower case, a space, and
# upper case, for the renaming examples in this section.
df = pl.DataFrame({"text": "value", "An integer": 5040, "BOOLEAN": True})
df


text,An integer,BOOLEAN
str,i64,bool
"""value""",5040,True


In [49]:
# Rename each column a different way: upper-case the name, replace it outright
# with alias(), and lower-case the name.
df.select(
    pl.col("text").name.to_uppercase(),
    pl.col("An integer").alias("int"),
    pl.col("BOOLEAN").name.to_lowercase(),
)

TEXT,int,boolean
str,i64,bool
"""value""",5040,True


In [50]:
# Apply one renaming function to every column: lower-case the name and replace
# spaces with underscores.
df.select(
    pl.all()
    .name.map(lambda s: s.lower().replace(" ", "_"))
)

text,an_integer,boolean
str,i64,bool
"""value""",5040,True


## Expressions Are Idiomatic

In [52]:
# Filtering with boolean Series taken directly from the frame instead of
# expressions; in eager mode this returns the same two rows.
fruit.filter(
    (fruit["weight"] > 1000) & fruit["is_round"]
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


In [53]:
# Expressions are only descriptions of work, so they can be built up front and
# handed to a lazy query that runs when collect() is called.
heavy_and_round = (pl.col("weight") > 1000) & pl.col("is_round")
is_berry = pl.col("name").str.ends_with("berry").alias("is_berry")

fruit.lazy().filter(heavy_and_round).with_columns(is_berry).collect()

name,weight,color,is_round,origin,is_berry
str,i64,str,bool,str,bool
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",False
"""Watermelon""",5000,"""green""",True,"""Africa""",False


In [54]:
# Intentional failure: fruit["name"] is a Series holding all 10 rows, but the
# filter has already reduced the frame to 2 rows, so adding the Series-derived
# column raises ShapeError.
(
    fruit
    .lazy()
    .filter((fruit["weight"] > 1000) & fruit["is_round"])
    .with_columns(fruit["name"].str.ends_with("berry").alias("is_berry"))
    .collect()
)

ShapeError: unable to add a column of length 10 to a DataFrame of height 2

## Conclusion