In [3]:
# chapter 7

import polars as pl

fruit = pl.read_csv("data/fruit.csv")
fruit

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Avocado""",200,"""green""",False,"""South America"""
"""Banana""",120,"""yellow""",False,"""Asia"""
"""Blueberry""",1,"""blue""",False,"""North America"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Cranberry""",2,"""red""",False,"""North America"""
"""Elderberry""",1,"""black""",False,"""Europe"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


In [4]:
# selecting with and without expressions

fruit.select(
    pl.col("name"),  # this is an expression i guess?
    pl.col("^.*or.*$"),  # you can use regex! cool
    pl.col("weight") / 1000,  # performing arithmatic
    "is_round",  # just plain ole string - can't do anything to it, but less typing
)

name,color,origin,weight,is_round
str,str,str,f64,bool
"""Avocado""","""green""","""South America""",0.2,False
"""Banana""","""yellow""","""Asia""",0.12,False
"""Blueberry""","""blue""","""North America""",0.001,False
"""Cantaloupe""","""orange""","""Africa""",2.5,True
"""Cranberry""","""red""","""North America""",0.002,False
"""Elderberry""","""black""","""Europe""",0.001,False
"""Orange""","""orange""","""Asia""",0.13,True
"""Papaya""","""orange""","""South America""",1.0,False
"""Peach""","""orange""","""Asia""",0.15,True
"""Watermelon""","""green""","""Africa""",5.0,True


In [5]:
# creating new columns with expressions

fruit.with_columns(
    # can start a col witha  literal value- here its true and then name it whatever with alias
    pl.lit(True).alias("is_fruit"),
    is_berry=pl.col("name").str.ends_with("berry"),
)

name,weight,color,is_round,origin,is_fruit,is_berry
str,i64,str,bool,str,bool,bool
"""Avocado""",200,"""green""",False,"""South America""",True,False
"""Banana""",120,"""yellow""",False,"""Asia""",True,False
"""Blueberry""",1,"""blue""",False,"""North America""",True,True
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",True,False
"""Cranberry""",2,"""red""",False,"""North America""",True,True
"""Elderberry""",1,"""black""",False,"""Europe""",True,True
"""Orange""",130,"""orange""",True,"""Asia""",True,False
"""Papaya""",1000,"""orange""",False,"""South America""",True,False
"""Peach""",150,"""orange""",True,"""Asia""",True,False
"""Watermelon""",5000,"""green""",True,"""Africa""",True,False


Filtering based on an expression

In [6]:
fruit.filter(
    pl.col("name").str.ends_with("a")
    | (pl.col("color") == "black")
    | (pl.col("weight"))
)

InvalidOperationError: filter predicate must be of type `Boolean`, got `Int64`

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'filter' failed <---
FILTER [([(col("name").str.ends_with([String(a)])) | ([(col("color")) == (String(black))])]) | (col("weight"))] FROM
  DF ["name", "weight", "color", "is_round"]; PROJECT */5 COLUMNS

In [None]:
fruit.group_by(pl.col("origin").str.split(" ").list.last()).agg(
    pl.len(), average_weight=pl.col("weight").mean()
)

origin,len,average_weight
str,u32,f64
"""Europe""",1,1.0
"""Africa""",2,3750.0
"""America""",4,300.75
"""Asia""",3,133.333333


In [None]:
fruit.with_columns(huh=pl.col("origin").str.split(" ").list.last())

name,weight,color,is_round,origin,huh
str,i64,str,bool,str,str
"""Avocado""",200,"""green""",False,"""South America""","""America"""
"""Banana""",120,"""yellow""",False,"""Asia""","""Asia"""
"""Blueberry""",1,"""blue""",False,"""North America""","""America"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","""Africa"""
"""Cranberry""",2,"""red""",False,"""North America""","""America"""
"""Elderberry""",1,"""black""",False,"""Europe""","""Europe"""
"""Orange""",130,"""orange""",True,"""Asia""","""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America""","""America"""
"""Peach""",150,"""orange""",True,"""Asia""","""Asia"""
"""Watermelon""",5000,"""green""",True,"""Africa""","""Africa"""


In [None]:
fruit.sort(
    pl.col("name").str.len_bytes(),
    descending=True,
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Elderberry""",1,"""black""",False,"""Europe"""
"""Watermelon""",5000,"""green""",True,"""Africa"""
"""Blueberry""",1,"""blue""",False,"""North America"""
"""Cranberry""",2,"""red""",False,"""North America"""
"""Avocado""",200,"""green""",False,"""South America"""
"""Banana""",120,"""yellow""",False,"""Asia"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""


In [None]:
pl.DataFrame({"a": [1, 2, 3], "b": [0.4, 0.5, 0.6]}).with_columns(
    pl.all().mul(6).name.suffix("_times_6")
)

a,b,a_times_6,b_times_6
i64,f64,i64,f64
1,0.4,6,2.4
2,0.5,12,3.0
3,0.6,18,3.6


expressions

In [None]:
is_orange = (pl.col("color") == "orange").alias("is_orange")

fruit.with_columns(is_orange)
fruit.filter(is_orange)
fruit.group_by(is_orange).len()

is_orange,len
bool,u32
True,4
False,6


In [10]:
flowers = pl.DataFrame(
    {
        "name": ["Tiger lili", "Blue flag", "African marigold"],
        "latin": ["Lilium columbianum", "Iris versicolor", "Tagetes erecta"],
        "color": ["orange", "purple", "orange"],
    }
)

flowers.filter(is_orange)

name,latin,color
str,str,str
"""Tiger lili""","""Lilium columbianum""","""orange"""
"""African marigold""","""Tagetes erecta""","""orange"""


In [11]:
fruit.select(pl.col("color")).columns

['color']

In [None]:
# fruit.select(pl.col("is_smelly")).columns
fruit.select(pl.col("^.*or.*$")).columns
fruit.select(pl.all()).columns
fruit.select(pl.col(pl.String)).columns
fruit.select(pl.col(pl.Boolean, pl.Int64)).columns
fruit.select(pl.col(["name", "color"])).columns

['name', 'color']

literal values

In [18]:
pl.select(pl.lit(42))

literal
i32
42


In [20]:
pl.select(pl.lit(42).alias("answer"))
pl.select(planet=pl.lit(42))

planet
i32
42


In [21]:
fruit.with_columns(planet=pl.lit("Earth"))

name,weight,color,is_round,origin,planet
str,i64,str,bool,str,str
"""Avocado""",200,"""green""",False,"""South America""","""Earth"""
"""Banana""",120,"""yellow""",False,"""Asia""","""Earth"""
"""Blueberry""",1,"""blue""",False,"""North America""","""Earth"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","""Earth"""
"""Cranberry""",2,"""red""",False,"""North America""","""Earth"""
"""Elderberry""",1,"""black""",False,"""Europe""","""Earth"""
"""Orange""",130,"""orange""",True,"""Asia""","""Earth"""
"""Papaya""",1000,"""orange""",False,"""South America""","""Earth"""
"""Peach""",150,"""orange""",True,"""Asia""","""Earth"""
"""Watermelon""",5000,"""green""",True,"""Africa""","""Earth"""


In [None]:
fruit.with_columns(row_is_even=pl.lit([False, True]))

name,weight,color,is_round,origin,row_is_even
str,i64,str,bool,str,list[bool]
"""Avocado""",200,"""green""",False,"""South America""","[false, true]"
"""Banana""",120,"""yellow""",False,"""Asia""","[false, true]"
"""Blueberry""",1,"""blue""",False,"""North America""","[false, true]"
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","[false, true]"
"""Cranberry""",2,"""red""",False,"""North America""","[false, true]"
"""Elderberry""",1,"""black""",False,"""Europe""","[false, true]"
"""Orange""",130,"""orange""",True,"""Asia""","[false, true]"
"""Papaya""",1000,"""orange""",False,"""South America""","[false, true]"
"""Peach""",150,"""orange""",True,"""Asia""","[false, true]"
"""Watermelon""",5000,"""green""",True,"""Africa""","[false, true]"


In [None]:
pl.select(pl.repeat("Ella", 3).alias("umbrella"), pl.zeros(3), pl.ones(3))

umbrella,zeros,ones
str,f64,f64
"""Ella""",0.0,1.0
"""Ella""",0.0,1.0
"""Ella""",0.0,1.0


In [None]:
pl.select(start=pl.int_range(0, 5), end=pl.arange(0, 10, 2).pow(2)).with_columns(
    int_range=pl.int_ranges("start", "end")
).with_columns(range_length=pl.col("int_range").list.len())

start,end,int_range,range_length
i64,i64,list[i64],u32
0,0,[],0
1,4,"[1, 2, 3]",3
2,16,"[2, 3, … 15]",14
3,36,"[3, 4, … 35]",33
4,64,"[4, 5, … 63]",60


rename expression

In [28]:
df = pl.DataFrame({"text": "value", "An integer": 5040, "BOOLEAN": True})
df

text,An integer,BOOLEAN
str,i64,bool
"""value""",5040,True


In [31]:
df.select(
    pl.col("text").name.to_uppercase(),
    pl.col("An integer").alias("int"),
    pl.col("BOOLEAN").name.to_lowercase(),
)

TEXT,int,boolean
str,i64,bool
"""value""",5040,True



optimal

In [33]:
fruit.lazy().filter((pl.col("weight") > 1000) & pl.col("is_round")).with_columns(
    is_berry=pl.col("name").str.ends_with("berry")
).collect()

name,weight,color,is_round,origin,is_berry
str,i64,str,bool,str,bool
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",False
"""Watermelon""",5000,"""green""",True,"""Africa""",False


suboptimal