# Chapter 8: Continuing Expressions

In [1]:
import polars as pl
pl.show_versions()

--------Version info---------
Polars:               0.20.31
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.9 (main, Apr  2 2024, 16:11:47) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.4
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.2
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
torch:                <not installed>
xlsx2csv:             0.8.2
xlsxwriter:           3.2.0


In [2]:
import math
import numpy as np

print(f"{math.pi=}")
rng = np.random.default_rng(1729)
print(f"{rng.random()=}")

math.pi=3.141592653589793
rng.random()=0.03074202960516803


## Types of Operations

### Example A: Element-Wise Operations

In [5]:
penguins = (
    pl.read_csv("data/penguins.csv", null_values="NA")
    .select(
        "species",
        "island",
        "sex",
        "year",
        pl.col("body_mass_g").alias("mass") / 1000
    ))
penguins.with_columns(
    pl.col("mass").sqrt().alias("mass_sqrt"),  # <1>
    pl.col("mass").interpolate().alias("mass_filled")  # <2>
)

species,island,sex,year,mass,mass_sqrt,mass_filled
str,str,str,i64,f64,f64,f64
"""Adelie""","""Torgersen""","""male""",2007,3.75,1.936492,3.75
"""Adelie""","""Torgersen""","""female""",2007,3.8,1.949359,3.8
"""Adelie""","""Torgersen""","""female""",2007,3.25,1.802776,3.25
"""Adelie""","""Torgersen""",,2007,,,3.35
"""Adelie""","""Torgersen""","""female""",2007,3.45,1.857418,3.45
…,…,…,…,…,…,…
"""Chinstrap""","""Dream""","""male""",2009,4.0,2.0,4.0
"""Chinstrap""","""Dream""","""female""",2009,3.4,1.843909,3.4
"""Chinstrap""","""Dream""","""male""",2009,3.775,1.942936,3.775
"""Chinstrap""","""Dream""","""male""",2009,4.1,2.024846,4.1


### Example B: Operations that Summarize to One

In [7]:
penguins.select(
    pl.col("mass").mean(),
    pl.col("island").mode().first()  # <1>
)

mass,island
f64,str
4.201754,"""Biscoe"""


### Example C: Operations that Summarize to One or More

In [9]:
penguins.select(
    pl.col("island").unique()
)

island
str
"""Biscoe"""
"""Torgersen"""
"""Dream"""


### Example D: Operations that Extend

In [11]:
penguins.select(
    pl.col("species")
    .unique()  # <1>
    .repeat_by(3000)  # <2>
    .explode()  # <3>
    .extend_constant("Saiyan", n=1)  # <4>
)

species
str
"""Adelie"""
"""Adelie"""
"""Adelie"""
"""Adelie"""
"""Adelie"""
…
"""Chinstrap"""
"""Chinstrap"""
"""Chinstrap"""
"""Chinstrap"""


## Element-Wise Operations

### Operations That Perform Mathematical Transformations

In [14]:
(
    pl.DataFrame({"x": [-2, 0, 0.5, 1, math.e, 1000]})
    .with_columns(
        abs=pl.col("x").abs(),
        exp=pl.col("x").exp(),
        log2=pl.col("x").log(2),  # <1>
        log10=pl.col("x").log10(),
        log1p=pl.col("x").log1p(),
        sign=pl.col("x").sign(),
        sqrt=pl.col("x").sqrt(),
    )
)

x,abs,exp,log2,log10,log1p,sign,sqrt
f64,f64,f64,f64,f64,f64,i64,f64
-2.0,2.0,0.135335,,,,-1,
0.0,0.0,1.0,-inf,-inf,0.0,0,0.0
0.5,0.5,1.648721,-1.0,-0.30103,0.405465,1,0.707107
1.0,1.0,2.718282,0.0,0.0,0.693147,1,1.0
2.718282,2.718282,15.154262,1.442695,0.434294,1.313262,1,1.648721
1000.0,1000.0,inf,9.965784,3.0,6.908755,1,31.622777


### Operations Related to Trigonometry

In [16]:
(
    pl.DataFrame({"x": [-math.pi, 0, 1, math.pi, 2*math.pi, 90, 180, 360]})
    .with_columns(
        arccos=pl.col("x").arccos(),  # <1>
        cos=pl.col("x").cos(),
        degrees=pl.col("x").degrees(),
        radians=pl.col("x").radians(),
        sin=pl.col("x").sin(),
    )
)

x,arccos,cos,degrees,radians,sin
f64,f64,f64,f64,f64,f64
-3.141593,,-1.0,-180.0,-0.054831,-1.2246e-16
0.0,1.570796,1.0,0.0,0.0,0.0
1.0,0.0,0.540302,57.29578,0.017453,0.841471
3.141593,,-1.0,180.0,0.054831,1.2246e-16
6.283185,,1.0,360.0,0.109662,-2.4493e-16
90.0,,-0.448074,5156.620156,1.570796,0.893997
180.0,,-0.59846,10313.240312,3.141593,-0.801153
360.0,,-0.283691,20626.480625,6.283185,0.958916


### Operations That Round and Categorize

In [18]:
(
    pl.DataFrame({"x": [-6, -0.5, 0, 0.5, math.pi, 9.9, 9.99, 9.999]})
    .with_columns(
        ceil=pl.col("x").ceil(),
        clip=pl.col("x").clip(-1, 1),
        cut=pl.col("x").cut([-1, 1], labels=["bad", "neutral", "good"]),  # <1>
        floor=pl.col("x").floor(),
        qcut=pl.col("x").qcut([0.5], labels=["below median", "above median"]),
        round2=pl.col("x").round(2),
        round0=pl.col("x").round(0),  # <2>
    )
)

x,ceil,clip,cut,floor,qcut,round2,round0
f64,f64,f64,cat,f64,cat,f64,f64
-6.0,-6.0,-1.0,"""bad""",-6.0,"""below median""",-6.0,-6.0
-0.5,-0.0,-0.5,"""neutral""",-1.0,"""below median""",-0.5,-1.0
0.0,0.0,0.0,"""neutral""",0.0,"""below median""",0.0,0.0
0.5,1.0,0.5,"""neutral""",0.0,"""below median""",0.5,1.0
3.141593,4.0,1.0,"""good""",3.0,"""above median""",3.14,3.0
9.9,10.0,1.0,"""good""",9.0,"""above median""",9.9,10.0
9.99,10.0,1.0,"""good""",9.0,"""above median""",9.99,10.0
9.999,10.0,1.0,"""good""",9.0,"""above median""",10.0,10.0


### Operations for Missing or Infinite Values

In [20]:
x = [42, math.nan, None, math.inf, -math.inf]
(
    pl.DataFrame({"x": x})
    .with_columns(
        fill_nan=pl.col("x").fill_nan(999),
        fill_null=pl.col("x").fill_null(0),
        is_finite=pl.col("x").is_finite(),
        is_infinite=pl.col("x").is_finite(),
        is_nan=pl.col("x").is_nan(),
        is_null=pl.col("x").is_null(),
    )
)

x,fill_nan,fill_null,is_finite,is_infinite,is_nan,is_null
f64,f64,f64,bool,bool,bool,bool
42.0,42.0,42.0,True,True,False,False
,999.0,,False,False,True,False
,,0.0,,,,True
inf,inf,inf,False,False,False,False
-inf,-inf,-inf,False,False,False,False


In [21]:
(
    pl.DataFrame({"x": x})
    .with_columns(
        fill_both=pl.col("x").fill_nan(0).fill_null(0),
        is_either=(
            pl.col("x").is_nan() | pl.col("x").is_null()
        ),
    )
)

x,fill_both,is_either
f64,f64,bool
42.0,42.0,False
,0.0,True
,0.0,True
inf,inf,False
-inf,-inf,False


### Other Operations

In [23]:
(
    pl.DataFrame({"x": ["here", "there", "their", "they're"]})
    .with_columns(
        hash=pl.col("x").hash(seed=1337),  # <1>
        repeat_by=pl.col("x").repeat_by(3),
        replace=pl.col("x").replace({
            "here": "there",
            "they're": "they are",
        }),
    )
)

x,hash,repeat_by,replace
str,u64,list[str],str
"""here""",12695211751326448172,"[""here"", ""here"", ""here""]","""there"""
"""there""",17329794691236705436,"[""there"", ""there"", ""there""]","""there"""
"""their""",2663095961041830581,"[""their"", ""their"", ""their""]","""their"""
"""they're""",6743063676290245144,"[""they're"", ""they're"", ""they're""]","""they are"""


## Nonreducing Series-Wise Operations

### Operations That Accumulate

In [26]:
(
    pl.DataFrame({"x": [0, 1, 2, None, 2, np.NaN, -1, 2]})
    .with_columns(
        cum_count=pl.col("x").cum_count(),  # <1>
        cum_max=pl.col("x").cum_max(),
        cum_min=pl.col("x").cum_min(),
        cum_prod=pl.col("x").cum_prod(reverse=True),  # <2>
        cum_sum=pl.col("x").cum_sum(),
        diff=pl.col("x").diff(),
        pct_change=pl.col("x").pct_change(),
    )
)

x,cum_count,cum_max,cum_min,cum_prod,cum_sum,diff,pct_change
f64,u32,f64,f64,f64,f64,f64,f64
0.0,1,0.0,0.0,,0.0,,
1.0,2,1.0,0.0,,1.0,1.0,inf
2.0,3,2.0,0.0,,3.0,1.0,1.0
,3,,,,,,0.0
2.0,4,2.0,0.0,,5.0,,0.0
,5,2.0,0.0,,,,
-1.0,6,2.0,-1.0,-2.0,,,
2.0,7,2.0,-1.0,2.0,,3.0,-3.0


### Operations That Fill and Shift

In [28]:
(
    pl.DataFrame({"x": [-1, 0, 1, None, None, 3, 4, math.nan, 6]})
    .with_columns(
        backward_fill=pl.col("x").backward_fill(),  # <1>
        forward_fill=pl.col("x").forward_fill(limit=1),
        interp1=pl.col("x").interpolate(method="linear"),  # <2>
        interp2=pl.col("x").interpolate(method="nearest"),
        shift1=pl.col("x").shift(1),
        shift2=pl.col("x").shift(-2),
    )
)

x,backward_fill,forward_fill,interp1,interp2,shift1,shift2
f64,f64,f64,f64,f64,f64,f64
-1.0,-1.0,-1.0,-1.0,-1.0,,1.0
0.0,0.0,0.0,0.0,0.0,-1.0,
1.0,1.0,1.0,1.0,1.0,0.0,
,3.0,1.0,1.666667,1.0,1.0,3.0
,3.0,,2.333333,3.0,,4.0
3.0,3.0,3.0,3.0,3.0,,
4.0,4.0,4.0,4.0,4.0,3.0,6.0
,,,,,4.0,
6.0,6.0,6.0,6.0,6.0,,


### Operations Related to Duplicate Values

In [30]:
(
    pl.DataFrame({"x": ["A", "C", "D", "C"]})  # <1>
    .with_columns(
        is_duplicated=pl.col("x").is_duplicated(),
        is_first_distinct=pl.col("x").is_first_distinct(),
        is_last_distinct=pl.col("x").is_last_distinct(),
        is_unique=pl.col("x").is_unique(),
    )
)

x,is_duplicated,is_first_distinct,is_last_distinct,is_unique
str,bool,bool,bool,bool
"""A""",False,True,True,True
"""C""",True,True,False,False
"""D""",False,True,True,True
"""C""",True,False,True,False


### Operations That Compute Rolling Statistics

In [32]:
stock = (
    pl.read_csv("data/stock/nvda/2023.csv", try_parse_dates=True)
    .select("date", "close")
    .with_columns(
        ewm_mean=pl.col("close").ewm_mean(com=7, ignore_nulls=True),  # <1>
        rolling_mean=pl.col("close").rolling_mean(window_size=7),
        rolling_min=pl.col("close").rolling_min(window_size=7),
    )
)
stock

date,close,ewm_mean,rolling_mean,rolling_min
date,f64,f64,f64,f64
2023-01-03,143.149994,143.149994,,
2023-01-04,147.490005,145.464667,,
2023-01-05,142.649994,144.398755,,
2023-01-06,148.589996,145.664782,,
2023-01-09,156.279999,148.388917,,
…,…,…,…,…
2023-06-26,406.320007,407.54911,425.805716,406.320007
2023-06-27,418.76001,408.950473,424.695718,406.320007
2023-06-28,411.170013,409.227915,422.445718,406.320007
2023-06-29,408.220001,409.101926,418.180006,406.320007


In [33]:
from matplotlib.dates import DateFormatter
stock.plot.line(
    x="date",
    y=["close", "ewm_mean", "rolling_mean", "rolling_min"],
    xformatter=DateFormatter("%b %Y")
)

ValueError: ClassSelector parameter 'ElementPlot.xformatter' value must be an instance of (str, TickFormatter, function), not <matplotlib.dates.DateFormatter object at 0x15e61b7d0>.

:NdOverlay   [Variable]
   :Curve   [date]   (value)

### Operations That Sort

In [35]:
(
    pl.DataFrame({
        "x": [1, 3, None, 3, 7],
        "y": ["D", "I", "S", "C", "O"],
    })
    .with_columns(
        arg_sort=pl.col("x").arg_sort(),
        shuffle=pl.col("x").shuffle(seed=7),
        sort=pl.col("x").sort(nulls_last=True),
        sort_by=pl.col("x").sort_by("y"),
        reverse=pl.col("x").reverse(),
        rank=pl.col("x").rank(),
    )
)

x,y,arg_sort,shuffle,sort,sort_by,reverse,rank
i64,str,u32,i64,i64,i64,i64,f64
1.0,"""D""",2,1.0,1.0,3.0,7.0,1.0
3.0,"""I""",0,,3.0,1.0,3.0,2.5
,"""S""",1,3.0,3.0,3.0,,
3.0,"""C""",3,7.0,7.0,7.0,3.0,2.5
7.0,"""O""",4,3.0,,,1.0,4.0


### Other Operations

In [37]:
(
    pl.DataFrame({"x": [33, 33, 27, 33, 60, 60, 60, 33, 60]})
    .with_columns(
        rle_id=pl.col("x").rle_id(),
    )
)

x,rle_id
i64,u32
33,0
33,0
27,1
33,2
60,3
60,3
60,3
33,4
60,5


## Series-Wise Operations that Summarize to One

In [39]:
(
    pl.DataFrame({"x": [1, 3, 3, 7]})
    .with_columns(
        mean=pl.col("x").mean(),
    )
)

x,mean
i64,f64
1,3.5
3,3.5
3,3.5
7,3.5


In [40]:
(
    pl.DataFrame({
        "cluster": ["a", "a", "b", "b"],
        "x": [1, 3, 3, 7]
    })
    .group_by("cluster")
    .agg(
        mean=pl.col("x").mean(),
    )
)

cluster,mean
str,f64
"""a""",2.0
"""b""",5.0


### Operations That Are Quantifiers

In [42]:
(
    pl.DataFrame({
        "x": [True, False, False],
        "y": [True, True, True],
        "z": [False, False, False],
        })
    .select(
        pl.all().all().name.suffix("_all"),
        pl.all().any().name.suffix("_any"),
    )
)

x_all,y_all,z_all,x_any,y_any,z_any
bool,bool,bool,bool,bool,bool
False,True,False,True,True,False


### Operations That Compute Statistics

In [44]:
samples = rng.normal(loc=5, scale=3, size=1_000_000)

(
    pl.DataFrame({"x": samples})
    .select(
        max=pl.col("x").max(),
        mean=pl.col("x").mean(),
        quantile=pl.col("x").quantile(quantile=0.95),
        skew=pl.col("x").skew(),
        std=pl.col("x").std(),
        sum=pl.col("x").sum(),
        var=pl.col("x").var(),
    )
)

max,mean,quantile,skew,std,sum,var
f64,f64,f64,f64,f64,f64,f64
20.752443,4.994978,9.931565,0.003245,2.999926,4995000.0,8.999558


### Operations That Count

In [46]:
samples = pl.Series(rng.integers(low=0, high=10_000, size=1_729))
samples[403] = None  # <1>
df_ints = (
    pl.DataFrame({"x": samples})
    .with_row_index()  # <2>
)
df_ints.slice(400, 6)  # <3>

index,x
u32,i64
400,807.0
401,8634.0
402,2109.0
403,
404,1740.0
405,3333.0


In [47]:
df_ints.select(
    approx_n_unique=pl.col("x").approx_n_unique(),
    count=pl.col("x").count(),
    len=pl.col("x").len(),
    n_unique=pl.col("x").n_unique(),
    null_count=pl.col("x").null_count(),
)

approx_n_unique,count,len,n_unique,null_count
u32,u32,u32,u32,u32
1572,1728,1729,1575,1


### Other Operations

In [49]:
df_ints.select(
    arg_min=pl.col("x").arg_min(),
    first=pl.col("x").first(),
    get=pl.col("x").get(403),  # <1>
    implode=pl.col("x").implode(),
    last=pl.col("x").last(),
    upper_bound=pl.col("x").upper_bound(),
)

arg_min,first,get,implode,last,upper_bound
u32,i64,i64,list[i64],i64,i64
0,0,,"[0, 7245, … 3723]",3723,9223372036854775807


## Series-Wise Operations that Summarize to One or More

### Operations Related to Unique Values

In [52]:
(
    pl.DataFrame({"x": ["A", "C", "D", "C"]})
    .select(
        arg_unique=pl.col("x").arg_unique(),
        unique=pl.col("x").unique(maintain_order=True),  # <1>
        unique_counts=pl.col("x").unique_counts(),
        value_counts=pl.col("x").value_counts(),  # <2>
    )
)

arg_unique,unique,unique_counts,value_counts
u32,str,u32,struct[2]
0,"""A""",1,"{""D"",1}"
1,"""C""",2,"{""A"",1}"
2,"""D""",1,"{""C"",2}"


### Operations That Select

In [54]:
df_ints.select(
    bottom_k=pl.col("x").bottom_k(7),  # <1>
    head=pl.col("x").head(7),
    sample=pl.col("x").sample(7),
    slice=pl.col("x").slice(400, 7),
    gather=pl.col("x").gather([1, 1, 2, 3, 5, 8, 13]),
    gather_every=pl.col("x").gather_every(247),  # <2>
    top_k=pl.col("x").top_k(7),
)

bottom_k,head,sample,slice,gather,gather_every,top_k
i64,i64,i64,i64,i64,i64,i64
,0,343,807.0,7245,0,9998
0.0,7245,7095,8634.0,7245,8680,9988
1.0,5227,333,2109.0,5227,8483,9988
6.0,2747,6744,,2747,8358,9986
7.0,9816,6443,1740.0,2657,1805,9985
10.0,2657,2658,3333.0,5393,3638,9979
21.0,4578,8737,788.0,8203,5843,9975


### Operations That Drop Missing Values

In [56]:
x = [None, 1, 2, 3, np.NaN]
(
    pl.DataFrame({"x": x})
    .select(
        drop_nans=pl.col("x").drop_nans(),
        drop_nulls=pl.col("x").drop_nulls()
    )
)

drop_nans,drop_nulls
f64,f64
,1.0
1.0,2.0
2.0,3.0
3.0,


### Other Operations

In [58]:
numbers = [33, 33, 27, 33, 60, 60, 60, 33, 60]

(
    pl.DataFrame({"x": numbers})
    .select(
        arg_true=(pl.col("x") >= 60).arg_true(),  # <1>
    )
)

arg_true
u32
4
5
6
8


In [59]:
(
    pl.DataFrame({"x": numbers})
    .select(
        mode=pl.col("x").mode(),
    )
)

mode
i64
33
60


In [60]:
(
    pl.DataFrame({"x": numbers})
    .select(
        reshape=pl.col("x").reshape((3, 3)),  # <1>
    )
)

reshape
list[i64]
"[33, 33, 27]"
"[33, 60, 60]"
"[60, 33, 60]"


In [61]:
(
    pl.DataFrame({"x": numbers})
    .select(
        rle=pl.col("x").rle(),  # <1>
    )
)

rle
struct[2]
"{2,33}"
"{1,27}"
"{1,33}"
"{3,60}"
"{1,33}"
"{1,60}"


In [62]:
(
    pl.DataFrame({"x": numbers})
    .select(
        rle=pl.col("x").sort().search_sorted(42),  # <1>
    )
)

rle
u32
5


## Series-Wise Operations that Extend

In [64]:
(
    pl.DataFrame({
        "x": [["a", "b"], ["c", "d"]],
    })
    .select(
        explode=pl.col("x").explode()
    )
)

explode
str
"""a"""
"""b"""
"""c"""
"""d"""


## Conclusion