In [None]:
import polars as pl

# Demo frame used by the next cell: two integer columns ("A", "B") and two
# string columns ("fruits", "cars"). Written as plain Python (no doctest
# `>>>` / `...` prompts) so the cell also runs outside IPython.
df = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)

In [None]:
# Sort by "fruits", then build several derived columns in a single select.
# Written without doctest `...` continuation prompts so it is valid Python.
df.sort("fruits").select(
    [
        "fruits",
        "cars",
        # a literal string value, not a reference to the "fruits" column
        pl.lit("fruits").alias("literal_string_fruits"),
        # sum of "B" restricted to rows where cars == "beetle"
        pl.col("B").filter(pl.col("cars") == "beetle").sum(),
        # sum of "A" over rows with B > 2, computed per "cars" group
        pl.col("A").filter(pl.col("B") > 2).sum().over("cars").alias("sum_A_by_cars"),
        # sum of "A" per "fruits" group
        pl.col("A").sum().over("fruits").alias("sum_A_by_fruits"),
        # "A" reversed within each "fruits" group
        pl.col("A").reverse().over("fruits").alias("rev_A_by_fruits"),
        # "A" ordered by "B" within each "fruits" group
        pl.col("A").sort_by("B").over("fruits").alias("sort_A_by_B_by_fruits"),
    ]
)

In [None]:
# Eager query: load the iris CSV, keep rows with sepal_length > 5, and sum
# every column per species (groups kept in first-seen order).
# NOTE: this rebinds `df`, replacing the frame defined in the earlier cell.
df = pl.read_csv("https://j.mp/iriscsv")
print(
    df.filter(pl.col("sepal_length") > 5)
    .groupby("species", maintain_order=True)
    .agg(pl.all().sum())
)

## User Guide:
  * https://pola-rs.github.io/polars-book/user-guide/quickstart/intro.html

In [None]:
# Same filter / groupby / sum as the eager version, but built through the
# lazy API: .lazy() starts a lazy query and .collect() executes it.
print(
    pl.read_csv("https://j.mp/iriscsv")
    .lazy()
    .filter(pl.col("sepal_length") > 5)
    .groupby("species", maintain_order=True)
    .agg(pl.all().sum())
    .collect()
)

In [None]:
# Lazy query built from the existing `df` instead of re-reading the CSV.
# Relies on `df` still being the iris frame loaded in an earlier cell.
(
    df.lazy()
    .filter(pl.col("sepal_length") > 5)
    .groupby("species", maintain_order=True)
    .agg(pl.all().sum())
    .collect()
)

## Selecting data
  * https://pola-rs.github.io/polars-book/user-guide/howcani/selecting_data/selecting_data_expressions.html

In [None]:
# Three-row frame (id, color, size) used by the filter/select cells below.
# Rebinds `df`.
df = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "color": ["blue", "red", "green"],
        "size": ["small", "medium", "large"],
    }
)
print(df)


In [None]:
# Keep only the rows whose "id" is at most 2.
filter_df = df.filter(pl.col("id") <= 2)
print(filter_df)


In [None]:
# Combine two conditions with `&`; each comparison is parenthesised because
# `&` binds tighter than `<=` / `==` in Python.
multi_filter_df = df.filter((pl.col("id") <= 2) & (pl.col("size") == "small"))
print(multi_filter_df)


### Selecting columns with the select method

In [None]:
# Select a single column by passing its name as a string.
single_select_df = df.select("id")
print(single_select_df)


In [None]:
# Select several columns by passing a list of column names.
list_select_df = df.select(["id", "color"])
print(list_select_df)


In [None]:
# Same two-column selection, spelled with explicit column expressions
# instead of bare column-name strings.
list_select_df = df.select([pl.col("id"), pl.col("color")])
print(list_select_df)


In [None]:
# Select columns by data type: pl.col() is given a dtype (Int64) rather
# than a column name.
dtype_select_df = df.select(pl.col(pl.Int64))
print(dtype_select_df)


### Selecting rows and columns

In [None]:
# Chain a row filter (id <= 2) with a column selection (id, color).
expression_df = df.filter(pl.col("id") <= 2).select(["id", "color"])
print(expression_df)


In [None]:
# Rebind `df` to the fruits/cars demo frame, now with an extra "optional"
# column that contains a missing value (None).
df = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        "optional": [28, 300, None, 2, -30],
    }
)
# Bare last expression: the notebook displays the frame.
df


In [None]:
# Columns can be selected by name.
df.select(
    [
        pl.col("A"),  # explicit column expression
        "B",  # a bare string is interpreted as a column name
        pl.lit("B"),  # pl.lit marks "B" as a literal value, not a column
        pl.col("fruits"),
    ]
)


## Regex -- columns whose name starts with A or ends with B

In [None]:
# A column name is treated as a regular expression when it starts with '^'
# and ends with '$'. The pattern below is an alternation: it matches names
# that start with "A" or end with "B". Each matching column is summed.
df.select([pl.col("^A|B$").sum()])


In [None]:
# First every column in its normal order, then every column again with its
# values reversed; the reversed copies get a "_reverse" name suffix.
df.select(
    [
        pl.all(),
        pl.all().reverse().suffix("_reverse"),
    ]
)


### Folds
Polars provides expressions/methods for horizontal aggregations like sum, min, mean, etc. by setting the argument axis=1. However, when you need a more complex aggregation the default methods provided by the Polars library may not be sufficient. That's when folds come in handy.

The Polars fold expression operates on columns for maximum speed. It utilizes the data layout very efficiently and often has vectorized execution.

Let's start with an example by implementing the sum operation ourselves, with a fold.

In [None]:
# Two-column numeric frame for the fold example. Rebinds `df`.
df = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [10, 20, 30],
    }
)

# Hand-written horizontal sum: start from the literal 0 and add each column
# selected by pl.col("*") to the accumulator in turn.
out = df.select(
    pl.fold(acc=pl.lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("sum"),
)
print(out)


In [None]:
# Rebinds `df` with a frame where only some rows have every value > 1.
df = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [0, 1, 2],
    }
)

# Fold used as a row predicate: start from True and AND together the
# per-column comparison `col > 1`, so a row is kept only when the
# comparison holds in every column.
out = df.filter(
    pl.fold(
        acc=pl.lit(True),
        f=lambda acc, x: acc & x,
        exprs=pl.col("*") > 1,
    )
)
print(out)


In [None]:
# Load the congress-legislators CSV, reading the low-cardinality text
# columns as Categorical and parsing "birthday" into a Date
# (strict=False, so the parse is non-strict).
# NOTE(review): the later cells filter on the parties "Anti-Administration"
# and "Pro-Administration"; confirm those values exist in the *current*
# legislators file, or whether legislators-historical.csv was intended.
url = "https://theunitedstates.io/congress-legislators/legislators-current.csv"
dtypes = dict.fromkeys(
    ["first_name", "gender", "type", "state", "party"],
    pl.Categorical,
)

dataset = pl.read_csv(url, dtypes=dtypes).with_column(
    pl.col("birthday").str.strptime(pl.Date, strict=False)
)


In [None]:
# `dataset` comes from the previous cell (not from a local module import).
#from .dataset import dataset

# Per first_name: number of rows, the list of gender values, and the first
# last_name. Sorted by the row count in descending order, top 5 kept.
q = (
    dataset.lazy()
    .groupby("first_name")
    .agg(
        [
            pl.count(),
            pl.col("gender").list(),
            pl.first("last_name"),
        ]
    )
    .sort("count", reverse=True)
    .limit(5)
)

# Execute the lazy query; the result is stored in `df` but not displayed here.
df = q.collect()

In [None]:
# Per state: count the rows whose party equals each of the two names by
# summing the boolean comparison. Sorted by the "pro" count, descending,
# top 5 kept.
q = (
    dataset.lazy()
    .groupby("state")
    .agg(
        [
            (pl.col("party") == "Anti-Administration").sum().alias("anti"),
            (pl.col("party") == "Pro-Administration").sum().alias("pro"),
        ]
    )
    .sort("pro", reverse=True)
    .limit(5)
)

# Execute the lazy query and rebind `df` to its result.
df = q.collect()

In [None]:
# Display the first five rows of the collected result.
df.head(5)

In [None]:
# Alternative formulation: group by (state, party), count rows per group,
# then keep only the groups for the two parties of interest. Sorted by
# count, descending, top 5 kept.
q = (
    dataset.lazy()
    .groupby(["state", "party"])
    .agg([pl.count("party").alias("count")])
    .filter((pl.col("party") == "Anti-Administration") | (pl.col("party") == "Pro-Administration"))
    .sort("count", reverse=True)
    .limit(5)
)

# Execute the lazy query and rebind `df` to its result.
df = q.collect()

In [None]:
# Display the first rows of the collected result (default row count).
df.head()

In [None]:
import polars as pl

# Load a CSV with information about pokemon from a pinned gist revision.
# Rebinds `df`; the window-function cells below operate on this frame.
df = pl.read_csv(
    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
)


In [None]:
# Display the first five rows of the pokemon frame.
df.head(5)

In [None]:
# Window functions: every aggregate below is computed per group via
# .over(...), except the last one, which has no .over().
out = df.select(
    [
        "Type 1",
        "Type 2",
        # mean Attack per "Type 1" group
        pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"),
        # mean Defense per ("Type 1", "Type 2") combination
        pl.col("Defense").mean().over(["Type 1", "Type 2"]).alias("avg_defense_by_type_combination"),
        # mean Attack over the whole frame (no window)
        pl.col("Attack").mean().alias("avg_attack"),
    ]
)


In [None]:
# Display the first five rows of the window-function result.
out.head(5)

In [None]:
# Keep the rows whose "Type 2" is "Psychic" and project three columns.
filtered = df.filter(pl.col("Type 2") == "Psychic").select(
    [
        "Name",
        "Type 1",
        "Speed",
    ]
)
print(filtered)


In [None]:
# Sort "Name" and "Speed" in descending order within each "Type 1" group.
# NOTE(review): the sort is applied to each column separately, so a name
# presumably no longer lines up with its own speed afterwards - confirm
# this is the intended demonstration.
out = filtered.with_columns(
    [
        pl.col(["Name", "Speed"]).sort(reverse=True).over("Type 1"),
    ]
)
print(out)


In [None]:
# Catalogue of window-expression shapes. The expressions are only
# constructed here, never applied to a frame (the column names "foo",
# "groups", "x" and "y" are placeholders); only the last one becomes the
# cell's displayed value.

# aggregate and broadcast within a group
# output type: -> Int32
pl.sum("foo").over("groups")

# sum within a group and multiply with group elements
# output type: -> Int32
(pl.col("x").sum() * pl.col("y")).over("groups")

# sum within a group and multiply with group elements
# and aggregate the group to a list
# (note that this requires an explicit `list()` call)
# output type: -> List(Int32)
(pl.col("x").sum() * pl.col("y")).list().over("groups")

# sum within a group and multiply with group elements
# and aggregate the group to a list
# the flatten call explodes that list

# This is the fastest method to do things over groups when the groups are sorted
(pl.col("x").sum() * pl.col("y")).list().over("groups").flatten()


In [None]:
import polars as pl
import pandas as pd

In [None]:
# Toy pandas frame. "size" holds, for every row, the number of rows in that
# row's "c" group (len of the group, broadcast back by transform).
df = pd.DataFrame(
    {
        "type": ["m", "n", "o", "m", "m", "n", "n"],
        "c": [1, 1, 1, 2, 2, 2, 2],
    }
).assign(size=lambda frame: frame.groupby("c")["type"].transform(len))

In [None]:
# Convert the pandas frame (including its "size" column) to a polars frame.
pl_df = pl.from_pandas(df)

In [None]:
# Display the first rows of the converted frame.
pl_df.head()

In [None]:
# Re-run the pandas -> polars conversion (same call as the earlier cell).
pl_df = pl.from_pandas(df)
# Disabled draft of the per-group count; a working version appears in a
# later cell.
#pl_df.select([
#    pl.all(),
#    pl.col("type").count().over("c").alias("size")
#])


In [None]:
# Show the column names of the polars frame.
pl_df.columns

In [None]:
# Pull out the "type" column by name.
pl_df['type']

In [None]:
# Polars counterpart of the pandas groupby/transform above: keep every
# column and add "typeCount", the count of "type" within each "c" group.
pl_df.select(
    [
        pl.all(),
        pl.col("type").count().over("c").alias("typeCount"),
    ]
)

In [13]:
import polars as pl

# to enrich the examples in this quickstart with dates
from datetime import datetime, timedelta, date
# to generate data for the examples
import numpy as np 


In [14]:
# Three small frames sharing the columns "dt" (dates), "x" and "y".
# The "dt" values are in a different order in df2, and df3 lacks
# 2022-09-01, which is what makes them useful for the alignment cells below.
df1 = pl.DataFrame(
    {
        "dt": [date(2022, 9, 1), date(2022, 9, 2), date(2022, 9, 3)],
        "x": [3.5, 4.0, 1.0],
        "y": [10.0, 2.5, 1.5],
    }
)
df2 = pl.DataFrame(
    {
        "dt": [date(2022, 9, 2), date(2022, 9, 3), date(2022, 9, 1)],
        "x": [8.0, 1.0, 3.5],
        "y": [1.5, 12.0, 5.0],
    }
)
df3 = pl.DataFrame(
    {
        "dt": [date(2022, 9, 3), date(2022, 9, 2)],
        "x": [2.0, 5.0],
        "y": [2.5, 2.0],
    }
)  

# Disabled table-formatting experiments:
#pl.Config.set_tbl_formatting("UTF8_FULL")  
#pl.Config.set_tbl_formatting("ASCII_MARKDOWN")  
#pl.Config.set_tbl_formatting("NOTHING")  

# Disabled config reset and "dt" casting experiments:
#cfg = pl.Config.restore_defaults()  
#df1 = df1.with_column(pl.col('dt').str.strptime(pl.Date, fmt='%y-%m-%d').cast(pl.Datetime))
#df1 = df1.with_column(
#    pl.col("dt").cast(pl.Date)
#)

#df1 = df1.with_column(pl.col("dt").cast(pl.Datetime, strict=False))

# Display the first rows of df1.
df1.head()




dt,x,y
date,f64,f64
2022-09-01,3.5,10.0
2022-09-02,4.0,2.5
2022-09-03,1.0,1.5


In [15]:
# Align the three frames on their shared "dt" column.
af1, af2, af3 = pl.align_frames(df1, df2, df3, on="dt")

In [16]:
# Align on "dt" again, this time returning only the "x" and "y" columns
# from each aligned frame. Rebinds af1, af2 and af3.
af1, af2, af3 = pl.align_frames(df1, df2, df3, on="dt", select=["x", "y"])

In [17]:
# Multiply the three aligned frames element-wise, replace nulls with 0, and
# sum the columns into a single "dot" column (the recorded output shows one
# value per aligned row).
(
    (af1 * af2 * af3)
    .fill_null(0)
    .select(pl.sum(pl.col("*")).alias("dot"))
)

dot
f64
0.0
167.5
47.0


In [None]:
#import polars as pl

dataset = pl.DataFrame({"date": ["2020-01-02", "2020-01-03", "2020-01-04"], "index": [1, 2, 3]})

q = dataset.lazy().with_column(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))

df = q.collect()
df.head()

In [None]:
# Display the first rows of df2.
df2.head()

In [None]:
# Display df3 in full (it has only two rows).
df3

In [None]:
# align frames by the "dt" column:
af1, af2, af3 = pl.align_frames(
    df1, df2, df3, on="dt"
)

# Done - ZZZZZZ

## Query optimization
In lazy mode the query optimizer may be able to optimize the query based on the expressions.

In this example we scan a CSV file with many columns using scan_csv and then select a subset of them. The query optimizer creates a query plan that causes only the selected columns to be read from the CSV - see how the Project part of the query plan below states that only 1 of 2 columns will be read:

In [None]:
# Lazily scan the CSV and select only the "Date" column; nothing is read
# until the query is executed. The path is relative to the notebook's
# working directory.
lazy_select_df = pl.scan_csv("data/appleStock.csv").select(["Date"])
# Print the optimized query plan for the lazy query.
print(lazy_select_df.describe_optimized_plan())