# Groupby

Using a python `lambda` on groups will kill parallelization due to running Python code and therefore being subject to the Python Global Interpreter Lock. This notebook is about approaches to get around this in Polars.

In [1]:
import polars as pl
from polars import col

Load the [US congress dataset](https://github.com/unitedstates/congress-legislators).

In [2]:
dtypes = {
    "first_name": pl.Categorical,
    "gender": pl.Categorical,
    "type": pl.Categorical,
    "state": pl.Categorical,
    "party": pl.Categorical,
}
url = 'https://theunitedstates.io/congress-legislators/legislators-current.csv'
dataset = pl.read_csv(url, dtype=dtypes).with_column(pl.col("birthday").str.strptime(pl.Date, strict=False))

  dataset = pl.read_csv(url, dtype=dtypes).with_column(pl.col("birthday").str.strptime(pl.Date, strict=False))


In [3]:
dataset.head()

last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,district,senate_class,party,url,address,phone,contact_form,rss_url,twitter,facebook,youtube,youtube_id,bioguide_id,thomas_id,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
str,cat,str,str,str,str,date,cat,cat,cat,i64,i64,cat,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,i64,i64,str,str,i64,str
"""Brown""","""Sherrod""",,,,"""Sherrod Brown""",1952-11-09,"""M""","""sen""","""OH""",,1,"""Democrat""","""https://www.br...","""503 Hart Senat...","""202-224-2315""","""https://www.br...","""http://www.bro...","""SenSherrodBrow...","""SenatorSherrod...","""SherrodBrownOh...","""UCgy8jfERh-t_i...","""B000944""",136,"""N00003535""","""S307""","""H2OH13033,S6OH...",5051,400050,27018,"""Sherrod Brown""",,29389,"""Sherrod Brown"""
"""Cantwell""","""Maria""",,,,"""Maria Cantwell...",1958-10-13,"""F""","""sen""","""WA""",,1,"""Democrat""","""https://www.ca...","""511 Hart Senat...","""202-224-3441""","""https://www.ca...","""http://www.can...","""SenatorCantwel...","""senatorcantwel...","""SenatorCantwel...","""UCN52UDqKgvHRk...","""C000127""",172,"""N00007836""","""S275""","""S8WA00194,H2WA...",26137,300018,27122,"""Maria Cantwell...",,39310,"""Maria Cantwell..."
"""Cardin""","""Benjamin""","""L.""",,,"""Benjamin L. Ca...",1943-10-05,"""M""","""sen""","""MD""",,1,"""Democrat""","""https://www.ca...","""509 Hart Senat...","""202-224-4524""","""https://www.ca...","""http://www.car...","""SenatorCardin""","""senatorbencard...","""senatorcardin""","""UCiQaJnMzlfzzG...","""C000141""",174,"""N00001955""","""S308""","""H6MD03177,S6MD...",4004,400064,26888,"""Ben Cardin""",,15408,"""Ben Cardin"""
"""Carper""","""Thomas""","""Richard""",,,"""Thomas R. Carp...",1947-01-23,"""M""","""sen""","""DE""",,1,"""Democrat""","""https://www.ca...","""513 Hart Senat...","""202-224-2441""","""https://www.ca...","""http://www.car...","""SenatorCarper""","""tomcarper""","""senatorcarper""","""UCgLnvbKwu4B3n...","""C000174""",179,"""N00012508""","""S277""","""S8DE00079""",663,300019,22421,"""Tom Carper""",,15015,"""Tom Carper"""
"""Casey""","""Robert""","""P.""","""Jr.""","""Bob""","""Robert P. Case...",1960-04-13,"""M""","""sen""","""PA""",,1,"""Democrat""","""https://www.ca...","""393 Russell Se...","""202-224-6324""","""https://www.ca...","""http://www.cas...","""SenBobCasey""","""SenatorBobCase...","""SenatorBobCase...","""UCtVssXhx-KuZa...","""C001070""",1828,"""N00027503""","""S309""","""S6PA00217""",47036,412246,2541,"""Bob Casey, Jr....",,40703,"""Bob Casey Jr."""


You can chain together as many aggregations as you like:

In [4]:
q = (
    dataset.lazy()
    .groupby("first_name")
    .agg([
        pl.count(),
        col("gender").list(),
        pl.first("last_name"),
    ])
    .sort("count", reverse=True)
    .limit(5)
)

q.collect()

first_name,count,gender,last_name
cat,u32,list[cat],str
"""John""",19,"[""M"", ""M"", ... ""M""]","""Barrasso"""
"""Mike""",14,"[""M"", ""M"", ... ""M""]","""Kelly"""
"""Michael""",11,"[""M"", ""M"", ... ""M""]","""Bennet"""
"""David""",10,"[""M"", ""M"", ... ""M""]","""Cicilline"""
"""James""",9,"[""M"", ""M"", ... ""M""]","""Inhofe"""


In [5]:
q = (
    dataset.lazy()
    .groupby("state")
    .agg([
        (col("party") == "Republican").sum().alias("dem"),
        (col("party") == "Democrat").sum().alias("rep"),
    ])
    .sort("rep", reverse=True)
    .limit(5)
)

q.collect()

state,dem,rep
cat,u32,u32
"""CA""",11,44
"""NY""",8,21
"""IL""",5,15
"""TX""",26,12
"""NJ""",2,12


You can filter groups

In [6]:
q = (
    dataset.lazy()
    .groupby(["state", "party"])
    .agg([pl.count("party").alias("count")])
    .filter((pl.col("party") == "Democrat") | (pl.col("party") == "Republican"))  # only show some groups
    .sort("count", reverse=True)
    .limit(5)
)

q.collect()

state,party,count
cat,cat,u32
"""CA""","""Democrat""",44
"""TX""","""Republican""",26
"""NY""","""Democrat""",21
"""FL""","""Republican""",18
"""IL""","""Democrat""",15


You can safely use functions for clarity, as long as they return Polars expressions

In [7]:
from datetime import datetime

def compute_age() -> pl.Expr:
    return datetime.now().year - col("birthday").dt.year()

def avg_birthday(gender: str) -> pl.Expr:
    return compute_age().filter(col("gender") == gender).mean().alias(f"avg {gender} birthday")

q = (
    dataset.lazy()
    .groupby(["state"])
    .agg([
        avg_birthday("M"),
        avg_birthday("F"),
        (col("gender") == "M").sum().alias("# male"),
        (col("gender") == "F").sum().alias("# female"),
    ])
    .sort("# female", reverse=True)
    .limit(5)
)

q.collect()

state,avg M birthday,avg F birthday,# male,# female
cat,f64,f64,u32,u32
"""CA""",58.323529,68.904762,34,21
"""NY""",54.947368,53.7,19,10
"""FL""",58.789474,57.0,19,9
"""WA""",57.25,58.0,4,8
"""TX""",58.066667,62.25,30,8


## Sorting

It is common to sort a dataframe for the purpose of managing ordering during a groupby.

We can also sort within the context of a particular group after grabbing the youngest/oldest.

In [8]:
def get_person() -> pl.Expr:
    return col("first_name") + pl.lit(" ") + col("last_name")

[This example](https://pola-rs.github.io/polars-book/user-guide/dsl/groupby.html#sorting) doesn't seem to work as I expect...

In [9]:
q = (
    dataset.lazy()
    .sort("birthday")
    .groupby(["state"])
    .agg([
        get_person().first().alias("youngest"),  # uses the global sort-by 
        get_person().first().alias("oldest"),  # uses the global sort-by
        get_person().sort().last().alias("alphabetical_first"),  
        # I'd expect this to give the gender of the first person. TODO: WTF
        col("gender").sort_by("first_name").last().alias("alphabetical_first_gender")  
    ])
    # I'd expect this to return the states in alphabetical order at the very end. TODO: WTF
    .sort("state", reverse=False)  
    .limit(5)
)

q.collect()

state,youngest,oldest,alphabetical_first,alphabetical_first_gender
cat,str,str,str,cat
"""OH""","""Marcy Kaptur""","""Marcy Kaptur""","""Warren Davidso...","""F"""
"""WA""","""Patty Murray""","""Patty Murray""","""Suzan DelBene""","""F"""
"""MD""","""Steny Hoyer""","""Steny Hoyer""","""Steny Hoyer""","""M"""
"""DE""","""Thomas Carper""","""Thomas Carper""","""Thomas Carper""","""F"""
"""PA""","""Mike Kelly""","""Mike Kelly""","""Susan Wild""","""M"""


Thomas Carper is not a woman!

In [10]:
def filter_person(first_name, last_name) -> pl.Expr:
    return (col("first_name") == first_name) & (col("last_name") == last_name)

first_name, last_name = "Thomas", "Carper"  # is a Male!

dataset.select([
    col("first_name").filter(filter_person(first_name, last_name)),
    col("last_name").filter(filter_person(first_name, last_name)),
    col("gender").filter(filter_person(first_name, last_name)),
    col("state").filter(filter_person(first_name, last_name)),
])

first_name,last_name,gender,state
cat,str,cat,cat
"""Thomas""","""Carper""","""M""","""DE"""
