# Chapter 8: Grouping and Aggregates


In [1]:
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, URL, select, func
from sqlalchemy.orm import Session
import pandas as pd

from utils import print_sql_statement


# load_dotenv() runs first so the DB_* variables read below can come from a
# local .env file as well as from the real process environment.
load_dotenv()

_env = os.environ

# Every setting is mandatory: a missing variable raises KeyError right here
# instead of failing later when the first query is executed.
url_object = URL.create(
    drivername=_env["DB_ENGINE"],
    username=_env["DB_USER"],
    password=_env["DB_PASSWD"],
    host=_env["DB_HOST"],
    database=_env["DB_NAME"],
)

# Engine shared by every cell in this notebook.
engine = create_engine(url_object)

## Grouping by expressions

Count the number of employees hired by branch and year

In [2]:
from collections import Counter

from sqlalchemy import literal

from model import Branch, Employee


# Count employees hired per (branch, year) in three equivalent ways:
# raw SQL, a SQLAlchemy Core statement, and plain Python over ORM objects.
# All three order their rows by year first and branch name second.
with Session(engine) as session:

    # Using a raw query.  The branch alias is part of ORDER BY so that the
    # row order inside one year is deterministic and matches the two other
    # approaches below.
    df = pd.read_sql_query(
        """
        SELECT
            b.name branch
            , EXTRACT(YEAR FROM e.start_date) year
            , COUNT(*) how_many
        FROM employee
        e JOIN branch b ON e.assigned_branch_id = b.branch_id
        GROUP BY EXTRACT(YEAR FROM e.start_date), branch
        ORDER BY EXTRACT(YEAR FROM e.start_date) ASC, branch ASC
        """,
        con=session.connection()
    )

    # Using SQLAlchemy functions.  The hire-year expression is built once and
    # reused in the SELECT list, GROUP BY and ORDER BY.
    hire_year = func.extract("YEAR", Employee.start_date)
    statement = (
        select(
            Branch.name.label("branch"),
            hire_year.label("year"),
            # literal("*") is sent as a bound parameter, so this renders as
            # count(:param_1); counting a non-NULL constant counts every row
            # of the group.
            func.count(literal("*")).label("how_many")
        )
        .select_from(Employee)
        .join(
            Branch,
            Branch.branch_id == Employee.assigned_branch_id
        )
        .group_by(hire_year, Branch.name)
        .order_by(hire_year, Branch.name)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

    # Using the SQLAlchemy ORM relationships with native Python:
    # tally each (branch name, hire year) pair ...
    hires_per_group = Counter(
        (emp.employee_branch.name, emp.start_date.year)
        for emp in session.query(Employee)
    )
    # ... then flatten to (name, year, count) rows ordered by year, then name.
    new_results = sorted(
        (
            (branch_name, year, count)
            for (branch_name, year), count in hires_per_group.items()
        ),
        key=lambda row: (row[1], row[0])
    )

print(df)
print(results)
assert results == new_results

"""SELECT branch.name AS branch, EXTRACT(YEAR FROM employee.start_date) AS year, count(:param_1) AS how_many 
FROM employee JOIN branch ON branch.branch_id = employee.assigned_branch_id GROUP BY EXTRACT(YEAR FROM employee.start_date), branch.name ORDER BY EXTRACT(YEAR FROM employee.start_date), branch.name"""
           branch  year  how_many
0    Headquarters  2000         1
1   Quincy Branch  2000         1
2   Woburn Branch  2000         1
3    Headquarters  2001         1
4   So. NH Branch  2001         1
5    Headquarters  2002         4
6   Quincy Branch  2002         1
7   So. NH Branch  2002         2
8   Woburn Branch  2002         1
9    Headquarters  2003         1
10  Quincy Branch  2003         1
11  Woburn Branch  2003         1
12   Headquarters  2004         2
[('Headquarters', 2000, 1), ('Quincy Branch', 2000, 1), ('Woburn Branch', 2000, 1), ('Headquarters', 2001, 1), ('So. NH Branch', 2001, 1), ('Headquarters', 2002, 4), ('Quincy Branch', 2002, 1), ('So. NH Branch', 2

# Rollup

Find the total balance of every product per opening branch and the sum total of the product itself.

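Using `ROLLUP`, the aggregate is also computed at each higher grouping level: one subtotal row per product (the branch name is NULL) and one grand-total row (both columns are NULL).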

In [20]:
from typing import Final

from model import Account, Branch


# Total available balance per (product, opening branch), plus the ROLLUP
# subtotal per product and the grand total, computed three ways.
with Session(engine) as session:

    # Raw SQL using the WITH ROLLUP modifier.
    df = pd.read_sql_query(
        """
        SELECT
            a.product_cd
            , b.name
            , SUM(a.avail_balance) tot_balance
        FROM
            account
        a JOIN branch b ON a.open_branch_id = b.branch_id
        GROUP BY a.product_cd, b.name WITH ROLLUP;
        """,
        con=session.connection(),
    )

    # SQLAlchemy Core: same grouping expressed through func.rollup().
    rollup_columns = func.rollup(Account.product_cd, Branch.name)
    total_balance = func.sum(Account.avail_balance).label("tot_balance")
    statement = (
        select(Account.product_cd, Branch.name, total_balance)
        .select_from(Account)
        .join(Branch, Account.open_branch_id == Branch.branch_id)
        .group_by(rollup_columns)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

    # Using the SQLAlchemy ORM relationships with native Python.
    # Keys mirror the rollup rows: (product, branch) for a detail row,
    # (product, None) for a product subtotal, (None, None) for the grand total.
    grand_total_key: Final[tuple] = (None, None)
    running_totals: dict[tuple, float] = {grand_total_key: 0.}
    for acct in session.query(Account):
        branch_name = acct.account_open_branch.name
        code = acct.product_cd
        amount = acct.avail_balance
        # Every account feeds its detail row, its product subtotal and the
        # grand total, in that order.
        for bucket in ((code, branch_name), (code, None), grand_total_key):
            running_totals[bucket] = running_totals.get(bucket, 0.) + amount
    new_results = {
        (*bucket, round(total, 2))
        for bucket, total in running_totals.items()
    }

print(df, results, sep="\n")
assert set(results) == new_results

"""SELECT account.product_cd, branch.name, sum(account.avail_balance) AS tot_balance 
FROM account JOIN branch ON account.open_branch_id = branch.branch_id GROUP BY ROLLUP(account.product_cd, branch.name)"""
   product_cd           name  tot_balance
0         BUS  So. NH Branch         0.00
1         BUS  Woburn Branch      9345.55
2         BUS           None      9345.55
3          CD   Headquarters     11500.00
4          CD  Woburn Branch      8000.00
5          CD           None     19500.00
6         CHK   Headquarters       782.16
7         CHK  Quincy Branch      1057.75
8         CHK  So. NH Branch     67852.33
9         CHK  Woburn Branch      3315.77
10        CHK           None     73008.01
11         MM   Headquarters     14832.64
12         MM  Quincy Branch      2212.50
13         MM           None     17045.14
14        SAV   Headquarters       767.77
15        SAV  So. NH Branch       387.99
16        SAV  Woburn Branch       700.00
17        SAV           None      18

# Group Filtering Conditions

Considering only the active accounts, find the products whose total balance is at least 10,000.

This is where the GROUP BY - HAVING syntax comes into play. A filter on an aggregate value must go in the HAVING clause, because the WHERE clause is evaluated before the rows are grouped and therefore cannot see the aggregated totals.

In [31]:
from model import AccountStatusEnum

# Total balance per product over ACTIVE accounts only, keeping the products
# whose total reaches 10,000 -- computed three ways.
with Session(engine) as session:

    # Raw SQL: WHERE filters rows before grouping, HAVING filters the groups.
    df = pd.read_sql_query(
        """
        SELECT
            a.product_cd
            , SUM(a.avail_balance) prod_balance
        FROM
            account a
        WHERE a.status = 'ACTIVE'
        GROUP BY a.product_cd
        HAVING SUM(a.avail_balance) >= 10000
        ;
        """,
        con=session.connection(),
    )

    # SQLAlchemy Core equivalent; the sum expression is shared between the
    # SELECT list and the HAVING clause.
    balance_sum = func.sum(Account.avail_balance)
    statement = (
        select(Account.product_cd, balance_sum)
        .where(Account.status == AccountStatusEnum.ACTIVE)
        .group_by(Account.product_cd)
        .having(balance_sum >= 10_000)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

    # Using the SQLAlchemy ORM relationships with native Python.
    active_totals: dict[str, float] = {}
    for acct in session.query(Account):
        # WHERE step: only active accounts contribute.
        if acct.status == AccountStatusEnum.ACTIVE:
            code = acct.product_cd
            active_totals[code] = active_totals.get(code, 0.) + acct.avail_balance
    # HAVING step: the threshold is checked on the raw total, then the kept
    # totals are rounded to cents.
    new_results = {
        (code, round(total, 2))
        for code, total in active_totals.items()
        if total >= 10_000
    }

print(df, results, sep="\n")
assert new_results == set(results)

"""SELECT account.product_cd, sum(account.avail_balance) AS sum_1 
FROM account 
WHERE account.status = :status_1 GROUP BY account.product_cd 
HAVING sum(account.avail_balance) >= :sum_2"""
  product_cd  prod_balance
0         CD      19500.00
1        CHK      73008.01
2         MM      17045.14
3        SBL      50000.00
[('CD', 19500.0), ('CHK', 73008.01), ('MM', 17045.14), ('SBL', 50000.0)]


# Exercises

## 8-1
Construct a query that counts the number of rows in the account table.

## 8-2
Modify your query from exercise 8-1 to count the number of accounts held by each customer. Show the customer ID and the number of accounts for each customer.

## 8-3
Modify your query from exercise 8-2 to only include those customers having at least two accounts

## 8-4
Find the total available balance by product and branch when there is more than one account per product and branch. Order the results by total balance (highest to lowest).


In [33]:
## 8-1

# Exercise 8-1: number of rows in the account table, once through raw SQL
# and once through the ORM query API.
with Session(engine) as session:

    df = pd.read_sql_query(
        sql="""
            SELECT
                COUNT(*) cnt
            FROM
                account;
        """,
        con=session.connection(),
    )

    # Query.count() returns the row count as a plain integer.
    all_accounts = session.query(Account)
    results = all_accounts.count()

print(df, results, sep="\n")

   cnt
0   24
24


In [40]:
## 8-2
from collections import Counter


# Exercise 8-2: number of accounts held by each customer.
with Session(engine) as session:

    df = pd.read_sql_query(
        sql="""
            SELECT
                a.cust_id cust_id
                , COUNT(a.cust_id) n_acct
            FROM
                account a
            GROUP BY a.cust_id
        """,
        con=session.connection(),
    )

    # Native Python: one tally per customer id over the ORM Account objects.
    results = Counter()
    results.update(acct.cust_id for acct in session.query(Account))

print(df, results, sep="\n")

    cust_id  n_acct
0         1       3
1         2       2
2         3       2
3         4       3
4         5       1
5         6       2
6         7       1
7         8       2
8         9       3
9        10       2
10       11       1
11       12       1
12       13       1
Counter({1: 3, 4: 3, 9: 3, 2: 2, 3: 2, 6: 2, 8: 2, 10: 2, 5: 1, 7: 1, 11: 1, 12: 1, 13: 1})


In [42]:
## 8-3
from collections import Counter


# Exercise 8-3: customers holding at least two accounts.
with Session(engine) as session:

    df = pd.read_sql_query(
        sql="""
            SELECT
                a.cust_id cust_id
                , COUNT(a.cust_id) n_acct
            FROM
                account a
            GROUP BY a.cust_id
            HAVING COUNT(a.cust_id) > 1
        """,
        con=session.connection(),
    )

    # Native Python: tally accounts per customer (GROUP BY) ...
    accounts_per_customer = Counter(
        acct.cust_id for acct in session.query(Account)
    )
    # ... keep the customers with more than one account (HAVING) ...
    results = [
        (cust_id, n_accounts)
        for cust_id, n_accounts in accounts_per_customer.items()
        if n_accounts > 1
    ]
    # ... and list them by ascending customer id.
    results.sort(key=lambda pair: pair[0])

print(df, results, sep="\n")

   cust_id  n_acct
0        1       3
1        2       2
2        3       2
3        4       3
4        6       2
5        8       2
6        9       3
7       10       2
[(1, 3), (2, 2), (3, 2), (4, 3), (6, 2), (8, 2), (9, 3), (10, 2)]


In [56]:
## 8-4
"""Find the total available balance by product and branch when there is more than one account per product and branch. Order the results by total balance (highest to lowest)."""
# Counter is imported here as well, so this cell does not depend on an
# earlier cell having been run first.
from collections import Counter, defaultdict


# Exercise 8-4: total available balance per (product, opening branch) for the
# groups holding more than one account, highest total first.
with Session(engine) as session:

    df = pd.read_sql_query(
        """
            SELECT
                a.product_cd product_cd
                , b.name branch_name
                , SUM(a.avail_balance) tot_balance
            FROM
                account
            a JOIN branch b ON a.open_branch_id = b.branch_id
            GROUP BY a.product_cd, b.name
            HAVING COUNT(*) > 1
            ORDER BY tot_balance DESC;
        """,
        con=session.connection()
    )

    # Native Python: one pass accumulates both the balance (SUM) and the
    # number of accounts (COUNT(*)) of every (product, branch) group.
    product_balances_data: dict[tuple, float] = defaultdict(float)
    product_cd_branch_counter: Counter = Counter()
    for acct in session.query(Account):
        key = acct.product_cd, acct.account_open_branch.name
        product_balances_data[key] += acct.avail_balance
        product_cd_branch_counter[key] += 1

    results = sorted(
        (
            # Round to cents so float accumulation noise never shows up in
            # the reported totals.
            (product_cd, name, round(total_balance, 2))
            for ((product_cd, name), total_balance)
            in product_balances_data.items()
            # HAVING COUNT(*) > 1
            if product_cd_branch_counter[(product_cd, name)] > 1
        ),
        # ORDER BY tot_balance DESC
        key=lambda row: row[-1],
        reverse=True
    )

print(df)
print(results)

  product_cd    branch_name  tot_balance
0        CHK  So. NH Branch     67852.33
1         MM   Headquarters     14832.64
2         CD   Headquarters     11500.00
3         CD  Woburn Branch      8000.00
4        CHK  Woburn Branch      3315.77
5        CHK   Headquarters       782.16
6        SAV  Woburn Branch       700.00
[('CHK', 'So. NH Branch', 67852.33), ('MM', 'Headquarters', 14832.64), ('CD', 'Headquarters', 11500.0), ('CD', 'Woburn Branch', 8000.0), ('CHK', 'Woburn Branch', 3315.77), ('CHK', 'Headquarters', 782.16), ('SAV', 'Woburn Branch', 700.0)]
