In [1]:
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, URL, select, func, literal, CHAR
from sqlalchemy.sql.selectable import Select as SQLSelect
from sqlalchemy.orm import Session
import pandas as pd

import model
from model import Customer, Employee, Department, Branch, Account


load_dotenv()

url_object = URL.create(
    os.environ["DB_ENGINE"],
    username=os.environ["DB_USER"],
    password=os.environ["DB_PASSWD"],
    host=os.environ["DB_HOST"],
    database=os.environ["DB_NAME"],
)

engine = create_engine(url_object)

In [2]:
def print_sql_statement(sql_select_statement: SQLSelect) -> None:
    print('"""' + str(sql_select_statement) + '"""')

# Column Aliases

SQL allows for cacluated columns to have aliases. This is straight forward in a raw query, but looks weird in SQLalchemy

In [3]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            emp_id
            , 'ACTIVE' status
            , emp_id * 3.14159 empid_x_pi
            , UPPER(lname) last_name_upper
        FROM
            employee;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Employee.emp_id,
            literal("ACTIVE").label("status"),
            (Employee.emp_id * 3.14159).label("empid_x_pi"),
            func.upper(Employee.lname).label("last_name_upper")
        ).select_from(Employee)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df)

"""SELECT employee.emp_id, :param_1 AS status, employee.emp_id * :emp_id_1 AS empid_x_pi, upper(employee.lname) AS last_name_upper 
FROM employee"""
[(1, 'ACTIVE', 3.14159, 'SMITH'), (2, 'ACTIVE', 6.28318, 'BARKER'), (3, 'ACTIVE', 9.42477, 'TYLER'), (4, 'ACTIVE', 12.56636, 'HAWTHORNE'), (5, 'ACTIVE', 15.70795, 'GOODING'), (6, 'ACTIVE', 18.84954, 'FLEMING'), (7, 'ACTIVE', 21.99113, 'TUCKER'), (8, 'ACTIVE', 25.13272, 'PARKER'), (9, 'ACTIVE', 28.27431, 'GROSSMAN'), (10, 'ACTIVE', 31.4159, 'ROBERTS'), (11, 'ACTIVE', 34.55749, 'ZIEGLER'), (12, 'ACTIVE', 37.69908, 'JAMESON'), (13, 'ACTIVE', 40.84067, 'BLAKE'), (14, 'ACTIVE', 43.98226, 'MASON'), (15, 'ACTIVE', 47.12385, 'PORTMAN'), (16, 'ACTIVE', 50.26544, 'MARKHAM'), (17, 'ACTIVE', 53.40703, 'FOWLER'), (18, 'ACTIVE', 56.54862, 'TULMAN')]
    emp_id  status  empid_x_pi last_name_upper
0        1  ACTIVE     3.14159           SMITH
1        2  ACTIVE     6.28318          BARKER
2        3  ACTIVE     9.42477           TYLER
3        4  ACTIVE 

# Removing Duplicates with DISTINCT

Find all the unique accounts in the account table

A naive attempt shows all the accounts with repeated customers

In [4]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            cust_id
        FROM
            account;
        """,
        con=session.connection()
    )

df.head(10)

Unnamed: 0,cust_id
0,1
1,1
2,1
3,2
4,2
5,3
6,3
7,4
8,4
9,4


We can exclude/remove duplicate customers using the DISTINCT keyword

In [5]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT DISTINCT
            cust_id
        FROM
            account;
        """,
        con=session.connection()
    )

    statement = (
        select(Account.cust_id).distinct().select_from(Account)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df.head(10))

"""SELECT DISTINCT account.cust_id 
FROM account"""
[(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,), (12,), (13,)]
   cust_id
0        1
1        2
2        3
3        4
4        5
5        6
6        7
7        8
8        9
9       10


# Returning the Duplicates

Find all customers who opened multiple accounts on the same date.

In this query, the cust_id an open_date in the accounts could be identical.

In [6]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            a.cust_id
            , a.open_date
            , COUNT(*)
        FROM
            account a
        GROUP BY a.cust_id, a.open_date
        HAVING COUNT(*) > 1;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Account.cust_id,
            Account.open_date,
            func.count("*")
        ).select_from(Account)
        .group_by(Account.cust_id, Account.open_date)
        .having(func.count("*") > 1)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df.head(10))

"""SELECT account.cust_id, account.open_date, count(:count_2) AS count_1 
FROM account GROUP BY account.cust_id, account.open_date 
HAVING count(:count_3) > :count_4"""
[(1, datetime.date(2000, 1, 15), 2), (2, datetime.date(2001, 3, 12), 2), (8, datetime.date(2001, 5, 23), 2)]
   cust_id   open_date  COUNT(*)
0        1  2000-01-15         2
1        2  2001-03-12         2
2        8  2001-05-23         2


# Subquery Generated Tables

Get all employees ID, first name, and last name all capitalized. While this subquery is useless since the subquery could have been performed in the outer select, this goes to show the power of subqueries.


In [7]:

with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            eu.emp_id_upper ID
            , eu.fname_upper FNAME
            , eu.lname_upper LNAME
        FROM (
            SELECT
                UPPER(CAST(e.emp_id AS CHAR)) emp_id_upper
                , UPPER(e.fname) fname_upper
                , UPPER(e.lname) lname_upper
            FROM employee e
        ) eu;
        """,
        con=session.connection()
    )

    eu = select(
        func.upper(func.cast(Employee.emp_id, CHAR)).label("emp_id_upper"),
        func.upper(Employee.fname).label("fname_upper"),
        func.upper(Employee.lname).label("lname_upper")
    ).select_from(Employee).subquery("eu")
    # Note that since special labels have been introduced, the subquery
    # column operator 'c' must be used
    statement = select(
        eu.c["emp_id_upper"].label("ID"),
        eu.c["fname_upper"].label("FNAME"),
        eu.c["lname_upper"].label("LNAME")
    ).select_from(eu)

    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df.head(10))

"""SELECT eu.emp_id_upper AS "ID", eu.fname_upper AS "FNAME", eu.lname_upper AS "LNAME" 
FROM (SELECT upper(CAST(employee.emp_id AS CHAR)) AS emp_id_upper, upper(employee.fname) AS fname_upper, upper(employee.lname) AS lname_upper 
FROM employee) AS eu"""
[('1', 'MICHAEL', 'SMITH'), ('2', 'SUSAN', 'BARKER'), ('3', 'ROBERT', 'TYLER'), ('4', 'SUSAN', 'HAWTHORNE'), ('5', 'JOHN', 'GOODING'), ('6', 'HELEN', 'FLEMING'), ('7', 'CHRIS', 'TUCKER'), ('8', 'SARAH', 'PARKER'), ('9', 'JANE', 'GROSSMAN'), ('10', 'PAULA', 'ROBERTS'), ('11', 'THOMAS', 'ZIEGLER'), ('12', 'SAMANTHA', 'JAMESON'), ('13', 'JOHN', 'BLAKE'), ('14', 'CINDY', 'MASON'), ('15', 'FRANK', 'PORTMAN'), ('16', 'THERESA', 'MARKHAM'), ('17', 'BETH', 'FOWLER'), ('18', 'RICK', 'TULMAN')]
   ID    FNAME      LNAME
0   1  MICHAEL      SMITH
1   2    SUSAN     BARKER
2   3   ROBERT      TYLER
3   4    SUSAN  HAWTHORNE
4   5     JOHN    GOODING
5   6    HELEN    FLEMING
6   7    CHRIS     TUCKER
7   8    SARAH     PARKER
8   9     JANE   GRO

# Views (AKA Virtual Tables)

A view is a query stored in the data dictionary. It does not hold any data so it can be thought of as a "virtual table".

The SQLAlcehmy recipe to create a view is complex and won't be discussed here, but the recipe is
https://web.archive.org/web/20230107211001/https://github.com/sqlalchemy/sqlalchemy/wiki/Views

In [8]:
import mysql.connector

mysql_connection = mysql.connector.connect(
    user=os.environ["DB_USER"],
    password=os.environ["DB_PASSWD"],
    host=os.environ["DB_HOST"],
    database=os.environ["DB_NAME"]
)

with mysql_connection.cursor() as cursor:
    create_view_statement = (
        """
        CREATE VIEW employee_vw AS
        SELECT
            emp_id
            , fname
            , lname
            , YEAR(start_date) start_year
        FROM employee;
        """
    )
    select_from_view_statement = (
        """
        SELECT
             emp_id, start_year
         FROM
             employee_vw;
        """
    )
    drop_view_statement = (
        """DROP VIEW IF EXISTS employee_vw"""
    )
    cursor.execute(drop_view_statement)
    cursor.execute(create_view_statement)
    cursor.execute(select_from_view_statement)
    employee_ids, start_years = zip(
        *tuple(result for result in cursor)
    )
    df = pd.DataFrame(
        data={
            "emp_id": employee_ids,
            "start_year": start_years
        }
    )
    cursor.execute(drop_view_statement)

print(df)

    emp_id  start_year
0        1        2001
1        2        2002
2        3        2000
3        4        2002
4        5        2003
5        6        2004
6        7        2004
7        8        2002
8        9        2002
9       10        2002
10      11        2000
11      12        2003
12      13        2000
13      14        2002
14      15        2003
15      16        2001
16      17        2002
17      18        2002


# Table Links

Get all employees and their associated department name


In [9]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            e.emp_id
            , e.fname
            , e.lname
            , d.name
        FROM
            employee e JOIN department d ON e.dept_id = d.dept_id
        ;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Employee.emp_id,
            Employee.fname,
            Employee.lname,
            Department.name
        )
        .select_from(
            Employee
        )
        .join(
            Department,
            Employee.dept_id == Department.dept_id
        )
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT employee.emp_id, employee.fname, employee.lname, department.name 
FROM employee JOIN department ON employee.dept_id = department.dept_id"""
    emp_id     fname      lname            name
0        4     Susan  Hawthorne      Operations
1        6     Helen    Fleming      Operations
2        7     Chris     Tucker      Operations
3        8     Sarah     Parker      Operations
4        9      Jane   Grossman      Operations
5       10     Paula    Roberts      Operations
6       11    Thomas    Ziegler      Operations
7       12  Samantha    Jameson      Operations
8       13      John      Blake      Operations
9       14     Cindy      Mason      Operations
10      15     Frank    Portman      Operations
11      16   Theresa    Markham      Operations
12      17      Beth     Fowler      Operations
13      18      Rick     Tulman      Operations
14       5      John    Gooding           Loans
15       1   Michael      Smith  Administration
16       2     Susan     Barker  A

# Filter by Condition

Get all head tellers starting on the year 2002 and tellers starting after the year 2003

In [10]:
from datetime import date
from sqlalchemy import and_, or_


with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            e.emp_id
            , e.fname
            , e.lname
            , e.start_date
            , e.title
        FROM
            employee e
        WHERE (
            (e.title = 'Head Teller' AND e.start_date > '2002-01-01')
            OR
            (e.title = 'Teller' AND e.start_date > '2003-01-01')
        )
        ;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Employee.emp_id,
            Employee.fname,
            Employee.lname,
            Employee.start_date,
            Employee.title
        )
        .select_from(Employee)
        .where(
            or_(
                and_(Employee.title == "Head Teller", Employee.start_date > date(2002, 1, 1)),
                and_(Employee.title == "Teller", Employee.start_date > date(2003, 1, 1)),
            )
        )
    )

    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT employee.emp_id, employee.fname, employee.lname, employee.start_date, employee.title 
FROM employee 
WHERE employee.title = :title_1 AND employee.start_date > :start_date_1 OR employee.title = :title_2 AND employee.start_date > :start_date_2"""
   emp_id     fname    lname  start_date        title
0       6     Helen  Fleming  2004-03-17  Head Teller
1       7     Chris   Tucker  2004-09-15       Teller
2      10     Paula  Roberts  2002-07-27  Head Teller
3      12  Samantha  Jameson  2003-01-08       Teller
4      15     Frank  Portman  2003-04-01       Teller
[(6, 'Helen', 'Fleming', datetime.date(2004, 3, 17), 'Head Teller'), (7, 'Chris', 'Tucker', datetime.date(2004, 9, 15), 'Teller'), (10, 'Paula', 'Roberts', datetime.date(2002, 7, 27), 'Head Teller'), (12, 'Samantha', 'Jameson', datetime.date(2003, 1, 8), 'Teller'), (15, 'Frank', 'Portman', datetime.date(2003, 4, 1), 'Teller')]


# Sorting via Expressions

Get all customers ordered by the last three digits of the federal ID

In [11]:

with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            c.cust_id
            , c.cust_type_cd
            , c.city
            , c.state
            , c.fed_id
        FROM customer c
        ORDER BY RIGHT(c.fed_id, 3);
        """,
        con=session.connection()
    )

    statement = (
        select(
            Customer.cust_id,
            Customer.cust_type_cd,
            Customer.city,
            Customer.state,
            Customer.fed_id
        )
        .select_from(Customer)
        .order_by(func.right(Customer.fed_id, 3))
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT customer.cust_id, customer.cust_type_cd, customer.city, customer.state, customer.fed_id 
FROM customer ORDER BY right(customer.fed_id, :right_1)"""
    cust_id cust_type_cd        city state       fed_id
0         1            I   Lynnfield    MA  111-11-1111
1        10            B       Salem    NH   04-1111111
2         2            I      Woburn    MA  222-22-2222
3        11            B  Wilmington    MA   04-2222222
4         3            I      Quincy    MA  333-33-3333
5        12            B       Salem    NH   04-3333333
6         4            I     Waltham    MA  444-44-4444
7        13            B      Quincy    MA   04-4444444
8         5            I       Salem    NH  555-55-5555
9         6            I     Waltham    MA  666-66-6666
10        7            I  Wilmington    MA  777-77-7777
11        8            I       Salem    NH  888-88-8888
12        9            I      Newton    MA  999-99-9999
[(1, <CustomerTypeEnum.I: 'I'>, 'Lynnfield', 'MA', '111-11

# Exercises

## 3-1

Retrieve the employee ID, first name, and last name for all bank employees. Sort by last name then first name

## 3-2

Retrieve the account ID, customer ID, and available balance for all accounts whose status equals 'ACTIVE' and whose available balance is greater than $2,500.

## 3-3

Write a query again the account table that returns the IDs of the employees who opened the accounts (use the account.open_emp_id columns). Include a single row each distinct employee.

## 3-4

Fill in the banks (denoted by <#>) for this multi-data-set query to achieve the results shown below:

```sql
SELECT p.product_cd, a.cust_id, a.avail_balanace
FROM product p INNER JOIN account <1>
    ON product_cd = <2>
WHERE p.<3> = 'ACCOUNT'
```

In [17]:
# 3-1

with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            e.emp_id
            , e.fname
            , e.lname
        FROM employee e
        ORDER BY e.lname, e.fname;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Employee.emp_id,
            Employee.fname,
            Employee.lname
        )
        .select_from(Employee)
        .order_by(Employee.lname, Employee.fname)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT employee.emp_id, employee.fname, employee.lname 
FROM employee ORDER BY employee.lname, employee.fname"""
    emp_id     fname      lname
0        2     Susan     Barker
1       13      John      Blake
2        6     Helen    Fleming
3       17      Beth     Fowler
4        5      John    Gooding
5        9      Jane   Grossman
6        4     Susan  Hawthorne
7       12  Samantha    Jameson
8       16   Theresa    Markham
9       14     Cindy      Mason
10       8     Sarah     Parker
11      15     Frank    Portman
12      10     Paula    Roberts
13       1   Michael      Smith
14       7     Chris     Tucker
15      18      Rick     Tulman
16       3    Robert      Tyler
17      11    Thomas    Ziegler
[(2, 'Susan', 'Barker'), (13, 'John', 'Blake'), (6, 'Helen', 'Fleming'), (17, 'Beth', 'Fowler'), (5, 'John', 'Gooding'), (9, 'Jane', 'Grossman'), (4, 'Susan', 'Hawthorne'), (12, 'Samantha', 'Jameson'), (16, 'Theresa', 'Markham'), (14, 'Cindy', 'Mason'), (8, 'Sarah', 'Parker')

We can also use the ORM in list-comprehension

In [20]:
with Session(engine) as session:
    new_results = sorted(
        [
            (emp.emp_id, emp.fname, emp.lname)
            for emp in session.query(Employee)
        ],
        key=lambda tup: (tup[-1], tup[-2])
    )

assert new_results == results


In [23]:
# 3-2
from sqlalchemy import and_

from model import AccountStatusEnum


with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            a.account_id
            , a.cust_id
            , a.avail_balance
        FROM account a
        WHERE (
            a.status = 'ACTIVE'
            AND
            a.avail_balance > 2500
        )
        ;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Account.account_id,
            Account.cust_id,
            Account.avail_balance
        )
        .select_from(Account)
        .where(
            and_(
                Account.status == AccountStatusEnum.ACTIVE,
                Account.avail_balance > 2500
            )
        )
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT account.account_id, account.cust_id, account.avail_balance 
FROM account 
WHERE account.status = :status_1 AND account.avail_balance > :avail_balance_1"""
   account_id  cust_id  avail_balance
0           3        1        3000.00
1          12        4        5487.09
2          15        6       10000.00
3          17        7        5000.00
4          18        8        3487.19
5          22        9        9345.55
6          24       10       23575.12
7          27       11        9345.55
8          28       12       38552.05
9          29       13       50000.00
[(3, 1, 3000.0), (12, 4, 5487.09), (15, 6, 10000.0), (17, 7, 5000.0), (18, 8, 3487.19), (22, 9, 9345.55), (24, 10, 23575.12), (27, 11, 9345.55), (28, 12, 38552.05), (29, 13, 50000.0)]


We can also use the ORM in list-comprehension

In [26]:
with Session(engine) as session:
    new_results = (
        [
            (acc.account_id, acc.cust_id, acc.avail_balance)
            for acc in session.query(Account)
            if (acc.status == AccountStatusEnum.ACTIVE and acc.avail_balance > 2500)
        ]
    )
    assert new_results == results


In [27]:
# 3-3


with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT DISTINCT
            a.open_emp_id
        FROM account a
        WHERE (
            a.open_emp_id IS NOT NULL
        )
        ;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Account.open_emp_id
        )
        .distinct()
        .select_from(Account)
        .where(
            Account.open_emp_id.is_not(None)
        )
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT DISTINCT account.open_emp_id 
FROM account 
WHERE account.open_emp_id IS NOT NULL"""
   open_emp_id
0            1
1           10
2           13
3           16
[(1,), (10,), (13,), (16,)]


We can also use the ORM in list-comprehension

In [31]:
with Session(engine) as session:
    new_results = set(
        (acc.open_emp_id,)
        for acc in session.query(Account)
    )

assert new_results == set(results)

In [33]:
# 3-4
from model import Product


with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            p.product_cd
            , a.cust_id
            , a.avail_balance
        FROM
            product p INNER JOIN account a
            ON p.product_cd = a.product_cd
        WHERE p.product_type_cd = 'ACCOUNT'
        """,
        con=session.connection()
    )
    statement = (
        select(
            Product.product_cd,
            Account.cust_id,
            Account.avail_balance
        )
        .select_from(Product).join(Account, Product.product_cd == Account.product_cd)
        .where(Product.product_type_cd == "ACCOUNT")
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(df)
print(results)

"""SELECT product.product_cd, account.cust_id, account.avail_balance 
FROM product JOIN account ON product.product_cd = account.product_cd 
WHERE product.product_type_cd = :product_type_cd_1"""
   product_cd  cust_id  avail_balance
0          CD        1        3000.00
1          CD        6       10000.00
2          CD        7        5000.00
3          CD        9        1500.00
4         CHK        1        1057.75
5         CHK        2        2258.02
6         CHK        3        1057.75
7         CHK        4         534.12
8         CHK        5        2237.97
9         CHK        6         122.37
10        CHK        8        3487.19
11        CHK        9         125.67
12        CHK       10       23575.12
13        CHK       12       38552.05
14         MM        3        2212.50
15         MM        4        5487.09
16         MM        9        9345.55
17        SAV        1         500.00
18        SAV        2         200.00
19        SAV        4         767.77
20      

We can also use the ORM in list-comprehension

In [36]:
with Session(engine) as session:
    new_results = [
        (acc.account_product.product_cd, acc.cust_id, acc.avail_balance)
        for acc in session.query(Account)
        if acc.account_product.product_type_cd == "ACCOUNT"
    ]

# print(new_results)
# print(results)
assert set(new_results) == set(results)