In [1]:
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, URL, select, func, literal, CHAR
from sqlalchemy.sql.selectable import Select as SQLSelect
from sqlalchemy.orm import Session
import pandas as pd

import model
from model import Customer, Employee, Department, Branch, Account


load_dotenv()

url_object = URL.create(
    os.environ["DB_ENGINE"],
    username=os.environ["DB_USER"],
    password=os.environ["DB_PASSWD"],
    host=os.environ["DB_HOST"],
    database=os.environ["DB_NAME"],
)

engine = create_engine(url_object)

In [2]:
def print_sql_statement(sql_select_statement: SQLSelect) -> None:
    print('"""' + str(sql_select_statement) + '"""')

# Column Aliases

SQL allows for cacluated columns to have aliases. This is straight forward in a raw query, but looks weird in SQLalchemy

In [3]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            emp_id
            , 'ACTIVE' status
            , emp_id * 3.14159 empid_x_pi
            , UPPER(lname) last_name_upper
        FROM
            employee;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Employee.emp_id,
            literal("ACTIVE").label("status"),
            (Employee.emp_id * 3.14159).label("empid_x_pi"),
            func.upper(Employee.lname).label("last_name_upper")
        ).select_from(Employee)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df)

"""SELECT employee.emp_id, :param_1 AS status, employee.emp_id * :emp_id_1 AS empid_x_pi, upper(employee.lname) AS last_name_upper 
FROM employee"""
[(1, 'ACTIVE', 3.14159, 'SMITH'), (2, 'ACTIVE', 6.28318, 'BARKER'), (3, 'ACTIVE', 9.42477, 'TYLER'), (4, 'ACTIVE', 12.56636, 'HAWTHORNE'), (5, 'ACTIVE', 15.70795, 'GOODING'), (6, 'ACTIVE', 18.84954, 'FLEMING'), (7, 'ACTIVE', 21.99113, 'TUCKER'), (8, 'ACTIVE', 25.13272, 'PARKER'), (9, 'ACTIVE', 28.27431, 'GROSSMAN'), (10, 'ACTIVE', 31.4159, 'ROBERTS'), (11, 'ACTIVE', 34.55749, 'ZIEGLER'), (12, 'ACTIVE', 37.69908, 'JAMESON'), (13, 'ACTIVE', 40.84067, 'BLAKE'), (14, 'ACTIVE', 43.98226, 'MASON'), (15, 'ACTIVE', 47.12385, 'PORTMAN'), (16, 'ACTIVE', 50.26544, 'MARKHAM'), (17, 'ACTIVE', 53.40703, 'FOWLER'), (18, 'ACTIVE', 56.54862, 'TULMAN')]
    emp_id  status  empid_x_pi last_name_upper
0        1  ACTIVE     3.14159           SMITH
1        2  ACTIVE     6.28318          BARKER
2        3  ACTIVE     9.42477           TYLER
3        4  ACTIVE 

# Removing Duplicates with DISTINCT

Find all the unique accounts in the account table

A naive attempt shows all the accounts with repeated customers

In [4]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            cust_id
        FROM
            account;
        """,
        con=session.connection()
    )

df.head(10)

Unnamed: 0,cust_id
0,1
1,1
2,1
3,2
4,2
5,3
6,3
7,4
8,4
9,4


We can exclude/remove duplicate customers using the DISTINCT keyword

In [5]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT DISTINCT
            cust_id
        FROM
            account;
        """,
        con=session.connection()
    )

    statement = (
        select(Account.cust_id).distinct().select_from(Account)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df.head(10))

"""SELECT DISTINCT account.cust_id 
FROM account"""
[(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,), (12,), (13,)]
   cust_id
0        1
1        2
2        3
3        4
4        5
5        6
6        7
7        8
8        9
9       10


# Returning the Duplicates

Find all customers who opened multiple accounts on the same date.

In this query, the cust_id an open_date in the accounts could be identical.

In [6]:
with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            a.cust_id
            , a.open_date
            , COUNT(*)
        FROM
            account a
        GROUP BY a.cust_id, a.open_date
        HAVING COUNT(*) > 1;
        """,
        con=session.connection()
    )

    statement = (
        select(
            Account.cust_id,
            Account.open_date,
            func.count("*")
        ).select_from(Account)
        .group_by(Account.cust_id, Account.open_date)
        .having(func.count("*") > 1)
    )
    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df.head(10))

"""SELECT account.cust_id, account.open_date, count(:count_2) AS count_1 
FROM account GROUP BY account.cust_id, account.open_date 
HAVING count(:count_3) > :count_4"""
[(1, datetime.date(2000, 1, 15), 2), (2, datetime.date(2001, 3, 12), 2), (8, datetime.date(2001, 5, 23), 2)]
   cust_id   open_date  COUNT(*)
0        1  2000-01-15         2
1        2  2001-03-12         2
2        8  2001-05-23         2


# Subquery Generated Tables

Get all employees ID, first name, and last name all capitalized. While this subquery is useless since the subquery could have been performed in the outer select, this goes to show the power of subqueries.


In [7]:
from sqlalchemy.orm import aliased

with Session(engine) as session:
    df = pd.read_sql_query(
        """
        SELECT
            eu.emp_id_upper ID
            , eu.fname_upper FNAME
            , eu.lname_upper LNAME
        FROM (
            SELECT
                UPPER(CAST(e.emp_id AS CHAR)) emp_id_upper
                , UPPER(e.fname) fname_upper
                , UPPER(e.lname) lname_upper
            FROM employee e
        ) eu;
        """,
        con=session.connection()
    )

    eu = select(
        func.upper(func.cast(Employee.emp_id, CHAR)).label("emp_id_upper"),
        func.upper(Employee.fname).label("fname_upper"),
        func.upper(Employee.lname).label("lname_upper")
    ).select_from(Employee).subquery("eu")
    # Note that since special labels have been introduced, the subquery
    # column operator 'c' must be used
    statement = select(
        eu.c["emp_id_upper"].label("ID"),
        eu.c["fname_upper"].label("FNAME"),
        eu.c["lname_upper"].label("LNAME")
    ).select_from(eu)

    print_sql_statement(statement)
    results = session.execute(statement).all()

print(results)
print(df.head(10))

"""SELECT eu.emp_id_upper AS "ID", eu.fname_upper AS "FNAME", eu.lname_upper AS "LNAME" 
FROM (SELECT upper(CAST(employee.emp_id AS CHAR)) AS emp_id_upper, upper(employee.fname) AS fname_upper, upper(employee.lname) AS lname_upper 
FROM employee) AS eu"""
[('1', 'MICHAEL', 'SMITH'), ('2', 'SUSAN', 'BARKER'), ('3', 'ROBERT', 'TYLER'), ('4', 'SUSAN', 'HAWTHORNE'), ('5', 'JOHN', 'GOODING'), ('6', 'HELEN', 'FLEMING'), ('7', 'CHRIS', 'TUCKER'), ('8', 'SARAH', 'PARKER'), ('9', 'JANE', 'GROSSMAN'), ('10', 'PAULA', 'ROBERTS'), ('11', 'THOMAS', 'ZIEGLER'), ('12', 'SAMANTHA', 'JAMESON'), ('13', 'JOHN', 'BLAKE'), ('14', 'CINDY', 'MASON'), ('15', 'FRANK', 'PORTMAN'), ('16', 'THERESA', 'MARKHAM'), ('17', 'BETH', 'FOWLER'), ('18', 'RICK', 'TULMAN')]
   ID    FNAME      LNAME
0   1  MICHAEL      SMITH
1   2    SUSAN     BARKER
2   3   ROBERT      TYLER
3   4    SUSAN  HAWTHORNE
4   5     JOHN    GOODING
5   6    HELEN    FLEMING
6   7    CHRIS     TUCKER
7   8    SARAH     PARKER
8   9     JANE   GRO

# Views (AKA Virtual Tables)

A view is a query stored in the data dictionary. It does not hold any data so it can be thought of as a "virtual table".

The SQLAlcehmy recipe to create a view is complex and won't be discussed here, but the recipe is
https://web.archive.org/web/20230107211001/https://github.com/sqlalchemy/sqlalchemy/wiki/Views

In [12]:
import mysql.connector

mysql_connection = mysql.connector.connect(
    user=os.environ["DB_USER"],
    password=os.environ["DB_PASSWD"],
    host=os.environ["DB_HOST"],
    database=os.environ["DB_NAME"]
)

with mysql_connection.cursor() as cursor:
    create_view_statement = (
        """
        CREATE VIEW employee_vw AS
        SELECT
            emp_id
            , fname
            , lname
            , YEAR(start_date) start_year
        FROM employee;
        """
    )
    select_from_view_statement = (
        """
        SELECT
             emp_id, start_year
         FROM
             employee_vw;
        """
    )
    drop_view_statement = (
        """DROP VIEW IF EXISTS employee_vw"""
    )
    cursor.execute(drop_view_statement)
    cursor.execute(create_view_statement)
    cursor.execute(select_from_view_statement)
    employee_ids, start_years = zip(
        *tuple(result for result in cursor)
    )
    df = pd.DataFrame(
        data={
            "emp_id": employee_ids,
            "start_year": start_years
        }
    )
    cursor.execute(drop_view_statement)

print(df)

    emp_id  start_year
0        1        2001
1        2        2002
2        3        2000
3        4        2002
4        5        2003
5        6        2004
6        7        2004
7        8        2002
8        9        2002
9       10        2002
10      11        2000
11      12        2003
12      13        2000
13      14        2002
14      15        2003
15      16        2001
16      17        2002
17      18        2002
