# Writing Advanced SQL Queries

As part of this section we will understand how to write queries using some of the advanced features.

* Nested Sub Queries and Views
* Advanced DML Operations
* Pivoting Rows into Columns
* Overview of Analytic Functions
* Define Problem Statement – Top 5 Daily Products
* Analytic Functions – Aggregations
* Analytic Functions – Windowing
* Analytic Functions – Ranking
* Final Solution – Top 5 Daily Products

## Nested Sub Queries

In [1]:
%load_ext sql

In [2]:
%env DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db

env: DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db


In [80]:
%%sql

-- Pass-through view over the orders table: every column, no filtering.
-- Later cells issue UPDATE statements through this view.
CREATE OR REPLACE VIEW orders_v AS
    SELECT o.*
    FROM orders AS o

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [81]:
%%sql

-- Lower-case order_status through the view.
-- There is no WHERE clause, so every row exposed by orders_v is rewritten.
UPDATE orders_v AS ov
SET order_status = lower(ov.order_status)

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
68883 rows affected.


[]

In [82]:
%%sql

-- Sample the base table to check the effect of the update issued through orders_v.
SELECT o.*
FROM orders AS o
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_id,order_date,order_customer_id,order_status
4068,2013-08-17 00:00:00,12293,pending
8926,2013-09-19 00:00:00,10517,on_hold
14047,2013-10-20 00:00:00,6473,closed
19552,2013-11-23 00:00:00,7057,on_hold
20471,2013-11-29 00:00:00,9957,complete
24016,2013-12-21 00:00:00,604,complete
24408,2013-12-23 00:00:00,5799,closed
25063,2013-12-27 00:00:00,5593,complete
26112,2014-01-03 00:00:00,333,canceled
50404,2014-06-06 00:00:00,8773,processing


In [83]:
%%sql

-- Upper-case order_status through the view.
-- There is no WHERE clause, so every row exposed by orders_v is rewritten.
UPDATE orders_v AS ov
SET order_status = upper(ov.order_status)

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
68883 rows affected.


[]

In [70]:
%%sql

-- Denormalized view: each order row paired with each of its order_items rows.
-- Columns are all orders columns followed by all order_items columns.
CREATE OR REPLACE VIEW order_details_v AS
    SELECT o.*,
        oi.*
    FROM orders AS o
        INNER JOIN order_items AS oi
            ON oi.order_item_order_id = o.order_id

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [71]:
%%sql

-- Sample a few joined order / order-item rows from the view.
SELECT odv.*
FROM order_details_v AS odv
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_id,order_date,order_customer_id,order_status,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2013-07-25 00:00:00,11599,CLOSED,1,1,957,1,299.98,299.98
2,2013-07-25 00:00:00,256,PENDING_PAYMENT,2,2,1073,1,199.99,199.99
2,2013-07-25 00:00:00,256,PENDING_PAYMENT,3,2,502,5,250.0,50.0
2,2013-07-25 00:00:00,256,PENDING_PAYMENT,4,2,403,1,129.99,129.99
4,2013-07-25 00:00:00,8827,CLOSED,5,4,897,2,49.98,24.99
4,2013-07-25 00:00:00,8827,CLOSED,6,4,365,5,299.95,59.99
4,2013-07-25 00:00:00,8827,CLOSED,7,4,502,3,150.0,50.0
4,2013-07-25 00:00:00,8827,CLOSED,8,4,1014,4,199.92,49.98
5,2013-07-25 00:00:00,11318,COMPLETE,9,5,957,1,299.98,299.98
5,2013-07-25 00:00:00,11318,COMPLETE,10,5,365,5,299.95,59.99


In [72]:
%%sql

-- Number of joined order / order-item rows exposed by the view.
SELECT count(*)
FROM order_details_v AS odv

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
1 rows affected.


count
172198


In [74]:
%%sql

-- Revenue per (order_date, product) computed from the view,
-- listed by date with the highest revenue first within each date.
SELECT odv.order_date,
    odv.order_item_product_id,
    round(CAST(sum(odv.order_item_subtotal) AS numeric), 2) AS revenue
FROM order_details_v AS odv
GROUP BY odv.order_date,
    odv.order_item_product_id
ORDER BY odv.order_date,
    revenue DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_item_product_id,revenue
2013-07-25 00:00:00,1004,10799.46
2013-07-25 00:00:00,957,9599.36
2013-07-25 00:00:00,191,8499.15
2013-07-25 00:00:00,365,7558.74
2013-07-25 00:00:00,1073,6999.65
2013-07-25 00:00:00,1014,6397.44
2013-07-25 00:00:00,403,5589.57
2013-07-25 00:00:00,502,5100.0
2013-07-25 00:00:00,627,2879.28
2013-07-25 00:00:00,226,599.99


In [63]:
%%sql

-- Same orders / order_items join, written inline as a CTE
-- (the CTE reuses the name order_details_v) and sampled.
WITH order_details_v AS (
    SELECT o.*,
        oi.*
    FROM orders AS o
        INNER JOIN order_items AS oi
            ON oi.order_item_order_id = o.order_id
)
SELECT od.*
FROM order_details_v AS od
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_id,order_date,order_customer_id,order_status,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,2013-07-25 00:00:00,11599,CLOSED,1,1,957,1,299.98,299.98
2,2013-07-25 00:00:00,256,PENDING_PAYMENT,2,2,1073,1,199.99,199.99
2,2013-07-25 00:00:00,256,PENDING_PAYMENT,3,2,502,5,250.0,50.0
2,2013-07-25 00:00:00,256,PENDING_PAYMENT,4,2,403,1,129.99,129.99
4,2013-07-25 00:00:00,8827,CLOSED,5,4,897,2,49.98,24.99
4,2013-07-25 00:00:00,8827,CLOSED,6,4,365,5,299.95,59.99
4,2013-07-25 00:00:00,8827,CLOSED,7,4,502,3,150.0,50.0
4,2013-07-25 00:00:00,8827,CLOSED,8,4,1014,4,199.92,49.98
5,2013-07-25 00:00:00,11318,COMPLETE,9,5,957,1,299.98,299.98
5,2013-07-25 00:00:00,11318,COMPLETE,10,5,365,5,299.95,59.99


In [75]:
%%sql

-- Revenue per (order_date, product), with the orders / order_items join
-- supplied by an inline CTE.
WITH order_details_v AS (
    SELECT o.*,
        oi.*
    FROM orders AS o
        INNER JOIN order_items AS oi
            ON oi.order_item_order_id = o.order_id
)
SELECT od.order_date,
    od.order_item_product_id,
    round(CAST(sum(od.order_item_subtotal) AS numeric), 2) AS revenue
FROM order_details_v AS od
GROUP BY od.order_date,
    od.order_item_product_id
ORDER BY od.order_date,
    revenue DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_item_product_id,revenue
2013-07-25 00:00:00,1004,10799.46
2013-07-25 00:00:00,957,9599.36
2013-07-25 00:00:00,191,8499.15
2013-07-25 00:00:00,365,7558.74
2013-07-25 00:00:00,1073,6999.65
2013-07-25 00:00:00,1014,6397.44
2013-07-25 00:00:00,403,5589.57
2013-07-25 00:00:00,502,5100.0
2013-07-25 00:00:00,627,2879.28
2013-07-25 00:00:00,226,599.99


In [66]:
%%sql

-- Order items whose order id is not returned by the subquery on orders.
-- The subquery is correlated on the same key that NOT IN compares, so per item
-- it yields either that one order id or no rows.
SELECT oi.*
FROM order_items AS oi
WHERE oi.order_item_order_id NOT IN (
    SELECT o.order_id
    FROM orders AS o
    WHERE oi.order_item_order_id = o.order_id
)
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
0 rows affected.


order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price


In [67]:
%%sql

-- Order items whose order id is returned by the correlated subquery on orders.
SELECT oi.*
FROM order_items AS oi
WHERE oi.order_item_order_id IN (
    SELECT o.order_id
    FROM orders AS o
    WHERE oi.order_item_order_id = o.order_id
)
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99


In [68]:
%%sql

-- Order items for which no orders row has a matching order_id.
SELECT oi.*
FROM order_items AS oi
WHERE NOT EXISTS (
    SELECT 1
    FROM orders AS o
    WHERE oi.order_item_order_id = o.order_id
)
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
0 rows affected.


order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price


In [69]:
%%sql

-- Order items for which at least one orders row has a matching order_id.
SELECT oi.*
FROM order_items AS oi
WHERE EXISTS (
    SELECT 1
    FROM orders AS o
    WHERE oi.order_item_order_id = o.order_id
)
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99


## Advanced DML Operations

In [1]:
%load_ext sql

In [2]:
%env DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db

env: DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db


In [3]:
%%sql

-- Per-customer, per-day order metrics: a count and a revenue figure.
-- NOTE(review): order_revenue is a floating-point column; consider NUMERIC
-- if exact monetary amounts are required.
CREATE TABLE customer_order_metrics_dly (
    customer_id   INTEGER,
    order_date    DATE,
    order_count   INTEGER,
    order_revenue DOUBLE PRECISION
)

Done.


[]

In [14]:
%%sql

-- One row per (customer_id, order_date). The constraint name is spelled out
-- using the name PostgreSQL would otherwise generate for this primary key.
ALTER TABLE customer_order_metrics_dly
    ADD CONSTRAINT customer_order_metrics_dly_pkey
        PRIMARY KEY (customer_id, order_date)

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [6]:
%%sql

-- Load one row per (customer, order date) for August 2013 with the count filled in
-- and order_revenue left NULL.
-- Target columns are listed explicitly so the load does not depend on the
-- physical column order of customer_order_metrics_dly.
-- Note: count(1) counts joined order_items rows for the customer and date.
INSERT INTO customer_order_metrics_dly (
    customer_id,
    order_date,
    order_count,
    order_revenue
)
SELECT o.order_customer_id,
    o.order_date,
    count(1) AS order_count,
    NULL
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
WHERE o.order_date BETWEEN '2013-08-01' AND '2013-08-31'
GROUP BY o.order_customer_id,
    o.order_date

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
4708 rows affected.


[]

In [7]:
%%sql

-- Recompute order_count and order_revenue for every row of the metrics table
-- from a correlated subquery restricted to August 2013 orders.
-- The UPDATE has no WHERE clause: rows for which the subquery returns nothing
-- receive NULL in both columns.
UPDATE customer_order_metrics_dly AS comd
SET (order_count, order_revenue) = (
    SELECT count(*),
        round(CAST(sum(oi.order_item_subtotal) AS numeric), 2)
    FROM orders AS o
        INNER JOIN order_items AS oi
            ON oi.order_item_order_id = o.order_id
        INNER JOIN customers AS c
            ON o.order_customer_id = c.customer_id
    WHERE o.order_date BETWEEN '2013-08-01' AND '2013-08-31'
        AND comd.customer_id = o.order_customer_id
        AND comd.order_date = o.order_date
    GROUP BY c.customer_id,
        o.order_date
)

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
4708 rows affected.


[]

In [17]:
%%sql

-- Inspect the earliest metric rows, ordered by date and then customer.
SELECT comd.*
FROM customer_order_metrics_dly AS comd
ORDER BY comd.order_date,
    comd.customer_id
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


customer_id,order_date,order_count,order_revenue
10993,2013-08-01,2,449.89
11060,2013-08-01,5,919.93
11063,2013-08-01,1,179.97
11106,2013-08-01,2,259.98
11218,2013-08-01,5,1187.87
11310,2013-08-01,1,199.95
11358,2013-08-01,1,59.99
11399,2013-08-01,4,999.9
11560,2013-08-01,4,649.89
11571,2013-08-01,2,503.94


In [6]:
%%sql

-- Load one row per (customer, order date) for August 2013 with the count filled in
-- and order_revenue left NULL.
-- Target columns are listed explicitly so the load does not depend on the
-- physical column order of customer_order_metrics_dly.
-- Note: count(1) counts joined order_items rows for the customer and date.
INSERT INTO customer_order_metrics_dly (
    customer_id,
    order_date,
    order_count,
    order_revenue
)
SELECT o.order_customer_id,
    o.order_date,
    count(1) AS order_count,
    NULL
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
WHERE o.order_date BETWEEN '2013-08-01' AND '2013-08-31'
GROUP BY o.order_customer_id,
    o.order_date

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
569 rows affected.


[]

## Merging Data

In [3]:
%load_ext sql

In [4]:
%env DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db

env: DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db


In [17]:
# Empty customer_order_metrics_dly before it is reloaded by the cells that follow.
%sql TRUNCATE TABLE customer_order_metrics_dly

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [18]:
%%sql

-- Load one row per (customer, order date) for August 2013 with the count filled in
-- and order_revenue left NULL.
-- Target columns are listed explicitly so the load does not depend on the
-- physical column order of customer_order_metrics_dly.
-- Note: count(1) counts joined order_items rows for the customer and date.
INSERT INTO customer_order_metrics_dly (
    customer_id,
    order_date,
    order_count,
    order_revenue
)
SELECT o.order_customer_id,
    o.order_date,
    count(1) AS order_count,
    NULL
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
WHERE o.order_date BETWEEN '2013-08-01' AND '2013-08-31'
GROUP BY o.order_customer_id,
    o.order_date

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
4708 rows affected.


[]

In [19]:
%%sql

-- Update step of a manual merge: refresh count and revenue on every existing row
-- from orders dated 2013-08-01 through 2013-10-31.
-- The UPDATE has no WHERE clause: rows for which the subquery returns nothing
-- receive NULL in both columns.
UPDATE customer_order_metrics_dly AS comd
SET (order_count, order_revenue) = (
    SELECT count(*),
        round(CAST(sum(oi.order_item_subtotal) AS numeric), 2)
    FROM orders AS o
        INNER JOIN order_items AS oi
            ON oi.order_item_order_id = o.order_id
    WHERE o.order_date BETWEEN '2013-08-01' AND '2013-10-31'
        AND comd.customer_id = o.order_customer_id
        AND comd.order_date = o.order_date
    GROUP BY o.order_customer_id,
        o.order_date
)

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
4708 rows affected.


[]

In [20]:
%%sql

-- Insert step of a manual merge: add (customer, order date) combinations from
-- 2013-08-01 through 2013-10-31 that are not yet present in the metrics table.
-- Target columns are listed explicitly so the load does not depend on the
-- physical column order of customer_order_metrics_dly.
INSERT INTO customer_order_metrics_dly (
    customer_id,
    order_date,
    order_count,
    order_revenue
)
SELECT o.order_customer_id AS customer_id,
    o.order_date,
    count(1) AS order_count,
    round(sum(oi.order_item_subtotal)::numeric, 2) AS order_revenue
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
WHERE o.order_date BETWEEN '2013-08-01' AND '2013-10-31'
    -- Skip keys that already have a metrics row.
    AND NOT EXISTS (
        SELECT 1 FROM customer_order_metrics_dly comd
        WHERE o.order_customer_id = comd.customer_id
            AND o.order_date = comd.order_date
    )
GROUP BY o.order_customer_id,
    o.order_date

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
9265 rows affected.


[]

In [21]:
# Empty customer_order_metrics_dly again before the ON CONFLICT walkthrough that follows.
%sql TRUNCATE TABLE customer_order_metrics_dly

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [22]:
%%sql

-- Load one row per (customer, order date) for August 2013 with the count filled in
-- and order_revenue left NULL.
-- Target columns are listed explicitly so the load does not depend on the
-- physical column order of customer_order_metrics_dly.
-- Note: count(1) counts joined order_items rows for the customer and date.
INSERT INTO customer_order_metrics_dly (
    customer_id,
    order_date,
    order_count,
    order_revenue
)
SELECT o.order_customer_id,
    o.order_date,
    count(1) AS order_count,
    NULL
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
WHERE o.order_date BETWEEN '2013-08-01' AND '2013-08-31'
GROUP BY o.order_customer_id,
    o.order_date

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
4708 rows affected.


[]

In [23]:
%%sql

-- Upsert metrics for 2013-08-01 through 2013-10-31 in a single statement:
-- new (customer_id, order_date) keys are inserted, and keys that already exist
-- have their count and revenue overwritten with the freshly computed values.
-- Target columns are listed explicitly so the load does not depend on the
-- physical column order of customer_order_metrics_dly.
INSERT INTO customer_order_metrics_dly (
    customer_id,
    order_date,
    order_count,
    order_revenue
)
SELECT o.order_customer_id,
    o.order_date,
    count(1) AS order_count,
    round(sum(oi.order_item_subtotal)::numeric, 2) AS order_revenue
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
WHERE o.order_date BETWEEN '2013-08-01' AND '2013-10-31'
GROUP BY o.order_customer_id,
    o.order_date
-- EXCLUDED refers to the row that was proposed for insertion.
ON CONFLICT (customer_id, order_date) DO UPDATE SET
    order_count = EXCLUDED.order_count,
    order_revenue = EXCLUDED.order_revenue

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
13973 rows affected.


[]

## Pivoting Rows into Columns

* Install `tablefunc` as Postgres superuser to expose functions like crosstab - `CREATE EXTENSION tablefunc;`

In [3]:
%load_ext sql

In [4]:
%env DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db

env: DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db


In [19]:
%%sql

-- Number of orders per (order_date, order_status): the row-wise shape
-- that the crosstab query further down pivots into columns.
SELECT o.order_date,
    o.order_status,
    count(*)
FROM orders AS o
GROUP BY o.order_date,
    o.order_status
ORDER BY o.order_date,
    o.order_status
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_status,count
2013-07-25 00:00:00,CANCELED,1
2013-07-25 00:00:00,CLOSED,20
2013-07-25 00:00:00,COMPLETE,42
2013-07-25 00:00:00,ON_HOLD,5
2013-07-25 00:00:00,PAYMENT_REVIEW,3
2013-07-25 00:00:00,PENDING,13
2013-07-25 00:00:00,PENDING_PAYMENT,41
2013-07-25 00:00:00,PROCESSING,16
2013-07-25 00:00:00,SUSPECTED_FRAUD,2
2013-07-26 00:00:00,CANCELED,3


In [29]:
%%sql

-- Pivot per-day, per-status order counts into one row per order_date with one
-- column per order_status value, using crosstab from the tablefunc extension.
-- First argument: source query returning (order_date, order_status, order_count).
-- Second argument: query listing the distinct status values, sorted.
-- NOTE(review): the column list after AS is assumed to line up, in order, with the
-- sorted status values returned by the second query - confirm against the
-- tablefunc documentation and the actual set of statuses in orders.
SELECT * FROM crosstab(
    'SELECT order_date,
        order_status,
        count(1) AS order_count
    FROM orders
    GROUP BY order_date,
        order_status
    ORDER BY order_date,
        order_status',
    'SELECT DISTINCT order_status FROM orders ORDER BY 1'
) AS (
    order_date DATE,
    "CANCELED" INT,
    "CLOSED" INT,
    "COMPLETE" INT,
    "ON_HOLD" INT,
    "PAYMENT_REVIEW" INT,
    "PENDING" INT,
    "PENDING_PAYMENT" INT,
    "PROCESSING" INT,
    "SUSPECTED_FRAUD" INT
)
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,CANCELED,CLOSED,COMPLETE,ON_HOLD,PAYMENT_REVIEW,PENDING,PENDING_PAYMENT,PROCESSING,SUSPECTED_FRAUD
2013-07-25,1.0,20,42,5,3.0,13,41,16,2
2013-07-26,3.0,29,87,19,6.0,31,59,30,5
2013-07-27,,26,66,6,3.0,20,46,31,4
2013-07-28,6.0,18,63,12,4.0,18,37,26,3
2013-07-29,5.0,18,88,13,2.0,30,66,29,2
2013-07-30,4.0,22,89,11,,20,43,27,11
2013-07-31,3.0,28,80,15,1.0,29,61,29,6
2013-08-01,5.0,21,76,10,1.0,44,56,24,9
2013-08-02,2.0,25,81,8,3.0,24,48,25,8
2013-08-03,3.0,19,65,10,3.0,17,35,26,5


## Overview of Analytic Functions

Let us get an overview of Analytics or Windowing Functions as part of **SQL**.

* Aggregate Functions (`sum`, `min`, `max`, `avg`)
* Window Functions (`lead`, `lag`, `first_value`, `last_value`)
* Rank Functions (`rank`, `dense_rank`, `row_number` etc)
* For all the functions we use `OVER` clause.
* For aggregate functions we typically use `PARTITION BY`
* For global ranking and windowing functions we can use `ORDER BY sorting_column`, and for ranking and windowing within a partition or group we can use `PARTITION BY partition_column ORDER BY sorting_column`.

### Create tables to get daily revenue

Let us create a couple of tables which will be used for the demonstrations of Windowing and Ranking functions.

* We have **ORDERS** and **ORDER_ITEMS** tables.
* Let us take care of computing daily revenue as well as daily product revenue.
* As we will be using the same data set several times, let us create tables to precompute the data.
* **daily_revenue** will have the **order_date** and **revenue**, where data is aggregated using **order_date** as partition key.
* **daily_product_revenue** will have **order_date**, **order_item_product_id** and **revenue**. In this case data is aggregated using **order_date** and **order_item_product_id** as partition keys.

```{note}
Let us create table using CTAS to save daily revenue.
```

In [24]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [25]:
%env DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db

env: DATABASE_URL=postgresql://itversity_retail_user:retail_password@localhost:5432/itversity_retail_db


In [28]:
%%sql

-- Remove any earlier copy of daily_revenue so the CREATE TABLE ... AS cell after it can be re-run.
DROP TABLE IF EXISTS daily_revenue

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [29]:
%%sql

-- Materialize revenue per order_date, counting only COMPLETE and CLOSED orders.
CREATE TABLE daily_revenue AS
SELECT o.order_date,
    round(CAST(sum(oi.order_item_subtotal) AS numeric), 2) AS revenue
FROM orders AS o
    INNER JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
364 rows affected.


[]

In [33]:
%%sql

-- Sample the precomputed daily revenue (no ORDER BY, so row order is arbitrary).
SELECT dr.*
FROM daily_revenue AS dr
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,revenue
2014-07-17 00:00:00,36384.77
2014-07-14 00:00:00,29937.52
2013-09-03 00:00:00,44379.1
2014-07-01 00:00:00,40165.66
2013-08-04 00:00:00,35093.01
2014-05-10 00:00:00,45317.7
2014-07-24 00:00:00,50885.19
2013-10-15 00:00:00,38585.85
2014-05-19 00:00:00,23506.14
2014-01-02 00:00:00,21872.7


```{note}
Let us create table using CTAS to save daily product revenue.
```

In [30]:
%%sql

-- Remove any earlier copy of daily_product_revenue so the CREATE TABLE ... AS cell after it can be re-run.
DROP TABLE IF EXISTS daily_product_revenue

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
Done.


[]

In [32]:
%%sql

-- Materialize revenue per (order_date, product), counting only COMPLETE and CLOSED orders.
CREATE TABLE daily_product_revenue AS
SELECT o.order_date,
    oi.order_item_product_id,
    round(CAST(sum(oi.order_item_subtotal) AS numeric), 2) AS revenue
FROM orders AS o
    INNER JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date,
    oi.order_item_product_id

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
9120 rows affected.


[]

In [34]:
%%sql

-- Sample the precomputed daily product revenue (no ORDER BY, so row order is arbitrary).
SELECT dpr.*
FROM daily_product_revenue AS dpr
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_item_product_id,revenue
2014-02-21 00:00:00,78,199.98
2014-06-26 00:00:00,24,239.97
2014-05-24 00:00:00,804,79.96
2014-04-21 00:00:00,403,4029.69
2014-06-07 00:00:00,502,4950.0
2014-06-02 00:00:00,295,99.95
2013-10-25 00:00:00,924,63.96
2013-09-26 00:00:00,677,399.96
2013-11-03 00:00:00,627,2559.36
2013-11-04 00:00:00,835,127.96


## Define Problem Statement – Top 5 Daily Products

## Analytic Functions – Aggregations

Let us see how we can perform aggregations within a partition or group using Windowing/Analytics Functions.

* For simple aggregations where we have to get grouping key and aggregated results we can use **GROUP BY**.
* If we want to get the raw data along with aggregated results, then using **GROUP BY** alone is either not possible or overly complicated.
* Using aggregate functions with the **OVER** clause not only simplifies the process of writing the query, but is also often better with respect to performance.
* Let us take an example of getting employee salary percentage when compared to department salary expense.

In [None]:
%%sql

-- Raw employee salaries, ordered by department and then salary.
SELECT e.employee_id,
    e.department_id,
    e.salary
FROM employees AS e
ORDER BY e.department_id,
    e.salary
LIMIT 10

```{note}
Let us write the query using `GROUP BY` approach.
```

In [None]:
%%sql

-- Total salary per department using plain GROUP BY.
SELECT e.department_id,
    sum(e.salary) AS department_salary_expense
FROM employees AS e
GROUP BY e.department_id
ORDER BY e.department_id

In [None]:
%%sql

-- GROUP BY approach to showing each employee next to department-level figures:
-- aggregate per department first, then join the aggregates back to the detail rows.
WITH dept_totals AS (
    SELECT emp.department_id,
        sum(emp.salary) AS department_salary_expense,
        avg(emp.salary) AS avg_salary_expense
    FROM employees AS emp
    GROUP BY emp.department_id
)
SELECT e.employee_id,
    e.department_id,
    e.salary,
    ae.department_salary_expense,
    ae.avg_salary_expense
FROM employees AS e
    INNER JOIN dept_totals AS ae
        ON ae.department_id = e.department_id
ORDER BY e.department_id,
    e.salary

```{note}
Let us see how we can get it using Analytics/Windowing Functions. 
```

* We can use all standard aggregate functions such as `count`, `sum`, `min`, `max`, `avg` etc.

In [None]:
%%sql

-- Each employee row together with the department's total salary,
-- computed with a window aggregate instead of a self-join.
SELECT e.employee_id,
    e.department_id,
    e.salary,
    sum(e.salary) OVER dept AS department_salary_expense
FROM employees AS e
WINDOW dept AS (PARTITION BY e.department_id)
ORDER BY e.department_id

In [None]:
%%sql

-- Several department-level aggregates side by side with each employee row;
-- all five share a single named window partitioned by department.
SELECT e.employee_id,
    e.department_id,
    e.salary,
    sum(e.salary) OVER dept AS sum_sal_expense,
    avg(e.salary) OVER dept AS avg_sal_expense,
    min(e.salary) OVER dept AS min_sal_expense,
    max(e.salary) OVER dept AS max_sal_expense,
    count(e.salary) OVER dept AS cnt_sal_expense
FROM employees AS e
WINDOW dept AS (PARTITION BY e.department_id)
ORDER BY e.department_id

In [38]:
%%sql

-- Each product's daily revenue next to that day's total, minimum and maximum revenue.
SELECT dpr.order_date,
    dpr.order_item_product_id,
    dpr.revenue,
    sum(dpr.revenue) OVER per_day AS sum_revenue,
    min(dpr.revenue) OVER per_day AS min_revenue,
    max(dpr.revenue) OVER per_day AS max_revenue
FROM daily_product_revenue AS dpr
WINDOW per_day AS (PARTITION BY dpr.order_date)
ORDER BY dpr.order_date,
    dpr.revenue DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_item_product_id,revenue,sum_revenue,min_revenue,max_revenue
2013-07-25 00:00:00,1004,5599.72,31547.23,49.98,5599.72
2013-07-25 00:00:00,191,5099.49,31547.23,49.98,5599.72
2013-07-25 00:00:00,957,4499.7,31547.23,49.98,5599.72
2013-07-25 00:00:00,365,3359.44,31547.23,49.98,5599.72
2013-07-25 00:00:00,1073,2999.85,31547.23,49.98,5599.72
2013-07-25 00:00:00,1014,2798.88,31547.23,49.98,5599.72
2013-07-25 00:00:00,403,1949.85,31547.23,49.98,5599.72
2013-07-25 00:00:00,502,1650.0,31547.23,49.98,5599.72
2013-07-25 00:00:00,627,1079.73,31547.23,49.98,5599.72
2013-07-25 00:00:00,226,599.99,31547.23,49.98,5599.72


## Analytic Functions – Windowing

### Getting LEAD and LAG values

Let us understand LEAD and LAG functions to get column values from following or prior records.

Here is the example where we can get prior or following records based on **ORDER BY** within **OVER** Clause.

In [39]:
%%sql

-- Two ways to fetch the previous day's values:
-- lead() over dates sorted newest-first, and lag() over dates sorted oldest-first.
SELECT t.*,
    lead(t.order_date) OVER newest_first AS prior_date,
    lead(t.revenue) OVER newest_first AS prior_revenue,
    lag(t.order_date) OVER oldest_first AS lag_prior_date,
    lag(t.revenue) OVER oldest_first AS lag_prior_revenue
FROM daily_revenue AS t
WINDOW newest_first AS (ORDER BY t.order_date DESC),
    oldest_first AS (ORDER BY t.order_date)
ORDER BY t.order_date DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,revenue,prior_date,prior_revenue,lag_prior_date,lag_prior_revenue
2014-07-24 00:00:00,50885.19,2014-07-23 00:00:00,38795.23,2014-07-23 00:00:00,38795.23
2014-07-23 00:00:00,38795.23,2014-07-22 00:00:00,36717.24,2014-07-22 00:00:00,36717.24
2014-07-22 00:00:00,36717.24,2014-07-21 00:00:00,51427.7,2014-07-21 00:00:00,51427.7
2014-07-21 00:00:00,51427.7,2014-07-20 00:00:00,60047.45,2014-07-20 00:00:00,60047.45
2014-07-20 00:00:00,60047.45,2014-07-19 00:00:00,38420.99,2014-07-19 00:00:00,38420.99
2014-07-19 00:00:00,38420.99,2014-07-18 00:00:00,43856.6,2014-07-18 00:00:00,43856.6
2014-07-18 00:00:00,43856.6,2014-07-17 00:00:00,36384.77,2014-07-17 00:00:00,36384.77
2014-07-17 00:00:00,36384.77,2014-07-16 00:00:00,43011.92,2014-07-16 00:00:00,43011.92
2014-07-16 00:00:00,43011.92,2014-07-15 00:00:00,53480.23,2014-07-15 00:00:00,53480.23
2014-07-15 00:00:00,53480.23,2014-07-14 00:00:00,29937.52,2014-07-14 00:00:00,29937.52


In [40]:
%%sql

-- Values from the row 7 positions later in newest-first order
-- (seven rows earlier in time), shown for the most recent dates.
SELECT t.*,
    lead(t.order_date, 7) OVER newest_first AS prior_date,
    lead(t.revenue, 7) OVER newest_first AS prior_revenue
FROM daily_revenue AS t
WINDOW newest_first AS (ORDER BY t.order_date DESC)
ORDER BY t.order_date DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,revenue,prior_date,prior_revenue
2014-07-24 00:00:00,50885.19,2014-07-17 00:00:00,36384.77
2014-07-23 00:00:00,38795.23,2014-07-16 00:00:00,43011.92
2014-07-22 00:00:00,36717.24,2014-07-15 00:00:00,53480.23
2014-07-21 00:00:00,51427.7,2014-07-14 00:00:00,29937.52
2014-07-20 00:00:00,60047.45,2014-07-13 00:00:00,40410.99
2014-07-19 00:00:00,38420.99,2014-07-12 00:00:00,38449.77
2014-07-18 00:00:00,43856.6,2014-07-11 00:00:00,29596.32
2014-07-17 00:00:00,36384.77,2014-07-10 00:00:00,47826.02
2014-07-16 00:00:00,43011.92,2014-07-09 00:00:00,36929.91
2014-07-15 00:00:00,53480.23,2014-07-08 00:00:00,50434.81


In [41]:
%%sql

-- Same 7-row look-back, displayed from the earliest dates:
-- the first seven rows have no such row, so lead() yields NULL for them.
SELECT t.*,
    lead(t.order_date, 7) OVER newest_first AS prior_date,
    lead(t.revenue, 7) OVER newest_first AS prior_revenue
FROM daily_revenue AS t
WINDOW newest_first AS (ORDER BY t.order_date DESC)
ORDER BY t.order_date
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,revenue,prior_date,prior_revenue
2013-07-25 00:00:00,31547.23,,
2013-07-26 00:00:00,54713.23,,
2013-07-27 00:00:00,48411.48,,
2013-07-28 00:00:00,35672.03,,
2013-07-29 00:00:00,54579.7,,
2013-07-30 00:00:00,49329.29,,
2013-07-31 00:00:00,59212.49,,
2013-08-01 00:00:00,49160.08,2013-07-25 00:00:00,31547.23
2013-08-02 00:00:00,50688.58,2013-07-26 00:00:00,54713.23
2013-08-03 00:00:00,43416.74,2013-07-27 00:00:00,48411.48


In [44]:
%%sql

-- 7-row look-back with a default: the third argument to lead() (0.0) is used for
-- prior_revenue when no such row exists; prior_date has no default and stays NULL.
SELECT t.*,
    lead(t.order_date, 7) OVER newest_first AS prior_date,
    lead(t.revenue, 7, 0.0) OVER newest_first AS prior_revenue
FROM daily_revenue AS t
WINDOW newest_first AS (ORDER BY t.order_date DESC)
ORDER BY t.order_date
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,revenue,prior_date,prior_revenue
2013-07-25 00:00:00,31547.23,,0.0
2013-07-26 00:00:00,54713.23,,0.0
2013-07-27 00:00:00,48411.48,,0.0
2013-07-28 00:00:00,35672.03,,0.0
2013-07-29 00:00:00,54579.7,,0.0
2013-07-30 00:00:00,49329.29,,0.0
2013-07-31 00:00:00,59212.49,,0.0
2013-08-01 00:00:00,49160.08,2013-07-25 00:00:00,31547.23
2013-08-02 00:00:00,50688.58,2013-07-26 00:00:00,54713.23
2013-08-03 00:00:00,43416.74,2013-07-27 00:00:00,48411.48


In [45]:
%%sql

-- Daily product revenue by date, best-selling products first within each date.
SELECT dpr.*
FROM daily_product_revenue AS dpr
ORDER BY dpr.order_date,
    dpr.revenue DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_item_product_id,revenue
2013-07-25 00:00:00,1004,5599.72
2013-07-25 00:00:00,191,5099.49
2013-07-25 00:00:00,957,4499.7
2013-07-25 00:00:00,365,3359.44
2013-07-25 00:00:00,1073,2999.85
2013-07-25 00:00:00,1014,2798.88
2013-07-25 00:00:00,403,1949.85
2013-07-25 00:00:00,502,1650.0
2013-07-25 00:00:00,627,1079.73
2013-07-25 00:00:00,226,599.99


In [48]:
%%sql

-- For each product on a given day, the product and revenue that rank
-- immediately below it in that day's revenue ordering.
SELECT t.*,
    lead(t.order_item_product_id) OVER day_by_revenue AS next_product_id,
    lead(t.revenue) OVER day_by_revenue AS next_revenue
FROM daily_product_revenue AS t
WINDOW day_by_revenue AS (
    PARTITION BY t.order_date
    ORDER BY t.revenue DESC
)
ORDER BY t.order_date,
    t.revenue DESC
LIMIT 30

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
30 rows affected.


order_date,order_item_product_id,revenue,next_product_id,next_revenue
2013-07-25 00:00:00,1004,5599.72,191.0,5099.49
2013-07-25 00:00:00,191,5099.49,957.0,4499.7
2013-07-25 00:00:00,957,4499.7,365.0,3359.44
2013-07-25 00:00:00,365,3359.44,1073.0,2999.85
2013-07-25 00:00:00,1073,2999.85,1014.0,2798.88
2013-07-25 00:00:00,1014,2798.88,403.0,1949.85
2013-07-25 00:00:00,403,1949.85,502.0,1650.0
2013-07-25 00:00:00,502,1650.0,627.0,1079.73
2013-07-25 00:00:00,627,1079.73,226.0,599.99
2013-07-25 00:00:00,226,599.99,24.0,319.96


### Getting first and last values

Let us see how we can get the first and last values based on the criteria. In some cases we can use min or max as well.

Here is the example of using first_value.

In [61]:
%%sql

-- Attach to every row the product id and revenue of the day's top-revenue product.
SELECT t.*,
    first_value(t.order_item_product_id) OVER day_by_revenue AS first_product_id,
    first_value(t.revenue) OVER day_by_revenue AS first_revenue
FROM daily_product_revenue AS t
WINDOW day_by_revenue AS (
    PARTITION BY t.order_date
    ORDER BY t.revenue DESC
)
ORDER BY t.order_date,
    t.revenue DESC
LIMIT 10

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
10 rows affected.


order_date,order_item_product_id,revenue,first_product_id,first_revenue
2013-07-25 00:00:00,1004,5599.72,1004,5599.72
2013-07-25 00:00:00,191,5099.49,1004,5599.72
2013-07-25 00:00:00,957,4499.7,1004,5599.72
2013-07-25 00:00:00,365,3359.44,1004,5599.72
2013-07-25 00:00:00,1073,2999.85,1004,5599.72
2013-07-25 00:00:00,1014,2798.88,1004,5599.72
2013-07-25 00:00:00,403,1949.85,1004,5599.72
2013-07-25 00:00:00,502,1650.0,1004,5599.72
2013-07-25 00:00:00,627,1079.73,1004,5599.72
2013-07-25 00:00:00,226,599.99,1004,5599.72


Let us see an example with last_value. While using last_value we need to specify the window frame explicitly, for example **ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING** or **ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING**.
* By default, when `ORDER BY` is specified, the frame is `RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`.
* The last value within that default frame is the current record (or the last of its peers when several rows tie on the `ORDER BY` columns).
* To get the right value, we have to change the windowing clause to `ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING`.

In [50]:
%%sql

-- last_value over revenue sorted ascending within each day. The explicit frame
-- extends from the current row to the end of the partition, so the function
-- sees the final (highest-revenue) row instead of stopping at the current row.
SELECT t.*,
    last_value(t.order_item_product_id) OVER day_to_end AS last_product_id,
    last_value(t.revenue) OVER day_to_end AS last_revenue
FROM daily_product_revenue AS t
WINDOW day_to_end AS (
    PARTITION BY t.order_date
    ORDER BY t.revenue
    ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
)
ORDER BY t.order_date,
    t.revenue DESC
LIMIT 30

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
30 rows affected.


order_date,order_item_product_id,revenue,last_product_id,last_revenue
2013-07-25 00:00:00,1004,5599.72,1004,5599.72
2013-07-25 00:00:00,191,5099.49,1004,5599.72
2013-07-25 00:00:00,957,4499.7,1004,5599.72
2013-07-25 00:00:00,365,3359.44,1004,5599.72
2013-07-25 00:00:00,1073,2999.85,1004,5599.72
2013-07-25 00:00:00,1014,2798.88,1004,5599.72
2013-07-25 00:00:00,403,1949.85,1004,5599.72
2013-07-25 00:00:00,502,1650.0,1004,5599.72
2013-07-25 00:00:00,627,1079.73,1004,5599.72
2013-07-25 00:00:00,226,599.99,1004,5599.72


## Analytic Functions – Ranking

Let us see how we can get sparse ranks using **rank** function.

* If we have to get ranks globally, we just need to specify **ORDER BY**
* If we have to get ranks within a key then we need to specify **PARTITION BY** and then **ORDER BY**.
* By default **ORDER BY** will sort the data in ascending order. We can change the order by passing **DESC** after order by.

Here is an example to assign sparse ranks using daily_product_revenue within each day based on revenue.

In [51]:
%%sql

-- Rank products within each day by revenue, highest first.
SELECT t.*,
    rank() OVER day_by_revenue AS rnk
FROM daily_product_revenue AS t
WINDOW day_by_revenue AS (
    PARTITION BY t.order_date
    ORDER BY t.revenue DESC
)
ORDER BY t.order_date,
    t.revenue DESC
LIMIT 30

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
30 rows affected.


order_date,order_item_product_id,revenue,rnk
2013-07-25 00:00:00,1004,5599.72,1
2013-07-25 00:00:00,191,5099.49,2
2013-07-25 00:00:00,957,4499.7,3
2013-07-25 00:00:00,365,3359.44,4
2013-07-25 00:00:00,1073,2999.85,5
2013-07-25 00:00:00,1014,2798.88,6
2013-07-25 00:00:00,403,1949.85,7
2013-07-25 00:00:00,502,1650.0,8
2013-07-25 00:00:00,627,1079.73,9
2013-07-25 00:00:00,226,599.99,10


Here is another example to assign sparse ranks using the employees data set within each department.

In [None]:
%%sql

/* Compute rank, dense_rank and row_number side by side over one shared
   window, partitioned by department and ordered by salary descending. */
SELECT
    e.employee_id,
    e.department_id,
    e.salary,
    rank() OVER dept_by_salary AS rnk,
    dense_rank() OVER dept_by_salary AS drnk,
    row_number() OVER dept_by_salary AS rn
FROM employees AS e
WINDOW dept_by_salary AS (
    PARTITION BY e.department_id
    ORDER BY e.salary DESC
)
ORDER BY
    e.department_id ASC,
    e.salary DESC

In [None]:
%%sql

/* Preview all columns of the first ten employees rows when sorted by
   ascending salary. */
SELECT e.*
FROM employees AS e
ORDER BY e.salary ASC
LIMIT 10

In [None]:
%%sql

/* Dense rank across the whole employees table with no partitioning,
   ordered by salary descending. */
SELECT
    e.employee_id,
    e.salary,
    dense_rank() OVER salary_high_to_low AS drnk
FROM employees AS e
WINDOW salary_high_to_low AS (
    ORDER BY e.salary DESC
)

Let us understand the difference between **rank**, **dense_rank** and **row_number**.

* We can use any of these functions to generate ranks when the rank field does not have duplicates.
* When the rank field has duplicates, row_number should not be used, as it generates a unique number for each record within the partition.
* **rank** will skip the ranks in between if multiple people get the same rank, while **dense_rank** continues with the next number.

In [55]:
%%sql

/* Show rank, dense_rank and row_number next to each other for every daily
   product row, all using the same per-day window ordered by revenue
   descending. */
SELECT
    dpr.*,
    rank() OVER day_by_revenue AS rnk,
    dense_rank() OVER day_by_revenue AS drnk,
    row_number() OVER day_by_revenue AS rn
FROM daily_product_revenue AS dpr
WINDOW day_by_revenue AS (
    PARTITION BY dpr.order_date
    ORDER BY dpr.revenue DESC
)
ORDER BY
    dpr.order_date ASC,
    dpr.revenue DESC
LIMIT 30

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
30 rows affected.


order_date,order_item_product_id,revenue,rnk,drnk,rn
2013-07-25 00:00:00,1004,5599.72,1,1,1
2013-07-25 00:00:00,191,5099.49,2,2,2
2013-07-25 00:00:00,957,4499.7,3,3,3
2013-07-25 00:00:00,365,3359.44,4,4,4
2013-07-25 00:00:00,1073,2999.85,5,5,5
2013-07-25 00:00:00,1014,2798.88,6,6,6
2013-07-25 00:00:00,403,1949.85,7,7,7
2013-07-25 00:00:00,502,1650.0,8,8,8
2013-07-25 00:00:00,627,1079.73,9,9,9
2013-07-25 00:00:00,226,599.99,10,10,10


## Final Solution – Top 5 Daily Products

In [57]:
%%sql

/* Top five products per day by revenue, computed from orders and
   order_items in two named steps and then filtered on the dense rank. */
WITH daily_revenue AS (
    /* Revenue per order date and product, restricted to orders whose
       status is COMPLETE or CLOSED, rounded to two decimals. */
    SELECT
        o.order_date,
        oi.order_item_product_id,
        round(sum(oi.order_item_subtotal)::numeric, 2) AS revenue
    FROM orders AS o
    INNER JOIN order_items AS oi
        ON o.order_id = oi.order_item_order_id
    WHERE o.order_status IN ('COMPLETE', 'CLOSED')
    GROUP BY
        o.order_date,
        oi.order_item_product_id
),
ranked_daily_revenue AS (
    /* Dense rank inside each order_date, ordered by revenue descending. */
    SELECT
        dr.*,
        dense_rank() OVER (
            PARTITION BY dr.order_date
            ORDER BY dr.revenue DESC
        ) AS drnk
    FROM daily_revenue AS dr
)
SELECT rdr.*
FROM ranked_daily_revenue AS rdr
WHERE rdr.drnk <= 5
ORDER BY
    rdr.order_date ASC,
    rdr.revenue DESC
LIMIT 20

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
20 rows affected.


order_date,order_item_product_id,revenue,drnk
2013-07-25 00:00:00,1004,5599.72,1
2013-07-25 00:00:00,191,5099.49,2
2013-07-25 00:00:00,957,4499.7,3
2013-07-25 00:00:00,365,3359.44,4
2013-07-25 00:00:00,1073,2999.85,5
2013-07-26 00:00:00,1004,10799.46,1
2013-07-26 00:00:00,365,7978.67,2
2013-07-26 00:00:00,957,6899.54,3
2013-07-26 00:00:00,191,6799.32,4
2013-07-26 00:00:00,1014,4798.08,5


In [59]:
%%sql

/* Top five products per day by revenue, reading daily_product_revenue:
   dense rank inside each order_date first, then keep ranks up to five. */
WITH ranked_daily_revenue AS (
    SELECT
        dpr.*,
        dense_rank() OVER (
            PARTITION BY dpr.order_date
            ORDER BY dpr.revenue DESC
        ) AS drnk
    FROM daily_product_revenue AS dpr
)
SELECT rdr.*
FROM ranked_daily_revenue AS rdr
WHERE rdr.drnk <= 5
ORDER BY
    rdr.order_date ASC,
    rdr.revenue DESC
LIMIT 20

 * postgresql://itversity_retail_user:***@localhost:5432/itversity_retail_db
20 rows affected.


order_date,order_item_product_id,revenue,drnk
2013-07-25 00:00:00,1004,5599.72,1
2013-07-25 00:00:00,191,5099.49,2
2013-07-25 00:00:00,957,4499.7,3
2013-07-25 00:00:00,365,3359.44,4
2013-07-25 00:00:00,1073,2999.85,5
2013-07-26 00:00:00,1004,10799.46,1
2013-07-26 00:00:00,365,7978.67,2
2013-07-26 00:00:00,957,6899.54,3
2013-07-26 00:00:00,191,6799.32,4
2013-07-26 00:00:00,1014,4798.08,5
