#### Code for blog at **[How to use nested data types effectively in SQL](https://www.startdataengineering.com/post/use-structs-sql/)**

# [SETUP] 

In [None]:
# Run the local setup script against tpch_solutions.db, the same file the DuckDB connection cell opens.
# NOTE(review): presumably setup.py creates and populates the TPC-H tables queried here - confirm in setup.py.
! python ./setup.py --db_file tpch_solutions.db

## Connect to DuckDB

In [None]:
import duckdb
import pandas as pd

# Load the `sql` IPython extension so the %sql / %%sql magics are available.
# NOTE(review): presumably provided by JupySQL (or ipython-sql) - confirm in the project requirements.
%load_ext sql
# Open the DuckDB database file that was passed to setup.py in the setup cell.
conn = duckdb.connect("tpch_solutions.db")
# Hand the open connection to the SQL magic under the alias `duckdb_solutions`.
%sql conn --alias duckdb_solutions

In [None]:
%%sql
-- List the tables available in the connected database.
SHOW TABLES;

# Exercise 1

In [None]:
# Exercise: add region attributes to the above query

In [None]:
%%sql
-- Hierarchical data with region attributes.
-- Every lineitem row is extended with two nested structs,
--   customer (with its nation, which in turn holds its region), reached through the order
--   supplier (with its nation, which in turn holds its region), reached through l_suppkey
-- LIMIT 5 keeps the output to a small preview.
SELECT
    li.*,
    struct_pack(
        id         := cust.c_custkey,
        name       := cust.c_name,
        address    := cust.c_address,
        nationkey  := cust.c_nationkey,
        phone      := cust.c_phone,
        acctbal    := cust.c_acctbal,
        mktsegment := cust.c_mktsegment,
        comment    := cust.c_comment,
        nation     := struct_pack(
            nationkey := cust_nation.n_nationkey,
            name      := cust_nation.n_name,
            regionkey := cust_nation.n_regionkey,
            comment   := cust_nation.n_comment,
            region    := struct_pack(
                regionkey := cust_region.r_regionkey,
                name      := cust_region.r_name,
                comment   := cust_region.r_comment
            )
        )
    ) AS customer,
    struct_pack(
        id        := supp.s_suppkey,
        name      := supp.s_name,
        address   := supp.s_address,
        nationkey := supp.s_nationkey,
        phone     := supp.s_phone,
        acctbal   := supp.s_acctbal,
        comment   := supp.s_comment,
        nation    := struct_pack(
            nationkey := supp_nation.n_nationkey,
            name      := supp_nation.n_name,
            regionkey := supp_nation.n_regionkey,
            comment   := supp_nation.n_comment,
            region    := struct_pack(
                regionkey := supp_region.r_regionkey,
                name      := supp_region.r_name,
                comment   := supp_region.r_comment
            )
        )
    ) AS supplier
FROM lineitem li
-- customer side of the hierarchy
LEFT JOIN orders o             ON li.l_orderkey = o.o_orderkey
LEFT JOIN customer cust        ON o.o_custkey = cust.c_custkey
LEFT JOIN nation cust_nation   ON cust.c_nationkey = cust_nation.n_nationkey
LEFT JOIN region cust_region   ON cust_nation.n_regionkey = cust_region.r_regionkey
-- supplier side of the hierarchy
LEFT JOIN supplier supp        ON li.l_suppkey = supp.s_suppkey
LEFT JOIN nation supp_nation   ON supp.s_nationkey = supp_nation.n_nationkey
LEFT JOIN region supp_region   ON supp_nation.n_regionkey = supp_region.r_regionkey
LIMIT 5;

# Exercise 2

In [None]:
%%sql
-- Build wide_orders with one row per order, carrying
--   lineitems, a list holding one struct per lineitem of the order
--   customer, a struct with the customer attributes and a nested nation struct
-- IF NOT EXISTS turns a re-run into a no-op once the table is there.
CREATE TABLE IF NOT EXISTS wide_orders AS
WITH line_items AS (
    -- Collapse the lineitems of each order into a single list of structs.
    SELECT
        li.l_orderkey AS orderkey,
        array_agg(struct_pack(
            lineitemkey   := li.l_linenumber,
            partkey       := li.l_partkey,
            suppkey       := li.l_suppkey,
            quantity      := li.l_quantity,
            extendedprice := li.l_extendedprice,
            discount      := li.l_discount,
            tax           := li.l_tax,
            returnflag    := li.l_returnflag,
            linestatus    := li.l_linestatus,
            shipdate      := li.l_shipdate,
            commitdate    := li.l_commitdate,
            receiptdate   := li.l_receiptdate,
            shipinstruct  := li.l_shipinstruct,
            shipmode      := li.l_shipmode,
            comment       := li.l_comment
        )) AS lineitems
    FROM lineitem li
    GROUP BY li.l_orderkey
)
SELECT
    o.*,
    items.lineitems,
    struct_pack(
        id         := cust.c_custkey,
        name       := cust.c_name,
        address    := cust.c_address,
        nationkey  := cust.c_nationkey,
        phone      := cust.c_phone,
        acctbal    := cust.c_acctbal,
        mktsegment := cust.c_mktsegment,
        comment    := cust.c_comment,
        nation     := struct_pack(
            nationkey := cust_nation.n_nationkey,
            name      := cust_nation.n_name,
            regionkey := cust_nation.n_regionkey,
            comment   := cust_nation.n_comment
        )
    ) AS customer
FROM orders o
LEFT JOIN line_items items   ON o.o_orderkey = items.orderkey
LEFT JOIN customer cust      ON o.o_custkey = cust.c_custkey
LEFT JOIN nation cust_nation ON cust.c_nationkey = cust_nation.n_nationkey;

In [None]:
# Exercise: sort the ARRAY of lineitems. How is it sorted? What do you think the sort order is based on?

In [None]:
%%sql
-- Show each order with its original and its sorted lineitems side by side.
-- o.* already contains the customer column of wide_orders, so it is excluded from
-- the star and selected once at the end instead of being returned twice.
-- array_sort compares the structs field by field in declaration order, so the
-- first field (lineitemkey) drives the resulting order.
SELECT
    o.* EXCLUDE (customer),
    array_sort(o.lineitems) AS sorted_lineitems,
    o.customer
FROM
    wide_orders o
LIMIT 5;

We can use the `array_sort` function in DuckDB. This function sorts the array using the lexicographic ordering of the fields within each struct, which means it **starts sorting by the first field in the struct, and if two elements are equal on that field, it moves to the next field**.

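In our case, each struct in the `lineitems` array contains multiple fields (`lineitemkey`, `partkey`, `suppkey`, etc.). The sort order is therefore based on the first field listed within each `struct_pack`, which in this case is `lineitemkey`; the remaining fields only act as tie-breakers. DuckDB sorts the `lineitems` array in ascending order of this key by default; pass a sort direction such as `'DESC'` as the second argument to `array_sort` to reverse it.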

# Exercise 3

In [None]:
# Exercise: create a wide_orders_v2 table that combines all the tables in the TPC-H data model

In [None]:
%%sql
-- Build wide_orders_v2 with one row per order, nesting every other table of the model.
--   lineitems, a list of structs, each one embedding its part, its supplier
--              (with nation and region) and the matching partsupp row
--   customer, a struct embedding the customer nation and that nation region
-- IF NOT EXISTS turns a re-run into a no-op once the table is there.
CREATE TABLE IF NOT EXISTS wide_orders_v2 AS
WITH line_items AS (
    -- Enrich each lineitem with its dimensions, then collapse per order.
    SELECT
        li.l_orderkey AS orderkey,
        array_agg(struct_pack(
            lineitemkey   := li.l_linenumber,
            partkey       := li.l_partkey,
            suppkey       := li.l_suppkey,
            quantity      := li.l_quantity,
            extendedprice := li.l_extendedprice,
            discount      := li.l_discount,
            tax           := li.l_tax,
            returnflag    := li.l_returnflag,
            linestatus    := li.l_linestatus,
            shipdate      := li.l_shipdate,
            commitdate    := li.l_commitdate,
            receiptdate   := li.l_receiptdate,
            shipinstruct  := li.l_shipinstruct,
            shipmode      := li.l_shipmode,
            comment       := li.l_comment,
            part := struct_pack(
                id          := prt.p_partkey,
                name        := prt.p_name,
                mfgr        := prt.p_mfgr,
                brand       := prt.p_brand,
                type        := prt.p_type,
                size        := prt.p_size,
                container   := prt.p_container,
                retailprice := prt.p_retailprice,
                comment     := prt.p_comment
            ),
            supplier := struct_pack(
                id        := supp.s_suppkey,
                name      := supp.s_name,
                address   := supp.s_address,
                nationkey := supp.s_nationkey,
                phone     := supp.s_phone,
                acctbal   := supp.s_acctbal,
                comment   := supp.s_comment,
                nation    := struct_pack(
                    nationkey := supp_nation.n_nationkey,
                    name      := supp_nation.n_name,
                    regionkey := supp_nation.n_regionkey,
                    comment   := supp_nation.n_comment,
                    region    := struct_pack(
                        regionkey := supp_region.r_regionkey,
                        name      := supp_region.r_name,
                        comment   := supp_region.r_comment
                    )
                )
            ),
            partsupp := struct_pack(
                partkey    := psup.ps_partkey,
                suppkey    := psup.ps_suppkey,
                availqty   := psup.ps_availqty,
                supplycost := psup.ps_supplycost,
                comment    := psup.ps_comment
            )
        )) AS lineitems
    FROM lineitem li
    LEFT JOIN part prt           ON li.l_partkey = prt.p_partkey
    LEFT JOIN supplier supp      ON li.l_suppkey = supp.s_suppkey
    LEFT JOIN nation supp_nation ON supp.s_nationkey = supp_nation.n_nationkey
    LEFT JOIN region supp_region ON supp_nation.n_regionkey = supp_region.r_regionkey
    -- partsupp is matched on both the part key and the supplier key
    LEFT JOIN partsupp psup      ON li.l_partkey = psup.ps_partkey AND li.l_suppkey = psup.ps_suppkey
    GROUP BY li.l_orderkey
)
SELECT
    o.*,
    items.lineitems,
    struct_pack(
        id         := cust.c_custkey,
        name       := cust.c_name,
        address    := cust.c_address,
        nationkey  := cust.c_nationkey,
        phone      := cust.c_phone,
        acctbal    := cust.c_acctbal,
        mktsegment := cust.c_mktsegment,
        comment    := cust.c_comment,
        nation     := struct_pack(
            nationkey := cust_nation.n_nationkey,
            name      := cust_nation.n_name,
            regionkey := cust_nation.n_regionkey,
            comment   := cust_nation.n_comment,
            region    := struct_pack(
                regionkey := cust_region.r_regionkey,
                name      := cust_region.r_name,
                comment   := cust_region.r_comment
            )
        )
    ) AS customer
FROM orders o
LEFT JOIN line_items items   ON o.o_orderkey = items.orderkey
LEFT JOIN customer cust      ON o.o_custkey = cust.c_custkey
LEFT JOIN nation cust_nation ON cust.c_nationkey = cust_nation.n_nationkey
LEFT JOIN region cust_region ON cust_nation.n_regionkey = cust_region.r_regionkey;


In [None]:
%%sql
-- Peek at a single row of the fully nested table.
SELECT *
FROM wide_orders_v2
LIMIT 1;