# Setup

In [74]:
spark

# DDL 

In [75]:
# Drop existing tables if they exist, so the CREATE TABLE cell below always
# starts from a clean slate and the notebook can be re-run from the top.
# WARNING: this is destructive - any data already stored in these prod.db
# tables is removed.
for table_name in (
    "customer",
    "lineitem",
    "nation",
    "orders",
    "part",
    "partsupp",
    "region",
    "supplier",
):
    spark.sql(f"DROP TABLE IF EXISTS prod.db.{table_name}")

DataFrame[]

In [76]:
# Create tables using Iceberg format
# Eight tables are created in the prod.db namespace: customer, lineitem,
# nation, orders, part, partsupp, region and supplier. Each one is declared
# with the table property 'format-version' = '2'.
# CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched, so a
# schema edit made here only takes effect once that table has been dropped.

# customer: c_nationkey is BIGINT here, while nation.n_nationkey is INT.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.customer (
  c_custkey    BIGINT,
  c_name       STRING,
  c_address    STRING,
  c_nationkey  BIGINT,
  c_phone      STRING,
  c_acctbal    DECIMAL(15,2),
  c_mktsegment STRING,
  c_comment    STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# lineitem: l_orderkey and l_shipdate are the columns used against
# orders.o_orderkey / orders.o_orderdate in the JOIN examples of this notebook.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.lineitem (
  l_orderkey      BIGINT,
  l_partkey       BIGINT,
  l_suppkey       BIGINT,
  l_linenumber    INT,
  l_quantity      DECIMAL(15,2),
  l_extendedprice DECIMAL(15,2),
  l_discount      DECIMAL(15,2),
  l_tax           DECIMAL(15,2),
  l_returnflag    STRING,
  l_linestatus    STRING,
  l_shipdate      DATE,
  l_commitdate    DATE,
  l_receiptdate   DATE,
  l_shipinstruct  STRING,
  l_shipmode      STRING,
  l_comment       STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# nation
# NOTE(review): n_nationkey and n_regionkey are INT, whereas c_nationkey and
# s_nationkey are BIGINT - confirm the width difference is intended.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.nation (
  n_nationkey INT,
  n_name      STRING,
  n_regionkey INT,
  n_comment   STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# orders
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.orders (
  o_orderkey      BIGINT,
  o_custkey       BIGINT,
  o_orderstatus   STRING,
  o_totalprice    DECIMAL(15,2),
  o_orderdate     DATE,
  o_orderpriority STRING,
  o_clerk         STRING,
  o_shippriority  INT,
  o_comment       STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# part
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.part (
  p_partkey     BIGINT,
  p_name        STRING,
  p_mfgr        STRING,
  p_brand       STRING,
  p_type        STRING,
  p_size        INT,
  p_container   STRING,
  p_retailprice DECIMAL(15,2),
  p_comment     STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# partsupp
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.partsupp (
  ps_partkey    BIGINT,
  ps_suppkey    BIGINT,
  ps_availqty   INT,
  ps_supplycost DECIMAL(15,2),
  ps_comment    STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# region
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.region (
  r_regionkey INT,
  r_name      STRING,
  r_comment   STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

# supplier: s_nationkey is BIGINT, like customer.c_nationkey.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.supplier (
  s_suppkey   BIGINT,
  s_name      STRING,
  s_address   STRING,
  s_nationkey BIGINT,
  s_phone     STRING,
  s_acctbal   DECIMAL(15,2),
  s_comment   STRING
) USING iceberg
TBLPROPERTIES (
  'format-version' = '2'
)
""")

DataFrame[]

# Insert data into upstream tables

In [77]:
from pathlib import Path
def upsert_data(data_name, data_path = Path('/home/iceberg/notebooks/data')):
    """Load ``<data_path>/<data_name>.csv`` into the table ``prod.db.<data_name>``.

    The CSV file is read with a header row, a comma delimiter and Spark's
    schema inference enabled. The resulting DataFrame is then written to the
    table through ``writeTo(...).overwritePartitions()``.

    NOTE(review): despite the name, no key-based merge is performed here; the
    write is an overwrite. Column types come from CSV inference, not from the
    table definition - confirm they line up with the target table.

    Args:
        data_name: Base name shared by the CSV file (without ``.csv``) and
            the target table inside ``prod.db``.
        data_path: Directory that contains the CSV files.
    """
    csv_path = data_path / f'{data_name}.csv'
    source_file = str(csv_path)
    print(f'Reading {data_name} data from {str(csv_path)}')

    # Configure the CSV reader once, then load the file.
    csv_reader = spark.read.format("csv").options(
        header="true",
        delimiter=",",
        inferSchema="true",
    )
    csv_frame = csv_reader.load(source_file)

    target_table = f"prod.db.{data_name}"
    csv_frame.writeTo(target_table).overwritePartitions()
    

In [78]:
# Load each CSV into the prod.db table of the same name, one after another.
for table_name in (
    'customer',
    'lineitem',
    'nation',
    'orders',
    'part',
    'partsupp',
    'region',
    'supplier',
):
    upsert_data(table_name)

Reading customer data from /home/iceberg/notebooks/data/customer.csv
Reading lineitem data from /home/iceberg/notebooks/data/lineitem.csv


                                                                                

Reading nation data from /home/iceberg/notebooks/data/nation.csv
Reading orders data from /home/iceberg/notebooks/data/orders.csv


                                                                                

Reading part data from /home/iceberg/notebooks/data/part.csv
Reading partsupp data from /home/iceberg/notebooks/data/partsupp.csv
Reading region data from /home/iceberg/notebooks/data/region.csv
Reading supplier data from /home/iceberg/notebooks/data/supplier.csv


In [79]:
%%sql
use prod.db;

# [GET DATA] Use SELECT...FROM, LIMIT, WHERE, & ORDER BY to read the required data from tables

The most common use for querying is to read data in our tables. We can do this using a `SELECT ... FROM` statement, as shown below.



In [80]:
%%sql
SELECT
  *
FROM
  customer
LIMIT
  1;

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
1,Customer#000000001,j5JsirBM9PsCy0O1m,15,25-989-741-2988,711.56,BUILDING,y final requests wake slyly quickly special accounts. blithely


In [81]:
%%sql
-- use * to specify all columns
SELECT
  *
FROM
  orders
LIMIT
  5;

o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment
1,9226,O,217279.39,1996-01-02,5-LOW,Clerk#000000951,0,ly express platelets. deposits acc
2,19501,O,58634.64,1996-12-01,1-URGENT,Clerk#000000880,0,ve the furiously fluffy dependencies. carefully regular
3,30829,F,226541.68,1993-10-14,5-LOW,Clerk#000000955,0,after the asymptotes. instructions cajole after the foxes. carefully unu
4,34195,O,29259.46,1995-10-11,5-LOW,Clerk#000000124,0,st the furiously bold pinto beans. furiously pending theodolites cajol
5,11122,F,127549.56,1994-07-30,5-LOW,Clerk#000000925,0,onic requests. carefully daring foxes among the carefu


However, running a `SELECT ... FROM` statement without restricting the output can cause issues when the data set is extensive. We can use `LIMIT n` to return at most `n` rows.


In [82]:
%%sql
-- use * to specify all columns
-- LIMIT 5 caps the result at five rows
SELECT
  *
FROM
  orders
LIMIT
  5;

o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment
1,9226,O,217279.39,1996-01-02,5-LOW,Clerk#000000951,0,ly express platelets. deposits acc
2,19501,O,58634.64,1996-12-01,1-URGENT,Clerk#000000880,0,ve the furiously fluffy dependencies. carefully regular
3,30829,F,226541.68,1993-10-14,5-LOW,Clerk#000000955,0,after the asymptotes. instructions cajole after the foxes. carefully unu
4,34195,O,29259.46,1995-10-11,5-LOW,Clerk#000000124,0,st the furiously bold pinto beans. furiously pending theodolites cajol
5,11122,F,127549.56,1994-07-30,5-LOW,Clerk#000000925,0,onic requests. carefully daring foxes among the carefu


In [83]:
%%sql
-- list specific column names to read data from only those columns
SELECT
  o_orderkey,
  o_totalprice
FROM
  orders
LIMIT
  5;

o_orderkey,o_totalprice
1,217279.39
2,58634.64
3,226541.68
4,29259.46
5,127549.56



We can use the `WHERE` clause if we want to get the rows that match specific criteria. We can specify one or more filters within the `WHERE` clause. 

The `WHERE` clause with more than one filter can use combinations of `AND` and `OR` criteria to combine the filter criteria, as shown below.



In [84]:
%%sql
-- all customer rows that have c_nationkey = 20
SELECT
  *
FROM
  customer
WHERE
  c_nationkey = 20
LIMIT
  10;

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular excuses. instructions wake furiously? quickly bold courts p
81,Customer#000000081,9jUFbrThIIeoUNd8 9,20,30-165-277-3269,2023.71,BUILDING,s against the ironic packages haggle carefully above the slyly express pinto beans
100,Customer#000000100,MBy6qq3OEGpV4u,20,30-749-445-4907,9889.89,FURNITURE,"dazzle carefully furiously final foxes. express, ironic packages among the qui"
210,Customer#000000210,",XOlfSzkZDAkm96adR41j,",20,30-876-248-9750,7250.14,HOUSEHOLD,es cajole bravely across the blithely
223,Customer#000000223,MyQxUcG0P QCetmG00GlF,20,30-193-643-1517,7476.2,BUILDING,"xcuses. silent theodolites across the carefully bold excuses sleep ironic, final courts. regular excuses"
228,Customer#000000228,"rZ1wxvHNByT71bUJWZjXMDROzlAch6FVu,dj8Zfq",20,30-435-915-1603,6868.12,FURNITURE,es. blithely permanent sentim
247,Customer#000000247,eSAW4XaakYFj2WToKU,20,30-151-905-3513,8495.92,HOUSEHOLD,"tes nag according to the blithe, even packages. sometimes unusual packages integrate"
278,Customer#000000278,XHAfHlrYQM3elmhJ,20,30-445-570-5841,7621.56,BUILDING,"ely unusual accounts. stealthily special instructions affix blithely. regular, ironic packages sleep even platelet"
285,Customer#000000285,rB6fTQKle64k3MvCCatad8DfMgR5OZA G4r,20,30-235-130-1313,7276.72,FURNITURE,slyly according to the blithely special instructions. ironic ideas against the blithely furious pac
321,Customer#000000321,LX0SKs3jqo9wH1yixIdGWp2ItclDiuL,20,30-114-675-9153,7718.77,FURNITURE,"ng the final, bold requests. furiously regular accounts inside the furiously pending"


In [85]:
%%sql
-- all customer rows that have c_nationkey = 20 and c_acctbal > 1000
SELECT
  *
FROM
  customer
WHERE
  c_nationkey = 20
  AND c_acctbal > 1000
LIMIT
  10;

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular excuses. instructions wake furiously? quickly bold courts p
81,Customer#000000081,9jUFbrThIIeoUNd8 9,20,30-165-277-3269,2023.71,BUILDING,s against the ironic packages haggle carefully above the slyly express pinto beans
100,Customer#000000100,MBy6qq3OEGpV4u,20,30-749-445-4907,9889.89,FURNITURE,"dazzle carefully furiously final foxes. express, ironic packages among the qui"
210,Customer#000000210,",XOlfSzkZDAkm96adR41j,",20,30-876-248-9750,7250.14,HOUSEHOLD,es cajole bravely across the blithely
223,Customer#000000223,MyQxUcG0P QCetmG00GlF,20,30-193-643-1517,7476.2,BUILDING,"xcuses. silent theodolites across the carefully bold excuses sleep ironic, final courts. regular excuses"
228,Customer#000000228,"rZ1wxvHNByT71bUJWZjXMDROzlAch6FVu,dj8Zfq",20,30-435-915-1603,6868.12,FURNITURE,es. blithely permanent sentim
247,Customer#000000247,eSAW4XaakYFj2WToKU,20,30-151-905-3513,8495.92,HOUSEHOLD,"tes nag according to the blithe, even packages. sometimes unusual packages integrate"
278,Customer#000000278,XHAfHlrYQM3elmhJ,20,30-445-570-5841,7621.56,BUILDING,"ely unusual accounts. stealthily special instructions affix blithely. regular, ironic packages sleep even platelet"
285,Customer#000000285,rB6fTQKle64k3MvCCatad8DfMgR5OZA G4r,20,30-235-130-1313,7276.72,FURNITURE,slyly according to the blithely special instructions. ironic ideas against the blithely furious pac
321,Customer#000000321,LX0SKs3jqo9wH1yixIdGWp2ItclDiuL,20,30-114-675-9153,7718.77,FURNITURE,"ng the final, bold requests. furiously regular accounts inside the furiously pending"


In [86]:
%%sql
-- all customer rows that have c_nationkey = 20 or c_acctbal > 1000
SELECT
  *
FROM
  customer
WHERE
  c_nationkey = 20
  OR c_acctbal > 1000
LIMIT
  10;

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
3,Customer#000000003,fkRGN8nY4pkE,1,11-719-748-3364,7498.12,AUTOMOBILE,fully. carefully silent instructions sleep alongside of the slyly regular asymptotes. quickly regular
4,Customer#000000004,4u58h fqkyE,4,14-128-190-5944,2866.83,MACHINERY,sublate. fluffily even instructions are about th
6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular excuses. instructions wake furiously? quickly bold courts p
7,Customer#000000007,8OkMVLQ1dK6Mbu6WG9 w4pLGQ n7MQ,18,28-190-982-9759,9561.95,AUTOMOBILE,"ounts. ironic, regular accounts sleep. final requests haggle quickly after the"
8,Customer#000000008,"j,pZ,Qp,qtFEo0r0c 92qobZtlhSuOqbE4JGV",17,27-147-574-9335,6819.74,BUILDING,riously final excuses sublate quickly among the fluffily even foxes. quickly final packages haggle furiously furi
9,Customer#000000009,vgIql8H6zoyuLMFNdAMLyE7 H9,8,18-338-906-3675,8324.07,FURNITURE,ss pinto beans believe slyly quiet deposits-- doggedly bold packages boost. quickly ironic de
10,Customer#000000010,"Vf mQ6Ug9Ucf5OKGYq fsaX AtfsO7,rwY",5,15-741-346-9870,2753.54,HOUSEHOLD,g quickly after the evenly bold
12,Customer#000000012,Sb4gxKs7W1AZa,13,23-791-276-1263,3396.49,HOUSEHOLD,ickly regular dependencies boost blithely around the slyly ironic theodolites. furiously special dolp
13,Customer#000000013,Ez3ax0D5HnUbeUVSxoX8a8B,3,13-761-547-5974,3857.34,BUILDING,quickly brave foxes. blithely even packages against the pinto beans boost furiously against the re
14,Customer#000000014,h3GFMzeFfYiamqr,1,11-845-129-3851,5266.3,FURNITURE,"r, express foxes cajole slyly aga"


In [87]:
%%sql
-- all customer rows that have (c_nationkey = 20 and c_acctbal > 1000) or rows that have c_nationkey = 11
SELECT
  *
FROM
  customer
WHERE
  (
    c_nationkey = 20
    AND c_acctbal > 1000
  )
  OR c_nationkey = 11
LIMIT
  10;

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular excuses. instructions wake furiously? quickly bold courts p
52,Customer#000000052,"UracAlAA8tSHL5V,poTZIOjh8o,",11,21-186-284-5998,5630.28,HOUSEHOLD,ts boost. carefully express waters across the blithely regular foxes inte
81,Customer#000000081,9jUFbrThIIeoUNd8 9,20,30-165-277-3269,2023.71,BUILDING,s against the ironic packages haggle carefully above the slyly express pinto beans
84,Customer#000000084,GB3sUmv RRXV DPzeOSbGxMIF9Z4Eq9 rop,11,21-546-818-3802,5174.71,FURNITURE,ounts. blithely express theodolites nag carefully ironic pinto beans. carefully final
100,Customer#000000100,MBy6qq3OEGpV4u,20,30-749-445-4907,9889.89,FURNITURE,"dazzle carefully furiously final foxes. express, ironic packages among the qui"
131,Customer#000000131,"ItdUFrHPZlzjZ, fo03sG4topAKTV",11,21-840-210-3572,8595.53,HOUSEHOLD,ly final Tiresias. slyly permanent theodolites cajole quickly. carefully unus
134,Customer#000000134,6I1TTaoG7bbiogCqRcptG6BYme,11,21-200-159-5932,4608.9,BUILDING,ly regular dolphins haggle blithely.
148,Customer#000000148,qJ8bFn4kwiit7RzwGrwo5m,11,21-562-498-6636,2135.6,HOUSEHOLD,e carefully pending ideas detect slyly along the furiously special excuses. instructions use carefully
190,Customer#000000190,"mY30kK8AfsTGrx,L4zI QlQnnmCUxikyc8QcZ7",11,21-730-373-8193,1657.46,AUTOMOBILE,y even packages engage furiously pending p
210,Customer#000000210,",XOlfSzkZDAkm96adR41j,",20,30-876-248-9750,7250.14,HOUSEHOLD,es cajole bravely across the blithely



We can combine multiple filter clauses, as seen above. We have seen examples of equals (`=`) and greater than (`>`) conditional operators. There are 6 **conditional operators**, they are

1. **<** Less than
2. **>** Greater than
3. **<=** Less than or equal to
4. **>=** Greater than or equal to
5. **=** Equal
6. **<>** and **!=** both represent Not equal (some DBs only support one of these)

Additionally, for string types, we can do **pattern matching with the `LIKE` condition**. In a `LIKE` pattern, `_` matches any single character and `%` matches zero or more characters, for example:




In [88]:
%%sql
-- all customer rows where the c_name has a 381 in it
SELECT
  *
FROM
  customer
WHERE
  c_name LIKE '%381%';

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
381,Customer#000000381,wXs5zN2nPHqPsfFO,5,15-860-208-7093,9931.71,BUILDING,"ithely along the regular, regular theodolites. fluffily pending"
1381,Customer#000001381,kAgLl7nUiPStCleWOiKevH3QAOhqtg9dVvrdN,22,32-418-900-6494,367.82,BUILDING,posits sleep carefully around the slyly e
2381,Customer#000002381,"z7B43DZ7RGlkgEi3YaXfy,Aw2SZepYurvII41Do",5,15-493-990-8133,412.99,FURNITURE,ul requests use slyly quickly even deposits. slyly pending
3381,Customer#000003381,03jULkpVTm92eKW24meIj,13,23-441-750-5088,2473.54,AUTOMOBILE,er the carefully bold multipliers doze blithely along the furiousl
3810,Customer#000003810,hlRTIO4e4HNahc8A D,18,28-881-994-8196,9906.8,FURNITURE,bold requests after the furiousl
3811,Customer#000003811,b6vEJqifAgSbGhzTwTz,22,32-962-997-2221,5697.04,FURNITURE,he carefully special packages. regular deposits sleep blithely bl
3812,Customer#000003812,HGYp5dZtlA,14,24-653-654-5032,4204.53,FURNITURE,y ironic requests believe blithely
3813,Customer#000003813,Aeky0En0JO5V1zRgFZ9EvCcBWaTmW,6,16-983-191-7833,-494.03,HOUSEHOLD,rding to the express foxes. bold platelets main
3814,Customer#000003814,FQ3lWCA3znooc3S SmDCfwqdn4R9,20,30-833-732-5401,-207.83,AUTOMOBILE,ounts alongside of the fluffily pendin
3815,Customer#000003815,S5SIUeDCuVOKRTZqZ5M4CC,19,29-968-870-7672,2887.99,FURNITURE,ccounts. fluffily bold requests sleep furio


In [89]:
%%sql
-- all customer rows where the c_name ends with a 381
SELECT
  *
FROM
  customer
WHERE
  c_name LIKE '%381';

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
381,Customer#000000381,wXs5zN2nPHqPsfFO,5,15-860-208-7093,9931.71,BUILDING,"ithely along the regular, regular theodolites. fluffily pending"
1381,Customer#000001381,kAgLl7nUiPStCleWOiKevH3QAOhqtg9dVvrdN,22,32-418-900-6494,367.82,BUILDING,posits sleep carefully around the slyly e
2381,Customer#000002381,"z7B43DZ7RGlkgEi3YaXfy,Aw2SZepYurvII41Do",5,15-493-990-8133,412.99,FURNITURE,ul requests use slyly quickly even deposits. slyly pending
3381,Customer#000003381,03jULkpVTm92eKW24meIj,13,23-441-750-5088,2473.54,AUTOMOBILE,er the carefully bold multipliers doze blithely along the furiousl
4381,Customer#000004381,MIQXH5W6Zsup5cVYfCtWupiJtgi,2,12-570-797-1472,2542.55,HOUSEHOLD,r deposits. carefully even packages along
5381,Customer#000005381,"bXQ,KuigJB1nASXN73PDwNOvXCIkp5",5,15-700-184-7619,4130.88,MACHINERY,es. carefully ironic ideas sleep blithely about the i
6381,Customer#000006381,BKfk07DtN45gg2w4mMUK1,7,17-877-502-9214,7346.88,HOUSEHOLD,"inal asymptotes boost. bold, ironic requests are along the regular, special packages. pending account"
7381,Customer#000007381,yq7RXRmclCUi6wJspelKaEWSJ TfycLah,20,30-666-139-1602,73.39,BUILDING,fluffily special requests are about the fluffily unusual foxes. final frets are slyly fluffily final deposits. even
8381,Customer#000008381,7kbg8wegbgGmgiW8OQ4SbJ8colXl6rpBmHudJ,0,10-177-308-9094,6674.59,AUTOMOBILE,uests against the carefully bold excuses sleep blithely slyly final instructions; unusual requests about
9381,Customer#000009381,BhXODcEOpwNg6,17,27-708-588-6706,4788.15,HOUSEHOLD,sual hockey players use above the final packages. quickly ironic excuses sleep. slyly final pa


In [90]:
%%sql
-- all customer rows where the c_name starts with a 381
SELECT
  *
FROM
  customer
WHERE
  c_name LIKE '381%';

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment


In [91]:
%%sql
-- all customer rows where the c_name has a combination of any character and 9 and 1
SELECT
  *
FROM
  customer
WHERE
  c_name LIKE '%_91%';

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
91,Customer#000000091,9Sce2m BjvDdjQkqMx8UnrUsJkk1IBAvZPTsA,8,18-239-400-3677,4643.14,AUTOMOBILE,yly ironic foxes lose slyly pending asymptotes. slyly final theodolites nag blithely ar
191,Customer#000000191,cZMo3 b4GwZtUmdbw,16,26-811-707-6869,2945.16,BUILDING,daringly quickly ironic foxes. care
291,Customer#000000291,2FfdPluDa2fxPaRh,8,18-657-656-2318,4261.68,HOUSEHOLD,"ld deposits. regularly ironic pinto beans cajole permanently furiously express packages. regular, unusual sheaves"
391,Customer#000000391,"BZ,850WgpZ0YSFs79Sb",11,21-604-451-4462,4801.3,HOUSEHOLD,tions wake about the blithely final instructions. excuses sleep regular requests. slyly
491,Customer#000000491,"AXsbcyMDujG,CAiEu4FmufbZ1k",0,10-856-259-7548,785.37,AUTOMOBILE,"ly final, even hockey players. carefully final ideas w"
591,Customer#000000591,wkmTqEmyI3UOEoG3q,20,30-584-309-7885,6344.66,MACHINERY,xpress deposits. slyly ironic ideas haggle: daringly even requests after the quickly final ideas boost q
691,Customer#000000691,0aGn3Vcf6ZKi82ogENfnso,16,26-741-688-4189,9566.15,MACHINERY,ven packages cajole fluffily fluffily unusual frays. ironic excuses sleep furiously. regular
791,Customer#000000791,Y14aVvMuDDgnmEuCEPK,13,23-575-775-4059,3694.81,HOUSEHOLD,beans use carefully furiously regular deposits. slyly
891,Customer#000000891,"r4,EU38BM0qdbjwqH",11,21-439-958-7518,6032.18,FURNITURE,ong the quickly quick patterns. slyly
910,Customer#000000910,bKS7h8o7ZEiRj,9,19-899-463-4292,5794.69,BUILDING,silent deposits are. blithely final foxes cajole slyly according to the furiously re


We can also filter for more than one value using `IN` and `NOT IN`.


In [92]:
%%sql
-- all customer rows which have c_nationkey = 10 or c_nationkey = 20
SELECT
  *
FROM
  customer
WHERE
  c_nationkey IN (10, 20);

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
6,Customer#000000006,"g1s,pzDenUEBW3O,2 pxu0f9n2g64rJrt5E",20,30-114-968-4951,7638.57,AUTOMOBILE,quickly silent asymptotes are slyly regular excuses. instructions wake furiously? quickly bold courts p
16,Customer#000000016,"P2IQMff18ercaYrO,40",10,20-781-609-3107,4681.03,FURNITURE,ests cajole. pinto beans detect slyly. final packages cajole slyly
41,Customer#000000041,jeREsFtCuMqEwdvTFqTkY2NzGRYDG1m,10,20-917-711-4011,270.95,HOUSEHOLD,uctions wake carefully pending deposits: pinto beans along the carefully final deposits sleep blithely a
49,Customer#000000049,PdKqM4TlA OLTjaeRmvH7QWDu80USfslgqutF,10,20-908-631-4424,4573.94,FURNITURE,quests haggle! furiously unusual theodolites cajole carefully. t
55,Customer#000000055,ti9p9XgdmFsjsQI6XQrISDUMFAusnmKS SBoCE,10,20-180-440-8525,4572.11,MACHINERY,dolites. bold instructions wake fluffily regular ideas. regular theodolites are furiously carefully unusual ac
56,Customer#000000056,qh212iaGWtoVp,10,20-895-685-6920,6530.86,FURNITURE,quickly final dependencies. even dependencies are slyly regularly silent theodolites. slow a
81,Customer#000000081,9jUFbrThIIeoUNd8 9,20,30-165-277-3269,2023.71,BUILDING,s against the ironic packages haggle carefully above the slyly express pinto beans
100,Customer#000000100,MBy6qq3OEGpV4u,20,30-749-445-4907,9889.89,FURNITURE,"dazzle carefully furiously final foxes. express, ironic packages among the qui"
104,Customer#000000104,SEOogsfT y09vI2z PcSTnI18U6rNTf,10,20-966-284-8065,-588.38,FURNITURE,efully bold deposits. carefully
105,Customer#000000105,"XI8hMXfr8bIKTGhIRS2sYs,p",10,20-793-553-6417,9091.82,MACHINERY,"solve pending, final requests. regular, bold platele"


In [93]:
%%sql
-- all customer rows whose c_nationkey is neither 10 nor 20
SELECT
  *
FROM
  customer
WHERE
  c_nationkey NOT IN (10, 20);

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
1,Customer#000000001,j5JsirBM9PsCy0O1m,15,25-989-741-2988,711.56,BUILDING,y final requests wake slyly quickly special accounts. blithely
2,Customer#000000002,487LW1dovn6Q4dMVymKwwLE9OKf3QG,13,23-768-687-3665,121.65,AUTOMOBILE,y carefully regular foxes. slyly regular requests about the bli
3,Customer#000000003,fkRGN8nY4pkE,1,11-719-748-3364,7498.12,AUTOMOBILE,fully. carefully silent instructions sleep alongside of the slyly regular asymptotes. quickly regular
4,Customer#000000004,4u58h fqkyE,4,14-128-190-5944,2866.83,MACHINERY,sublate. fluffily even instructions are about th
5,Customer#000000005,hwBtxkoBF qSW4KrIk5U 2B1AU7H,3,13-750-942-6364,794.47,HOUSEHOLD,equests haggle furiously against the pending packa
7,Customer#000000007,8OkMVLQ1dK6Mbu6WG9 w4pLGQ n7MQ,18,28-190-982-9759,9561.95,AUTOMOBILE,"ounts. ironic, regular accounts sleep. final requests haggle quickly after the"
8,Customer#000000008,"j,pZ,Qp,qtFEo0r0c 92qobZtlhSuOqbE4JGV",17,27-147-574-9335,6819.74,BUILDING,riously final excuses sublate quickly among the fluffily even foxes. quickly final packages haggle furiously furi
9,Customer#000000009,vgIql8H6zoyuLMFNdAMLyE7 H9,8,18-338-906-3675,8324.07,FURNITURE,ss pinto beans believe slyly quiet deposits-- doggedly bold packages boost. quickly ironic de
10,Customer#000000010,"Vf mQ6Ug9Ucf5OKGYq fsaX AtfsO7,rwY",5,15-741-346-9870,2753.54,HOUSEHOLD,g quickly after the evenly bold
11,Customer#000000011,cG48rYjF3Aw7xs hKUXXqmI,23,33-464-151-3439,-272.6,BUILDING,ng to the regular foxes. furiously final deposits across the final platelets cajole quickly above th


We can get the number of rows in a table using `count(*)` as shown below.


In [94]:
%%sql
SELECT
  COUNT(*)
FROM
  customer;

count(1)
37500


In [95]:
%%sql
SELECT
  COUNT(*)
FROM
  lineitem;

count(1)
1499579


If we want to get the rows sorted by values in a specific column, we use `ORDER BY`, for example.


In [96]:
%%sql
-- Will show the ten orders rows with the lowest o_custkey values
-- rows are ordered in ASC order by default
SELECT
  *
FROM
  orders
ORDER BY
  o_custkey
LIMIT
  10;

o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment
1226497,1,F,81079.08,1993-10-04,1-URGENT,Clerk#000000154,0,nal instructions are carefully along the bold instructions. blithely
164711,1,F,293295.8,1992-04-26,3-MEDIUM,Clerk#000000361,0,fully special ideas. fluffil
1071617,1,P,218197.4,1995-03-10,2-HIGH,Clerk#000000408,0,e carefully according to the furiously even packages. furious
385825,1,O,233371.19,1995-11-01,2-HIGH,Clerk#000000465,0,ly express accounts. special requests according to the carefull
1192231,1,O,160844.86,1996-06-03,1-URGENT,Clerk#000000978,0,ix waters. fluffily pending accounts sleep slyly. slyly special p
430243,1,F,43325.97,1994-12-24,4-NOT SPECIFIED,Clerk#000000121,0,e slyly along the furiously pending attainments
1374019,1,F,176119.85,1992-04-05,1-URGENT,Clerk#000000440,0,"uick, unusual courts"
454791,1,F,73591.5,1992-04-19,1-URGENT,Clerk#000000815,0,ccounts sleep carefully along the slyly ev
579908,1,O,36034.72,1996-12-09,5-LOW,Clerk#000000783,0,"t packages hinder bold, even dolphins. slyly ironic packages wake fluffily a"
1201223,2,O,155669.86,1996-01-13,5-LOW,Clerk#000000287,0,"g, ironic platelets! furiously even accounts wake blithely s"


In [97]:
%%sql
-- Will show the ten orders rows with the highest o_custkey values
-- DESC sorts in descending order
SELECT
  *
FROM
  orders
ORDER BY
  o_custkey DESC
LIMIT
  10;

o_orderkey,o_custkey,o_orderstatus,o_totalprice,o_orderdate,o_orderpriority,o_clerk,o_shippriority,o_comment
1136677,37499,O,96878.08,1998-03-31,2-HIGH,Clerk#000000875,0,nic epitaphs among the furiously fina
67298,37499,O,336911.49,1995-09-15,1-URGENT,Clerk#000000213,0,carefully bold requests. careful
851232,37499,F,143416.3,1994-03-22,5-LOW,Clerk#000000009,0,eas hang around the
94817,37499,F,174695.1,1992-08-02,4-NOT SPECIFIED,Clerk#000000650,0,d pearls. asymptotes haggle furiously regular ideas. furiously
891463,37499,F,76063.2,1995-01-01,4-NOT SPECIFIED,Clerk#000000968,0,ial deposits are across the quickly sil
215168,37499,F,200021.1,1992-02-24,2-HIGH,Clerk#000000633,0,use quickly regular request
921314,37499,F,61815.09,1994-02-17,5-LOW,Clerk#000000361,0,deposits. blithely special dependencies nag about the sly
261319,37499,O,193665.34,1995-07-03,1-URGENT,Clerk#000000126,0,"the ironic deposits. pending, special requ"
1124449,37499,O,88841.19,1995-11-04,2-HIGH,Clerk#000000224,0,"e express, express sauternes. accounts cajole slyl"
460038,37499,O,177781.68,1996-09-14,2-HIGH,Clerk#000000548,0,"deposits cajole furiously near the ironic,"



# [JOINS] Combine data from multiple tables using JOINs (there are different types of JOINs)

We can combine data from multiple tables using joins. When we write a join query, we have a format as shown below.

```sql
-- not based on real tables
SELECT
    a.*
FROM
    table_a a -- LEFT table a
    JOIN table_b b -- RIGHT table b
    ON a.id = b.id
```

The table specified first (table_a) is the left table, whereas the table specified second (table_b) is the right table. When we have multiple tables joined, we consider the joined dataset from the first two tables as the left table and the third table as the right table (The DB optimizes the joins for performance).

```sql
-- not based on real tables
SELECT
    a.*
FROM
    table_a a -- LEFT table a
    JOIN table_b b -- RIGHT table b
    ON a.id = b.id
    JOIN table_c c -- LEFT table is the joined data from 
-- table_a & table_b, right table is table_c
    ON a.c_id = c.id
```

There are five main types of joins, they are:

## 1. Inner join (default): Get only rows in both tables



In [98]:
%%sql
-- pair each order with its line items (same order key) whose ship date is
-- within 5 days before or after the order date
SELECT
  o.o_orderkey,
  l.l_orderkey
FROM
  orders o
  JOIN lineitem l ON o.o_orderkey = l.l_orderkey
  AND o.o_orderdate BETWEEN l.l_shipdate - INTERVAL '5' DAY AND l.l_shipdate  + INTERVAL '5' DAY
LIMIT
  100;

o_orderkey,l_orderkey
7,7
32,32
33,33
69,69
71,71
132,132
133,133
198,198
259,259
260,260


In [99]:
%%sql
SELECT
  COUNT(o.o_orderkey) AS order_rows_count,
  COUNT(l.l_orderkey) AS lineitem_rows_count
FROM
  orders o
  JOIN lineitem l ON o.o_orderkey = l.l_orderkey
  AND o.o_orderdate BETWEEN l.l_shipdate - INTERVAL '5' DAY AND l.l_shipdate  + INTERVAL '5' DAY;

order_rows_count,lineitem_rows_count
61551,61551


**Note:** JOIN defaults to INNER JOIN.

The output will have rows from orders and lineitem that found at least one matching row from the other table with the specified join condition (same o_orderkey and orderdate within ship date +/- 5 days). 

We can also see that 61,551 rows from orders and lineitem tables matched.

## 2. Left outer join (aka left join): Get all rows from the left table and only matching rows from the right table.



In [100]:
%%sql
SELECT
  o.o_orderkey,
  l.l_orderkey
FROM
  orders o
  LEFT JOIN lineitem l ON o.o_orderkey = l.l_orderkey
  AND o.o_orderdate BETWEEN l.l_shipdate - INTERVAL '5' DAY AND l.l_shipdate  + INTERVAL '5' DAY
LIMIT
  100;

o_orderkey,l_orderkey
7,7.0
34,
65,
130,
167,
196,
198,198.0
228,
229,
262,


In [101]:
%%sql
SELECT
  COUNT(o.o_orderkey) AS order_rows_count,
  COUNT(l.l_orderkey) AS lineitem_rows_count
FROM
  orders o
  LEFT JOIN lineitem l ON o.o_orderkey = l.l_orderkey
  AND o.o_orderdate BETWEEN l.l_shipdate - INTERVAL '5' DAY AND l.l_shipdate  + INTERVAL '5' DAY;

order_rows_count,lineitem_rows_count
379824,61551


The output will have all the rows from orders and the rows from lineitem that were able to find at least one matching row from the orders table with the specified join condition (same o_orderkey and orderdate within ship date +/- 5 days). 

We can also see that the number of rows from the orders table is 379,824 & from the lineitem table is 61,551. The join produces more rows than the orders table itself contains (compare with `SELECT COUNT(*) FROM orders`), since some orders match with multiple lineitems and each match yields its own output row.

## 3. Right outer join (aka right join): Get matching rows from the left and all rows from the right table.



In [102]:
%%sql
SELECT
  o.o_orderkey,
  l.l_orderkey
FROM
  orders o
  RIGHT JOIN lineitem l ON o.o_orderkey = l.l_orderkey
  AND o.o_orderdate BETWEEN l.l_shipdate - INTERVAL '5' DAY AND l.l_shipdate  + INTERVAL '5' DAY
LIMIT
  100;

o_orderkey,l_orderkey
,1
,1
,1
,1
,1
,1
,2
,3
,3
,3


In [103]:
%%sql
SELECT
  COUNT(o.o_orderkey) AS order_rows_count,
  COUNT(l.l_orderkey) AS lineitem_rows_count
FROM
  orders o
  RIGHT JOIN lineitem l ON o.o_orderkey = l.l_orderkey
  AND o.o_orderdate BETWEEN l.l_shipdate - INTERVAL '5' DAY AND l.l_shipdate  + INTERVAL '5' DAY;

order_rows_count,lineitem_rows_count
61551,1499579


The output will have the rows from orders that found at least one matching row from the lineitem table with the specified join condition (same o_orderkey and orderdate within ship date +/- 5 days) and all the rows from the lineitem table.

We can also see that the number of rows from the orders table is 61,551 & from the lineitem table is 1,499,579.

## 4. Full outer join: Get all rows from both the left and right tables.



In [104]:
%%sql
-- FULL OUTER JOIN sample: rows from both tables are kept; whichever side
-- has no partner under the join condition shows up as NULL.
SELECT
  ord.o_orderkey,
  li.l_orderkey
FROM orders AS ord
FULL OUTER JOIN lineitem AS li
  ON li.l_orderkey = ord.o_orderkey
  -- order date must fall within ship date +/- 5 days (bounds inclusive)
  AND ord.o_orderdate >= li.l_shipdate - INTERVAL '5' DAY
  AND ord.o_orderdate <= li.l_shipdate + INTERVAL '5' DAY
LIMIT 100;

o_orderkey,l_orderkey
7.0,7.0
,7.0
,7.0
,7.0
,7.0
,7.0
,7.0
34.0,
,34.0
,34.0


In [105]:
%%sql
-- FULL OUTER JOIN row counts: COUNT(col) skips NULLs, so each figure is the
-- number of joined rows in which that side of the join is present.
SELECT
  COUNT(ord.o_orderkey) AS order_rows_count,
  COUNT(li.l_orderkey) AS lineitem_rows_count
FROM orders AS ord
FULL OUTER JOIN lineitem AS li
  ON li.l_orderkey = ord.o_orderkey
  -- order date must fall within ship date +/- 5 days (bounds inclusive)
  AND ord.o_orderdate >= li.l_shipdate - INTERVAL '5' DAY
  AND ord.o_orderdate <= li.l_shipdate + INTERVAL '5' DAY;

order_rows_count,lineitem_rows_count
379824,1499579


The output will have all the rows from orders and all the rows from the lineitem table. Rows that satisfy the specified join condition (same o_orderkey and orderdate within ship date +/- 5 days) are paired together, and rows without a match on the other side are returned with NULLs in the other table's columns.

We can also see that the number of rows from the orders table is 379,824 & from the lineitem table is 1,499,579.

## 5. Cross join: Get the cartesian product of all rows



In [106]:
%%sql
-- CROSS JOIN: no join condition, so every nation row is paired with every
-- region row (cartesian product).
SELECT
  nat.n_name AS nation_c_name,
  reg.r_name AS region_c_name
FROM nation AS nat
CROSS JOIN region AS reg;

nation_c_name,region_c_name
ALGERIA,AFRICA
ALGERIA,AMERICA
ALGERIA,ASIA
ALGERIA,EUROPE
ALGERIA,MIDDLE EAST
ARGENTINA,AFRICA
ARGENTINA,AMERICA
ARGENTINA,ASIA
ARGENTINA,EUROPE
ARGENTINA,MIDDLE EAST



The output will have every row of the nation joined with every row of the region. There are 25 nations and five regions, leading to 125 rows in our result from the cross-join.

![Joins](./joins.png)
![Cross join](./cross_join.png)

There are cases where we will need to join a table with itself, called a SELF-join.

**Example**:

1. For every customer order, get the order placed earlier in the same week (Sunday - Saturday, not the previous seven days). Only show customer orders that have at least one such order.




Most analytical queries require calculating metrics that involve combining data from multiple rows. `GROUP BY` allows us to perform aggregate calculations on data from a set of rows grouped together based on values of specified column(s).

**Example**:

1. Create a report that shows the number of orders per orderpriority segment.



In [107]:
%%sql
-- One output row per distinct o_orderpriority value, with the number of
-- orders rows that carry that value.
SELECT
  ord.o_orderpriority,
  COUNT(*) AS num_orders
FROM orders AS ord
GROUP BY ord.o_orderpriority;

o_orderpriority,num_orders
5-LOW,75189
3-MEDIUM,74395
1-URGENT,74988
4-NOT SPECIFIED,75243
2-HIGH,75185



In the above query, we group the data by `orderpriority`, and the calculation `count(*)` will be applied to the rows having a specific `orderpriority` value. The output will consist of one row per unique value of `orderpriority` and the `count(*)` calculation.

![Group by](./group.png)

The calculations allowed are typically SUM/MIN/MAX/AVG/COUNT. However, some databases have more complex aggregate functions; check your DB documentation.

# [SUB QUERY] Use the result of a query within a query using sub-queries

When we want to use the result of a query as a table in another query, we use subqueries. **Example**:

1. Create a report that shows the nation, how many items it supplied (by suppliers in that nation), and how many items it purchased (by customers in that nation). 



In [108]:
%%sql
-- Per nation: total quantity supplied by its suppliers and total quantity
-- purchased by its customers. Each total is pre-aggregated in its own
-- sub-query and then LEFT JOINed to nation, so every nation is listed even
-- when one of the totals is missing (NULL).
SELECT
  nat.n_name AS nation_c_name,
  supplied.quantity AS supplied_items_quantity,
  purchased.quantity AS purchased_items_quantity
FROM nation AS nat
LEFT JOIN (
  -- line item quantity summed by the supplier's nation
  SELECT
    sup_nat.n_nationkey,
    SUM(li.l_quantity) AS quantity
  FROM lineitem AS li
  INNER JOIN supplier AS sup
    ON li.l_suppkey = sup.s_suppkey
  INNER JOIN nation AS sup_nat
    ON sup.s_nationkey = sup_nat.n_nationkey
  GROUP BY sup_nat.n_nationkey
) AS supplied
  ON nat.n_nationkey = supplied.n_nationkey
LEFT JOIN (
  -- line item quantity summed by the ordering customer's nation
  SELECT
    cust_nat.n_nationkey,
    SUM(li.l_quantity) AS quantity
  FROM lineitem AS li
  INNER JOIN orders AS ord
    ON li.l_orderkey = ord.o_orderkey
  INNER JOIN customer AS cust
    ON ord.o_custkey = cust.c_custkey
  INNER JOIN nation AS cust_nat
    ON cust.c_nationkey = cust_nat.n_nationkey
  GROUP BY cust_nat.n_nationkey
) AS purchased
  ON nat.n_nationkey = purchased.n_nationkey;

                                                                                

nation_c_name,supplied_items_quantity,purchased_items_quantity
JAPAN,1566331.0,1492769.0
RUSSIA,1559636.0,1568477.0
ARGENTINA,1427396.0,1562720.0
JORDAN,1412382.0,1515472.0
FRANCE,1493736.0,1545607.0
MOZAMBIQUE,1581893.0,1556360.0
CANADA,1444464.0,1549467.0
SAUDI ARABIA,1718728.0,1501974.0
ETHIOPIA,1466990.0,1549282.0
ROMANIA,1188554.0,1521046.0



In the above query, we can see that there are two sub-queries, one to calculate the quantity supplied by a nation and the other to calculate the quantity purchased by the customers of a nation.

# [CASE WHEN] Replicate IF.ELSE logic with CASE statements

We can do conditional logic in the `SELECT ... FROM` part of our query, as shown below.



In [109]:
%%sql
-- Bucket each order by its total price. Branches are evaluated top to
-- bottom and the first one that matches wins; anything that matches no
-- branch falls through to 'low'.
SELECT
    ord.o_orderkey,
    ord.o_totalprice,
    CASE
        WHEN ord.o_totalprice > 100000 THEN 'high'
        WHEN ord.o_totalprice >= 25000
            AND ord.o_totalprice <= 100000 THEN 'medium'
        ELSE 'low'
    END AS order_price_bucket
FROM orders AS ord;

o_orderkey,o_totalprice,order_price_bucket
1,217279.39,high
2,58634.64,medium
3,226541.68,high
4,29259.46,medium
5,127549.56,high
6,64649.34,medium
7,280265.7,high
32,170064.21,high
33,129243.33,high
34,49754.91,medium


We can see how we display different values depending on the `totalprice` column. We can also use multiple criteria as our conditional criteria (e.g., totalprice > 100000 AND orderpriority = '2-HIGH').

# [STANDARD FUNCTION] Use standard inbuilt DB functions for common String, Time, and Numeric data manipulation

When processing data, more often than not, we will need to change values in columns; shown below are a few standard functions to be aware of:

1. **String functions**
   1. **LENGTH** is used to calculate the length of a string. E.g., `SELECT LENGTH('hi');` will output 2.
   2. **CONCAT** combines multiple string columns into one. E.g., `SELECT CONCAT(o_orderstatus, '-', o_orderpriority) FROM ORDERS LIMIT 5;` will concatenate the o_orderstatus and o_orderpriority columns with a dash in between them.
   3. **SPLIT** is used to split a value into an array based on a given delimiter. E.g., `SELECT SPLIT(o_orderpriority, '-') FROM ORDERS LIMIT 5;` will output a column with arrays formed by splitting o_orderpriority values on `-`.
   4. **SUBSTRING** is used to get a sub-string from a value, given the start character index and the number of characters to return. E.g., `SELECT o_orderpriority, SUBSTRING(o_orderpriority, 1, 5) FROM orders LIMIT 5;` will get the first five characters (starting at index 1, length 5) of the o_orderpriority column.
   5. **TRIM** is used to remove empty spaces to the left and right of the value. E.g., `SELECT TRIM(' hi ');` will output `hi` without any spaces around it. LTRIM and RTRIM are similar but only remove spaces before and after the string, respectively.
2. **Date and Time functions**
   1. **Adding and subtracting dates**: Is used to add and subtract periods; the format heavily depends on the DB. In DuckDB, `date_diff` accepts 3 parameters: the output unit (day, month, year) and the datetime/date values a and b, such that the output will be b - a. The `+ INTERVAL n UNIT(DAY/MONTH/YEAR)` adds the value (in specified units) to the timestamp value.
      ```sql
        -- Date and time functions: difference between two dates in several
        -- units, plus adding an interval to a date.
        SELECT
            date_diff('day', d.start_date, d.end_date) AS diff_in_days,
            date_diff('month', d.start_date, d.end_date) AS diff_in_months,
            date_diff('year', d.start_date, d.end_date) AS diff_in_years,
            d.start_date + INTERVAL 400 DAY AS new_date
        FROM (
            -- name the two dates once instead of repeating the literals
            SELECT
                DATE '2022-10-01' AS start_date,
                DATE '2023-11-05' AS end_date
        ) AS d;
      ```
    It will show the difference between the two dates in the specified period. We can also add/subtract an arbitrary period from a date/time column. E.g., `SELECT DATE '2022-11-05' + INTERVAL '10' DAY;` will show the output `2022-11-15` (try subtraction of dates).
   2. **String <=> date/time conversions**: When we want to change the data type of a string to date/time, we can use the `DATE 'YYYY-MM-DD'` or `TIMESTAMP 'YYYY-MM-DD HH:mm:SS'` literals. But when the data is in a non-standard date/time format such as `MM-DD-YYYY`, we will need to specify the input structure; we do this using `strptime`. E.g., `SELECT strptime('11-05-2023', '%m-%d-%Y');`.  
   
   We can convert a timestamp/date into a string with the required format using `strftime`. E.g., `SELECT strftime(o_orderdate, '%Y-%m-01') AS first_month_date FROM orders LIMIT 5;` will map every o_orderdate to the first of their month.

   See [this page](https://duckdb.org/docs/sql/functions/date) on how to set the proper date time format.
   3. **Time frame functions (YEAR/MONTH/DAY)**: When we want to extract specific periods from a date/time column, we can use these functions. E.g., `SELECT year(DATE '2023-11-05');` will return 2023. Similarly, we have month, day, hour, minute, etc.
3. **Numeric**
   1. **ROUND** is used to specify the number of digits allowed after the decimal point. E.g. `SELECT ROUND(100.102345, 2);`
   2. **ABS** is used to get the absolute value of a given number. E.g. `SELECT ABS(-100), ABS(100);` 
   3. **Mathematical operations** these are +,-,*,/.
   4. **Ceil/Floor** are used to round a decimal number up to the nearest integer (CEIL) or down to the nearest integer (FLOOR). E.g. `SELECT CEIL(100.1), FLOOR(100.1);` returns 101 and 100.