# [SETUP] 

In [9]:
# Run the project's setup script in a subprocess before anything else.
# Per the output it prints, the script removes any existing tpch.db file,
# recreates it, and executes the table creation script against it.
! python ../../setup.py

Remove existing tpch.db file at /home/josephkevinmachado/code/adv_data_transformation_in_sql
Recreate tpch.db file at /home/josephkevinmachado/code/adv_data_transformation_in_sql/tpch.db
Establishing connection to tpch.db db file
Reading in the table creation script
Executing table creation
Committing and closing connection
Setup Done!!!


## Connect to DuckDB

In [10]:
import duckdb
import pandas as pd

%load_ext sql
conn = duckdb.connect("../../tpch.db")
%sql conn --alias duckdb

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [11]:
%%sql
-- List the tables available in the connected database.
show tables;

name
customer
lineitem
nation
orders
part
partsupp
region
supplier


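## Data distribution diff

Compare each order date's row count with the previous order date's row count and flag day-over-day changes larger than 1%.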

In [26]:
# Set the SQL magic's display limit to None so result tables are not
# truncated when rendered (presumably None means "no limit" — confirm
# against the installed JupySQL version).
%config SqlMagic.displaylimit = None

In [30]:
%%sql
-- Day-over-day change in the number of rows in orders, newest dates first.
--   perc_diff   : percentage change versus the previous order date present in
--                 the table, rounded to 2 decimals for display (NULL for the
--                 earliest date, which has no predecessor).
--   raise_alert : true when the change exceeds the 1% threshold in either
--                 direction. It is evaluated on the UNROUNDED percentage so
--                 that display rounding cannot hide a change just above the
--                 threshold (e.g. 1.004% would round to 1.00 and be missed).
with daily_row_counts as (
  select
    o_orderdate,
    count(*) as num_rows
  from orders
  group by o_orderdate
),
row_count_distribution as (
  select
    o_orderdate,
    num_rows,
    -- One row per date here, so lag() over ascending dates yields the
    -- row count of the previous date.
    lag(num_rows) over (order by o_orderdate) as prev_date_num_rows
  from daily_row_counts
),
row_count_change as (
  select
    o_orderdate,
    num_rows,
    ((num_rows - prev_date_num_rows) * 100.0) / prev_date_num_rows as raw_perc_diff
  from row_count_distribution
)
select
  o_orderdate,
  num_rows,
  round(raw_perc_diff, 2) as perc_diff,
  -- coalesce keeps the flag a plain boolean (false) when there is no previous date.
  coalesce(abs(raw_perc_diff) > 1.00, false) as raise_alert
from row_count_change
order by o_orderdate desc
limit 50

o_orderdate,num_rows,perc_diff,raise_alert
1998-08-02,581,-5.99,True
1998-08-01,618,-0.48,False
1998-07-31,621,0.32,False
1998-07-30,619,1.81,True
1998-07-29,608,-0.65,False
1998-07-28,612,1.66,True
1998-07-27,602,-4.44,True
1998-07-26,630,0.48,False
1998-07-25,627,3.47,True
1998-07-24,606,-5.31,True


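## Dimension + metric diff

Compare per-dimension aggregates (grouped by order date and order status) of a metric (`o_totalprice`) between an existing and a new version of the same query, and list the groups whose values differ.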

In [31]:
%%sql
-- Show the column names and types of the orders table.
describe orders

column_name,column_type,null,key,default,extra
o_orderkey,BIGINT,NO,,,
o_custkey,BIGINT,NO,,,
o_orderstatus,VARCHAR,NO,,,
o_totalprice,"DECIMAL(15,2)",NO,,,
o_orderdate,DATE,NO,,,
o_orderpriority,VARCHAR,NO,,,
o_clerk,VARCHAR,NO,,,
o_shippriority,INTEGER,NO,,,
o_comment,VARCHAR,NO,,,


In [33]:
%%sql
-- Minimum, maximum and average order total per order date and order status,
-- each rounded to 2 decimals; first 10 rows when sorted newest date first.
select
  o_orderdate,
  o_orderstatus,
  round(min(o_totalprice), 2) as min_totalprice,
  round(max(o_totalprice), 2) as max_totalprice,
  round(avg(o_totalprice), 2) as avg_totalprice
from orders
group by
  o_orderdate,
  o_orderstatus
order by
  o_orderdate desc,
  o_orderstatus
limit 10

o_orderdate,o_orderstatus,min_totalprice,max_totalprice,avg_totalprice
1998-08-02,O,1154.1,397208.63,151364.84
1998-08-01,O,1466.22,430942.94,151863.35
1998-07-31,O,1583.37,427026.87,149472.41
1998-07-30,O,1140.1,394369.45,150807.38
1998-07-29,O,1074.17,442940.66,150795.07
1998-07-28,O,1240.07,502742.76,150953.32
1998-07-27,O,1552.48,426463.16,153914.84
1998-07-26,O,1442.9,486911.38,152521.01
1998-07-25,O,2247.35,422534.98,147979.82
1998-07-24,O,1125.56,425458.58,149913.84


In [38]:
%%sql
-- Diff the per (order date, order status) price aggregates produced by the
-- existing query against those produced by a new version of it, returning
-- only the groups whose metrics differ.
--
-- A full outer join with "is distinct from" is used so that a group present
-- on only one side is reported as a difference (NULL vs value). A left join
-- filtered with "!=" silently drops such groups, because any comparison with
-- NULL is neither true nor false.
--
-- The first three metric columns come from "existing", the last three from
-- "new"; their names are intentionally left as-is, so the client shows the
-- second set with a de-duplicating suffix.
with existing as (
  select
    o_orderdate,
    o_orderstatus,
    round(min(o_totalprice), 2) as min_totalprice,
    round(max(o_totalprice), 2) as max_totalprice,
    round(avg(o_totalprice), 2) as avg_totalprice
  from orders
  group by o_orderdate, o_orderstatus
),
new as (
  select
    o_orderdate,
    o_orderstatus,
    -- NOTE(review): the "* 2" looks like a deliberate simulated change so the
    -- diff has something to report — confirm before reusing this query.
    round(min(o_totalprice), 2) * 2 as min_totalprice,
    round(max(o_totalprice), 2) as max_totalprice,
    round(avg(o_totalprice), 2) as avg_totalprice
  from orders
  group by o_orderdate, o_orderstatus
)
select
  -- Take the key from whichever side has the group.
  coalesce(e.o_orderdate, n.o_orderdate) as o_orderdate,
  coalesce(e.o_orderstatus, n.o_orderstatus) as o_orderstatus,
  e.min_totalprice,
  e.max_totalprice,
  e.avg_totalprice,
  n.min_totalprice,
  n.max_totalprice,
  n.avg_totalprice
from existing e
full outer join new n
  on e.o_orderdate = n.o_orderdate
  and e.o_orderstatus = n.o_orderstatus
where e.min_totalprice is distinct from n.min_totalprice
  or e.max_totalprice is distinct from n.max_totalprice
  or e.avg_totalprice is distinct from n.avg_totalprice
order by
  coalesce(e.o_orderdate, n.o_orderdate) desc,
  coalesce(e.o_orderstatus, n.o_orderstatus)

o_orderdate,o_orderstatus,min_totalprice,max_totalprice,avg_totalprice,min_totalprice_1,max_totalprice_1,avg_totalprice_1
1998-08-02,O,1154.1,397208.63,151364.84,2308.2,397208.63,151364.84
1998-08-01,O,1466.22,430942.94,151863.35,2932.44,430942.94,151863.35
1998-07-31,O,1583.37,427026.87,149472.41,3166.74,427026.87,149472.41
1998-07-30,O,1140.1,394369.45,150807.38,2280.2,394369.45,150807.38
1998-07-29,O,1074.17,442940.66,150795.07,2148.34,442940.66,150795.07
1998-07-28,O,1240.07,502742.76,150953.32,2480.14,502742.76,150953.32
1998-07-27,O,1552.48,426463.16,153914.84,3104.96,426463.16,153914.84
1998-07-26,O,1442.9,486911.38,152521.01,2885.8,486911.38,152521.01
1998-07-25,O,2247.35,422534.98,147979.82,4494.7,422534.98,147979.82
1998-07-24,O,1125.56,425458.58,149913.84,2251.12,425458.58,149913.84
