# [SETUP] 

In [9]:
! python ../../setup.py

Remove existing tpch.db file at /home/josephkevinmachado/code/adv_data_transformation_in_sql
Recreate tpch.db file at /home/josephkevinmachado/code/adv_data_transformation_in_sql/tpch.db
Establishing connection to tpch.db db file
Reading in the table creation script
Executing table creation
Committing and closing connection
Setup Done!!!


## Connect to DuckDB

In [2]:
import duckdb
import pandas as pd

%load_ext sql
conn = duckdb.connect("../../tpch.db")
%sql conn --alias duckdb

In [3]:
%%sql
show tables;

name
customer
lineitem
nation
orders
part
partsupp
region
supplier


## Data distribution diff

In [26]:
%config SqlMagic.displaylimit = None

In [30]:
%%sql 
With row_count_distribution as (
  select 
    o_orderdate, 
    count(*) as num_rows, 
    lead(count(*)) over (order by o_orderdate desc) as prev_date_num_rows 
  from 
    orders 
  group by 
    o_orderdate
) 
select 
  o_orderdate, 
  num_rows, 
  round(
    ((num_rows - prev_date_num_rows) * 100.0)/ prev_date_num_rows, 
    2
  ) as perc_diff,
    case when abs(perc_diff) > 1.00 then True else False end as raise_alert
from 
  row_count_distribution 
order by 
  o_orderdate desc
limit 50

o_orderdate,num_rows,perc_diff,raise_alert
1998-08-02,581,-5.99,True
1998-08-01,618,-0.48,False
1998-07-31,621,0.32,False
1998-07-30,619,1.81,True
1998-07-29,608,-0.65,False
1998-07-28,612,1.66,True
1998-07-27,602,-4.44,True
1998-07-26,630,0.48,False
1998-07-25,627,3.47,True
1998-07-24,606,-5.31,True


## Dim + Metric diff

In [31]:
%%sql
describe orders

column_name,column_type,null,key,default,extra
o_orderkey,BIGINT,NO,,,
o_custkey,BIGINT,NO,,,
o_orderstatus,VARCHAR,NO,,,
o_totalprice,"DECIMAL(15,2)",NO,,,
o_orderdate,DATE,NO,,,
o_orderpriority,VARCHAR,NO,,,
o_clerk,VARCHAR,NO,,,
o_shippriority,INTEGER,NO,,,
o_comment,VARCHAR,NO,,,


In [33]:
%%sql
select o_orderdate
, o_orderstatus
, round(min(o_totalprice), 2) as min_totalprice
    , round(max(o_totalprice), 2) as max_totalprice
    , round(avg(o_totalprice), 2) as avg_totalprice
from orders
group by o_orderdate, o_orderstatus
order by o_orderdate desc, o_orderstatus
limit 10

o_orderdate,o_orderstatus,min_totalprice,max_totalprice,avg_totalprice
1998-08-02,O,1154.1,397208.63,151364.84
1998-08-01,O,1466.22,430942.94,151863.35
1998-07-31,O,1583.37,427026.87,149472.41
1998-07-30,O,1140.1,394369.45,150807.38
1998-07-29,O,1074.17,442940.66,150795.07
1998-07-28,O,1240.07,502742.76,150953.32
1998-07-27,O,1552.48,426463.16,153914.84
1998-07-26,O,1442.9,486911.38,152521.01
1998-07-25,O,2247.35,422534.98,147979.82
1998-07-24,O,1125.56,425458.58,149913.84


In [9]:
%%sql
drop table if exists new_orders

Success


In [10]:
%%sql
create table new_orders as select o_orderdate, o_orderstatus, case when o_orderstatus = 'O' then o_totalprice else o_totalprice * 0.5 end as o_totalprice from orders 

Count
1500000


In [13]:
%%sql
with existing as (
select o_orderdate
, o_orderstatus
, round(min(o_totalprice), 2) as min_totalprice
    , round(max(o_totalprice), 2) as max_totalprice
    , round(avg(o_totalprice), 2) as avg_totalprice
from orders
group by CUBE(o_orderdate, o_orderstatus))
, new as (
    select o_orderdate
, o_orderstatus
, round(min(o_totalprice), 2) as min_totalprice
    , round(max(o_totalprice), 2) as max_totalprice
    , round(avg(o_totalprice), 2) as avg_totalprice
from new_orders
group by CUBE(o_orderdate, o_orderstatus)
)
select e.o_orderdate
, e.o_orderstatus
, e.min_totalprice
, e.max_totalprice
, e.avg_totalprice
, n.min_totalprice as new_min_totalprice
, n.max_totalprice as new_max_totalprice
, n.avg_totalprice as new_avg_totalprice
from existing e 
left join new n
on e.o_orderdate = n.o_orderdate
and e.o_orderstatus = n.o_orderstatus
where e.min_totalprice != n.min_totalprice 
or e.max_totalprice != n.max_totalprice
or e.avg_totalprice != n.avg_totalprice
order by e.o_orderdate desc, e.o_orderstatus

o_orderdate,o_orderstatus,min_totalprice,max_totalprice,avg_totalprice,new_min_totalprice,new_max_totalprice,new_avg_totalprice
1995-06-16,P,123326.15,343351.6,204703.07,61663.08,171675.8,102351.54
1995-06-15,F,30417.68,73077.58,51747.63,15208.84,36538.79,25873.82
1995-06-15,P,61614.19,401799.93,204271.27,30807.1,200899.97,102135.63
1995-06-14,P,20035.22,376373.16,186160.01,10017.61,188186.58,93080.01
1995-06-13,F,39577.59,40701.85,40139.72,19788.8,20350.93,20069.86
1995-06-13,P,16886.79,341243.87,193640.44,8443.4,170621.94,96820.22
1995-06-12,F,19903.88,61913.17,36779.56,9951.94,30956.59,18389.78
1995-06-12,P,12392.17,414142.68,191949.54,6196.09,207071.34,95974.77
1995-06-11,F,3924.56,52497.07,21523.46,1962.28,26248.54,10761.73
1995-06-11,P,46117.57,352517.98,186943.23,23058.79,176258.99,93471.62
