In [0]:
-- Make hive_metastore the current catalog for the statements that follow.
USE CATALOG hive_metastore

## Parsing JSON Data

In [0]:
-- Peek at three rows of the customers table.
SELECT *
FROM customers
LIMIT 3

In [0]:
-- Show the column names and types of customers.
DESCRIBE TABLE customers

In [0]:
-- Use the colon path syntax to pull individual fields out of the profile JSON.
SELECT
  customer_id,
  profile:first_name,
  profile:address:country
FROM customers
LIMIT 3

In [0]:
-- The profile column is a STRING holding JSON text, not a STRUCT,
-- so its fields cannot be reached with dot notation such as profile.first_name.
-- This query is kept to show that dot access on the raw string column does not work.

select customer_id, profile.first_name, profile.address.country
from customers limit 3

In [0]:
-- from_json converts a JSON string into a structured data type (STRUCT, MAP, etc.)
-- according to a supplied schema.
-- Signature: from_json(col, schema, options={})
-- NOTE(review): the call below passes no schema even though the signature lists one,
-- so it presumably fails on purpose to motivate the cells that follow — confirm.

SELECT from_json(profile) AS profile_struct
  FROM customers;

In [0]:
-- Fetch a single profile value to use as a sample JSON document.
SELECT profile FROM customers LIMIT 1

In [0]:
  -- schema_of_json inspects a sample JSON string and returns a schema describing
  -- its fields and data types.
  -- Instead of hand-writing a complex schema (StructType, etc.), pass one representative
  -- JSON document and Spark returns the schema as a DDL-formatted string.
  -- That schema is handed to from_json so profile is parsed into a struct column.
  SELECT
    customer_id,
    from_json(
      profile,
      schema_of_json('{"first_name":"Susana","last_name":"Gonnely","gender":"Female","address":{"street":"760 Express Court","city":"Obrenovac","country":"Serbia"}}')
    ) AS profile_struct
  FROM customers
  LIMIT 3

In [0]:
-- Keep the parsed profile in a temp view so later cells can query the struct directly.
CREATE OR REPLACE TEMP VIEW parsed_customers AS
SELECT
  customer_id,
  from_json(
    profile,
    schema_of_json('{"first_name":"Susana","last_name":"Gonnely","gender":"Female","address":{"street":"760 Express Court","city":"Obrenovac","country":"Serbia"}}')
  ) AS profile_struct
FROM customers;

-- Preview the view that was just created.
SELECT *
FROM parsed_customers
LIMIT 3

In [0]:
-- Dot notation works here because profile_struct is produced by from_json in parsed_customers.
SELECT
  customer_id,
  profile_struct.first_name,
  profile_struct.address.country
FROM parsed_customers
LIMIT 3

In [0]:
-- Expand the fields of profile_struct into columns next to customer_id.
CREATE OR REPLACE TEMP VIEW customers_final AS
SELECT
  customer_id,
  profile_struct.*
FROM parsed_customers;

-- Preview the flattened view.
SELECT *
FROM customers_final
LIMIT 3

In [0]:
-- Look at the books column of a few orders alongside their order and customer ids.
SELECT
  order_id,
  customer_id,
  books
FROM orders
LIMIT 3

## Explode Function

In [0]:
-- NOTE(review): explode() expects an ARRAY or MAP argument, while an earlier comment in this
-- notebook describes profile as a JSON STRING, so this statement is expected to fail with a
-- type error — presumably kept as a demonstration; confirm.
select *, explode(profile) from customers;

In [0]:
-- Peek at four rows of the orders table.
SELECT *
FROM orders
LIMIT 4

In [0]:
-- Emit one output row per element of books, exposing each element as book.
SELECT
  order_id,
  customer_id,
  explode(books) AS book
FROM orders
LIMIT 5

## Collecting Rows

In [0]:
-- Gather each customer's distinct order ids into one array per customer.
SELECT
  customer_id,
  collect_set(order_id)
FROM orders
GROUP BY customer_id
LIMIT 3

In [0]:
-- Same aggregation as the previous cell: distinct order ids per customer.
SELECT customer_id, collect_set(order_id)
FROM orders
GROUP BY customer_id
LIMIT 3

In [0]:
-- Per customer, collect the distinct order ids and the distinct books.book_id values.
SELECT
  customer_id,
  collect_set(order_id),
  collect_set(books.book_id)
FROM orders
GROUP BY customer_id
LIMIT 3

## Flatten Arrays

In [0]:
-- Show the collected book ids before and after flatten() merges the nested arrays.
SELECT
  customer_id,
  collect_set(order_id),
  collect_set(books.book_id) AS before_flatten,
  flatten(collect_set(books.book_id)) AS after_flatten
FROM orders
GROUP BY customer_id
LIMIT 3

In [0]:
-- As above, but array_distinct() removes repeated ids from the flattened array.
SELECT
  customer_id,
  collect_set(order_id),
  collect_set(books.book_id) AS before_flatten,
  array_distinct(
    flatten(collect_set(books.book_id))
  ) AS after_flatten
FROM orders
GROUP BY customer_id
LIMIT 3

In [0]:
-- As above, with sort_array() applied to the de-duplicated, flattened ids.
SELECT
  customer_id,
  collect_set(order_id),
  collect_set(books.book_id) AS before_flatten,
  sort_array(
    array_distinct(
      flatten(collect_set(books.book_id))
    )
  ) AS after_flatten
FROM orders
GROUP BY customer_id
LIMIT 3

## Join Operations


In [0]:
-- Keep every orders column and add one exploded book per output row.
SELECT
  *,
  explode(books) AS book
FROM orders
LIMIT 3

In [0]:
-- Peek at three rows of the books table.
SELECT *
FROM books
LIMIT 3

In [0]:
-- Show the column names and types of books.
DESCRIBE TABLE books

In [0]:
-- Exploded orders again: this is the left side of the join built in the next cell.
SELECT *, explode(books) AS book
FROM orders
LIMIT 3

In [0]:
-- Build a temp view that pairs every exploded order line with its row from books.
CREATE OR REPLACE TEMP VIEW orders_enriched AS
SELECT *
FROM (
  -- One row per element of orders.books, exposed as the struct column book.
  SELECT
    *,
    explode(books) AS book
  FROM orders
) AS o
INNER JOIN books AS b
  ON o.book.book_id = b.book_id;

In [0]:
-- Peek at three rows of the enriched view.
SELECT *
FROM orders_enriched
LIMIT 3

In [0]:
-- Show the column names and types exposed by orders_enriched.
DESCRIBE TABLE orders_enriched;

## Set Operations

In [0]:
-- UNION     = removes duplicate rows (slower)
-- UNION ALL = concatenates the inputs as-is (faster)

-- Expose the new-orders parquet directory as a temp view.
CREATE OR REPLACE TEMP VIEW orders_updates AS
SELECT *
FROM parquet.`dbfs:/mnt/demo-datasets/bookstore/orders-new`;

-- Combine three rows from each source, dropping duplicates.
SELECT *
FROM (
  SELECT * FROM orders LIMIT 3
)
UNION
SELECT *
FROM (
  SELECT * FROM orders_updates LIMIT 3
)

In [0]:
-- INTERSECT returns only the rows that appear in both SELECT results.

SELECT *
FROM orders
INTERSECT
SELECT *
FROM orders_updates

In [0]:
-- MINUS returns only the rows that are in the first SELECT
-- but not in the second; here we count them.

SELECT count(*)
FROM (
  SELECT *
  FROM orders
  MINUS
  SELECT *
  FROM orders_updates
)

In [0]:
-- MINUS and EXCEPT mean the same thing:
-- only the keyword differs, the behaviour is identical.

SELECT count(*)
FROM (
  SELECT *
  FROM orders
  EXCEPT
  SELECT *
  FROM orders_updates
)

## Reshaping Data with Pivot

In [0]:


  -- Return every row of the enriched view (no LIMIT here).
  SELECT * FROM orders_enriched

In [0]:
-- Re-check the columns of orders_enriched before pivoting.
DESCRIBE TABLE orders_enriched

In [0]:
-- Read the quantity and book_id fields through the books column (not the exploded book).
SELECT
  customer_id,
  books.quantity,
  books.book_id
FROM orders_enriched
LIMIT 5

In [0]:
-- Pivot per-book quantities into one column per book id, one row per customer.
SELECT *
FROM (
  -- Only these three columns feed the pivot; customer_id becomes the implicit grouping key.
  SELECT
    customer_id,
    book.book_id  AS book_id,
    book.quantity AS quantity
  FROM orders_enriched
)
PIVOT (
  sum(quantity)
  FOR book_id IN (
    'B01', 'B02', 'B03', 'B04', 'B05', 'B06',
    'B07', 'B08', 'B09', 'B10', 'B11', 'B12'
  )
);

In [0]:
-- Peek at three rows of orders_enriched.
SELECT * FROM orders_enriched LIMIT 3


In [0]:
-- Pivot quantities into one column per book id, one row per customer.
-- The quantity fed to the pivot is book.quantity, i.e. the quantity of the exploded
-- order line, matching the other pivot in this notebook. A bare `quantity` is not a
-- field of the exploded line; it would either fail to resolve or (presumably) pick up
-- an order-level column and inflate the per-book sums — verify against the orders schema.
SELECT *
FROM (
  SELECT
    customer_id,
    book_id,
    book.quantity AS quantity
  FROM orders_enriched
) PIVOT (
  sum(quantity) FOR book_id IN (
    'B01', 'B02', 'B03', 'B04', 'B05', 'B06',
    'B07', 'B08', 'B09', 'B10', 'B11', 'B12'
  )
)

In [0]:
-- Total quantity per customer and book, exploding books in the FROM clause.
-- This reads from orders rather than orders_enriched: that view already carries one
-- exploded `book` per row, so exploding books again would multiply the rows and clash
-- with the existing book column. LATERAL VIEW lets explode() reference orders.books.
-- NOTE(review): unlike orders_enriched, this does not inner-join to books, so order
-- lines whose book_id has no match in books are still counted — confirm that is acceptable.
SELECT
  customer_id,
  book.book_id,
  SUM(book.quantity) AS total_qty
FROM orders
LATERAL VIEW explode(books) exploded_books AS book
GROUP BY customer_id, book.book_id;


In [0]:
-- books : ARRAY<STRUCT<...>>                      (the original array column)
-- book  : STRUCT<book_id, quantity, subtotal>     (one exploded element per row)

-- Sum the exploded line quantity for every (customer, book) pair.
SELECT
  customer_id,
  book.book_id,
  SUM(book.quantity) AS total_qty
FROM orders_enriched
GROUP BY
  customer_id,
  book.book_id;


In [0]:
-- Pivot the pre-aggregated totals: one row per customer, one column per book id.
SELECT *
FROM (
  -- Inner query: total quantity for each (customer, book) pair.
  SELECT
    customer_id,
    book.book_id,
    SUM(book.quantity) AS total_qty
  FROM orders_enriched
  GROUP BY
    customer_id,
    book.book_id
)
PIVOT (
  sum(total_qty)
  FOR book_id IN (
    'B01', 'B02', 'B03', 'B04', 'B05', 'B06',
    'B07', 'B08', 'B09', 'B10', 'B11', 'B12'
  )
)