## Init

In [1]:
import findspark
import os
import sys

from dotenv import load_dotenv
from IPython.core.magic import register_cell_magic

# Let findspark locate the Spark installation before any pyspark import happens
findspark.init()

# Load variables from the .env file into the process environment.
# NOTE: `env` is load_dotenv()'s return value (a success flag), not the
# variables themselves; read those with os.getenv().
env = load_dotenv()

# Add the project working directory to the module search path (sys.path, not
# the shell PATH) so the `src.*` package imports in the next cell resolve.
# Guarded so that an unset variable does not push `None` onto sys.path (which
# is silently ignored and later surfaces as a confusing import error), and so
# that re-running this cell does not append the same folder again.
project_folder = os.getenv("PROJECT_FOLDER")
if not project_folder:
    print(
        "WARNING: PROJECT_FOLDER is not set; `src.*` imports will only work "
        "if the project root is already on sys.path.",
        file=sys.stderr,
    )
elif project_folder not in sys.path:
    sys.path.append(project_folder)

In [2]:
# NOTE(review): wildcard imports hide where names come from. Later cells use
# `add_processing_ts`, and the commented-out merge cell references `merge_table`
# and `uc_path`; none are defined in this notebook, so presumably they come from
# these two modules. TODO: replace with explicit imports once confirmed.
from src.transform.etl import *
from src.transform.common import *

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType

# Build (or reuse) a local Spark session configured for Delta Lake and a
# Unity Catalog server. All settings are passed as plain config strings.
spark = (
    SparkSession.builder
    .appName("Delta-Unity-Catalog")
    .master("local[*]")
    # Extra jars resolved at start-up: the Delta and Unity Catalog connectors
    .config(
        "spark.jars.packages",
        "io.delta:delta-spark_2.12:3.2.1,io.unitycatalog:unitycatalog-spark_2.12:0.2.0",
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    # Both `spark_catalog` and `unity` are backed by the same catalog class
    .config("spark.sql.catalog.spark_catalog", "io.unitycatalog.spark.UCSingleCatalog")
    .config("spark.sql.catalog.unity", "io.unitycatalog.spark.UCSingleCatalog")
    # Catalog server endpoint and (empty) access token for the `unity` catalog
    .config("spark.sql.catalog.unity.uri", "http://localhost:8080")
    .config("spark.sql.catalog.unity.token", "")
    .config("spark.sql.defaultCatalog", "unity")
    .getOrCreate()
)


24/12/16 18:48:52 WARN Utils: Your hostname, khoa-le-MS-7B19 resolves to a loopback address: 127.0.1.1; using 192.168.1.7 instead (on interface eno1)
24/12/16 18:48:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/khoa-le/.ivy2/cache
The jars for the packages stored in: /home/khoa-le/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
io.unitycatalog#unitycatalog-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7daeecb5-7172-4207-bf5f-e662c1e80e41;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/khoa-le/data/app/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.2.1 in central
	found io.delta#delta-storage;3.2.1 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found io.unitycatalog#unitycatalog-spark_2.12;0.2.0 in central
	found io.unitycatalog#unitycatalog-client;0.2.0 in central
	found org.slf4j#slf4j-api;2.0.13 in central
	found org.apache.logging.log4j#log4j-slf4j2-impl;2.23.1 in central
	found org.apache.logging.log4j#log4j-api;2.23.1 in central
	found org.apache.logging.log4j#log4j-core;2.23.1 in central
	found com.fasterxml.jackson.datatype#jackson-datatype-jsr310;2.17.0 in central
	found org.openapitools#jackson-databind-nullable;0.2.6 in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found com.fasterxml.jackson.core#jackson-databind;2.15.0 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.15.0 in central
	found com.fasterxml.jackson.core#jackson-core;2.15.0 in central
	found com.fasterxml.jackson.module#jackson-module-scala_2.12;2.15.0 in central
	found 

In [4]:
# Register a `%%sql` cell magic that runs its cell body through the Spark session
@register_cell_magic
def sql(line, cell=None):
    """Run a SQL statement with the notebook's global ``spark`` session and print it.

    Args:
        line: Text on the same line as the magic name; used as the statement
            only when ``cell`` is empty or ``None``.
        cell: Body of the cell; preferred as the statement when non-empty.

    Returns:
        Whatever ``DataFrame.show()`` returns; the rows themselves are printed
        by ``show()`` rather than returned.
    """
    statement = cell if cell else line
    return spark.sql(statement).show()

In [5]:
# Target catalog and schema names, read from the environment; either value is
# None when its variable is not set.
catalog, schema = (
    os.environ.get(env_key) for env_key in ("CATALOG", "CURATED_SCHEMA")
)

## Load raw tables

In [6]:
# Look up the five raw tables by their fully qualified names. The names are
# hard-coded to `unity.raw` and do not use the `catalog` variable declared above.
customers, staffs, stores, products, transactions = map(
    spark.table,
    (
        "unity.raw.customers",
        "unity.raw.staffs",
        "unity.raw.stores",
        "unity.raw.products",
        "unity.raw.transactions",
    ),
)

In [7]:
# Expose each raw DataFrame to Spark SQL as a temp view named after its variable
for _view_name, _frame in (
    ("customers", customers),
    ("staffs", staffs),
    ("stores", stores),
    ("products", products),
    ("transactions", transactions),
):
    _frame.createOrReplaceTempView(_view_name)
del _view_name, _frame  # keep the loop helpers out of the notebook namespace

## Denormalized table

In [26]:
# Denormalising query: one output row per row of `transactions`, enriched with
# customer, product, staff and store attributes. Every join is a LEFT JOIN, so
# a transaction with no match keeps its row and gets NULLs in the joined columns.
# It reads the temp views `transactions`, `customers`, `products`, `staffs`
# and `stores`, which must be registered before the query is run.
# String literals use single quotes (standard SQL); double-quoted text is read
# as an identifier when Spark's ANSI double-quoted-identifier option is enabled.
query = """
SELECT
    t.transaction_id,
    t.item_id,
    t.quantity,
    p.product_name,
    p.category,
    p.unit_price,
    c.first_name || ' ' || c.last_name as customer_name,
    year(current_date()) - year(c.yob) as age,
    s.first_name || ' ' || s.last_name as staff_name,
    t.quantity * p.unit_price as total_amount,
    t.store as store_name,
    st.address as store_address,
    t.utc_dt as transaction_ts
FROM transactions t
LEFT JOIN customers c
    ON t.customer_id = c.customer_id
LEFT JOIN products p
    ON t.item_id = p.product_id
LEFT JOIN staffs s
    ON t.staff_id = s.staff_id
LEFT JOIN stores st
    ON t.store = st.name
"""

In [27]:
# Run the denormalising query, then pass the result through `add_processing_ts`.
# The schema printed in the next cell shows a `last_processed_ts` column that
# the query does not select, so presumably the helper adds it.
df = spark.sql(query).transform(add_processing_ts)

In [28]:
# Print the column names, types and nullability of the denormalised DataFrame
df.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- unit_price: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- staff_name: string (nullable = true)
 |-- total_amount: long (nullable = true)
 |-- store_name: string (nullable = true)
 |-- store_address: string (nullable = true)
 |-- transaction_ts: string (nullable = true)
 |-- last_processed_ts: timestamp (nullable = false)



## Merge table

In [None]:
# NOTE(review): this merge step is disabled. As written it names `customer_id`
# as the merge column, but `df` has no such column (see the printed schema
# above), so presumably it cannot serve as the merge key. `transaction_id`,
# possibly together with `item_id`, looks like the intended key (TODO confirm).
# `merge_table` and `uc_path` are not defined in this notebook; presumably
# they come from the wildcard `src.transform` imports.
# merge_table(
#     df=df,
#     uc_path=uc_path(catalog, schema, "master_table"),
#     merge_columns=["customer_id"],
#     spark=spark
# )