In [2]:
# Run findspark first so the `pyspark` imports in the following cells resolve
# against the locally installed Spark distribution.
import findspark

findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType

# Build (or reuse) the notebook-wide Spark session, running locally on all cores.
# It loads the Delta Lake SQL extension and registers a Unity Catalog-backed
# catalog named "unity", which is also made the default catalog.
spark = (
    SparkSession.builder
    .appName("Delta-Unity-Catalog")
    .master("local[*]")
    # Maven coordinates fetched at session start: Delta Lake + Unity Catalog connector
    .config(
        "spark.jars.packages",
        "io.delta:delta-spark_2.12:3.2.1,io.unitycatalog:unitycatalog-spark_2.12:0.2.0",
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    # Both the built-in session catalog and the "unity" catalog use the UC implementation
    .config("spark.sql.catalog.spark_catalog", "io.unitycatalog.spark.UCSingleCatalog")
    .config("spark.sql.catalog.unity", "io.unitycatalog.spark.UCSingleCatalog")
    # Unity Catalog server endpoint; the token is left empty
    .config("spark.sql.catalog.unity.uri", "http://localhost:8080")
    .config("spark.sql.catalog.unity.token", "")
    .config("spark.sql.defaultCatalog", "unity")
    .getOrCreate()
)

24/12/15 21:27:20 WARN Utils: Your hostname, khoa-le-MS-7B19 resolves to a loopback address: 127.0.1.1; using 192.168.1.6 instead (on interface eno1)
24/12/15 21:27:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/khoa-le/.ivy2/cache
The jars for the packages stored in: /home/khoa-le/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
io.unitycatalog#unitycatalog-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5b6da8ec-3b6a-4e6a-9caa-1702d31c18b9;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/khoa-le/data/app/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.2.1 in central
	found io.delta#delta-storage;3.2.1 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found io.unitycatalog#unitycatalog-spark_2.12;0.2.0 in central
	found io.unitycatalog#unitycatalog-client;0.2.0 in central
	found org.slf4j#slf4j-api;2.0.13 in central
	found org.apache.logging.log4j#log4j-slf4j2-impl;2.23.1 in central
	found org.apache.logging.log4j#log4j-api;2.23.1 in central
	found org.apache.logging.log4j#log4j-core;2.23.1 in central
	found com.fasterxml.jackson.datatype#jackson-datatype-jsr310;2.17.0 in central
	found org.openapitools#jackson-databind-nullable;0.2.6 in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found com.fasterxml.jackson.core#jackson-databind;2.15.0 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.15.0 in central
	found com.fasterxml.jackson.core#jackson-core;2.15.0 in central
	found com.fasterxml.jackson.module#jackson-module-scala_2.12;2.15.0 in central
	found 

In [4]:
from src.transform.etl import *
from src.transform.common import *

from IPython.core.magic import register_cell_magic

In [5]:
# Register a `%%sql` cell magic backed by the notebook-level `spark` session
@register_cell_magic
def sql(line, cell=None):
    """Run a SQL statement through `spark.sql` and print the result with `show()`.

    Args:
        line: Text that follows `%%sql` on the magic line itself.
        cell: Body of the cell; used as the query when it is non-empty,
            otherwise `line` is used instead.

    Returns:
        Whatever `DataFrame.show()` returns for the query result.
    """
    statement = cell if cell else line
    result = spark.sql(statement)
    return result.show()

In [6]:
# Notebook-level names: `catalog` matches the catalog configured on the Spark
# session; `schema` is presumably the target schema for the transformed
# tables (not referenced in the cells shown here -- TODO confirm).
catalog, schema = "unity", "curated"

## Load raw tables

In [11]:
# Load the five raw tables from the `raw` schema as DataFrames.
# The catalog part of each identifier comes from the `catalog` variable declared
# earlier in the notebook rather than being repeated as a literal, so changing
# the catalog in one place retargets every read.
customers = spark.table(f"{catalog}.raw.customers")
staffs = spark.table(f"{catalog}.raw.staffs")
stores = spark.table(f"{catalog}.raw.stores")
products = spark.table(f"{catalog}.raw.products")
transactions = spark.table(f"{catalog}.raw.transactions")

In [12]:
# Expose every raw DataFrame to Spark SQL as a temp view named after its
# variable, so the `%%sql` cells below can reference them directly.
for view_name, raw_df in {
    "customers": customers,
    "staffs": staffs,
    "stores": stores,
    "products": products,
    "transactions": transactions,
}.items():
    raw_df.createOrReplaceTempView(view_name)

In [16]:
# Print column names and types of the raw transactions table before querying it
transactions.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_order: integer (nullable = true)
 |-- store: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- staff_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- utc_dt: string (nullable = true)
 |-- last_processed_ts: timestamp (nullable = true)



In [18]:
%%sql
-- Preview the raw transactions view; the %%sql magic prints the result with DataFrame.show()
SELECT t.*
FROM transactions AS t

+--------------------+--------------------+----------+---------------+--------------------+--------------------+--------+-------------------+--------------------+
|      transaction_id|             item_id|item_order|          store|         customer_id|            staff_id|quantity|             utc_dt|   last_processed_ts|
+--------------------+--------------------+----------+---------------+--------------------+--------------------+--------+-------------------+--------------------+
|c1c25f3771f2496ca...|c446411d441a4f288...|         1|         To Hit|7f3de301fbf4472e9...|56b20a9993c140fa9...|       2|2024-11-05T12:47:12|2024-12-15 21:11:...|
|c1c25f3771f2496ca...|d817251d1b7e42609...|         2|         To Hit|7f3de301fbf4472e9...|56b20a9993c140fa9...|       1|2024-11-05T12:47:12|2024-12-15 21:11:...|
|c1c25f3771f2496ca...|3381a129c1cc4500a...|         3|         To Hit|7f3de301fbf4472e9...|56b20a9993c140fa9...|       2|2024-11-05T12:47:12|2024-12-15 21:11:...|
|c1c25f3771f2496ca...|

In [33]:
%%sql
-- One row per transaction line item, enriched with product and customer attributes.
-- LEFT JOINs keep line items whose customer or product has no match (those columns are NULL).
SELECT
    t.transaction_id,
    t.item_id,
    t.quantity,
    p.product_name,
    p.category,
    p.unit_price,
    -- single-quoted string literal: double quotes denote identifiers in ANSI SQL
    c.first_name || ' ' || c.last_name AS customer_name,
    -- calendar-year difference; assumes year() can read c.yob (date-like value) -- TODO confirm
    year(current_date()) - year(c.yob) AS age,
    -- NOTE(review): utc_dt is a string column per printSchema(); it is not cast to a timestamp here
    t.utc_dt AS transaction_ts
FROM transactions AS t
LEFT JOIN customers AS c
    ON t.customer_id = c.customer_id
LEFT JOIN products AS p
    ON t.item_id = p.product_id

+--------------------+--------------------+--------+--------------------+---------+----------+------------------+---+-------------------+
|      transaction_id|             item_id|quantity|        product_name| category|unit_price|     customer_name|age|     transaction_ts|
+--------------------+--------------------+--------+--------------------+---------+----------+------------------+---+-------------------+
|c1c25f3771f2496ca...|c446411d441a4f288...|       2|   Red Boxing Gloves|    Sport|       283|      Scott Walton| 39|2024-11-05T12:47:12|
|c1c25f3771f2496ca...|d817251d1b7e42609...|       1|Practical Red Pla...|Furniture|       292|      Scott Walton| 39|2024-11-05T12:47:12|
|c1c25f3771f2496ca...|3381a129c1cc4500a...|       2|Generic Red Cotto...|Furniture|       184|      Scott Walton| 39|2024-11-05T12:47:12|
|c1c25f3771f2496ca...|d2bcd6de34c04c0b8...|       2|  Handmade Golf Club|    Sport|        14|      Scott Walton| 39|2024-11-05T12:47:12|
|c1c25f3771f2496ca...|0789bd727f4e