# Initialization

In [1]:
# Echo the SparkSession so its summary renders as the cell output.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it
# is injected by the kernel / startup configuration — confirm.
spark

In [2]:
import os
import sys

# Add project working directory to PATH
# The folder comes from the PROJECT_FOLDER environment variable. It is only
# appended when it is actually set (os.getenv returns None otherwise, and None
# must not end up on sys.path) and when it is not already present, so that
# re-running this cell does not keep adding duplicate entries.
_project_root = os.getenv("PROJECT_FOLDER")
if _project_root and _project_root not in sys.path:
    sys.path.append(_project_root)

from src.utils import sql, register_spark_session

# Register spark as global session
# Passes the notebook's `spark` object to the `register_spark_session` helper
# imported from src.utils.
# NOTE(review): presumably this lets project helpers such as `sql` reuse the
# same session — confirm against src/utils.
register_spark_session(spark)

In [3]:
# Print our environments, including assigning and printing the values
# Each `(NAME := os.getenv(...))` both binds a notebook-level variable and
# interpolates its value into the printed text, so later cells can use CATALOG,
# RAW_SCHEMA, CURATED_SCHEMA, PROJECT_FOLDER, STORAGE_FOLDER and UC_HOME.
# A variable that is not set in the environment is bound to (and printed as) None.
# The UC_HOME line must bind UC_HOME, not STORAGE_FOLDER: rebinding
# STORAGE_FOLDER here would redirect every later `LOCATION '{STORAGE_FOLDER}/...'`
# to the Unity Catalog installation directory.
print(f"""
    CATALOG: {(CATALOG := os.getenv("CATALOG"))}
    RAW_SCHEMA: {(RAW_SCHEMA := os.getenv("RAW_SCHEMA"))}
    CURATED_SCHEMA: {(CURATED_SCHEMA := os.getenv("CURATED_SCHEMA"))}
    PROJECT_FOLDER: {(PROJECT_FOLDER := os.getenv("PROJECT_FOLDER"))}
    STORAGE_FOLDER: {(STORAGE_FOLDER := os.getenv("STORAGE_FOLDER"))}
    UC_HOME: {(UC_HOME := os.getenv("UC_HOME"))}
""")


    CATALOG: unity
    RAW_SCHEMA: raw
    CURATED_SCHEMA: curated
    PROJECT_FOLDER: /home/khoa-le/data/working/projects/unity-catalog-project
    STORAGE_FOLDER: /home/khoa-le/data/storage/unity-catalog-project
    UC_HOME: /home/khoa-le/data/app/uc/unitycatalog



# Show Catalogs

In [4]:
# List the catalogs visible to this Spark session.
# There are two catalogs by default: spark_catalog and unity
spark.sql("SHOW CATALOGS;").show()

+-------------+
|      catalog|
+-------------+
|spark_catalog|
|        unity|
+-------------+



In [5]:
# List the schemas of the configured catalog (CATALOG comes from the environment).
# On a fresh setup the `unity` catalog is expected to contain only `default`;
# the captured output also lists `raw` because that schema had already been
# created when this cell was last executed.
spark.sql(f"SHOW SCHEMAS FROM {CATALOG};").show()

+---------+
|namespace|
+---------+
|  default|
|      raw|
+---------+



# Create unity.raw schema

This schema is used to store raw, unprocessed tables.

In [10]:
# Create the raw schema inside the configured catalog.
# IF NOT EXISTS keeps the cell safe to re-run; the backticks quote the catalog
# and schema identifiers that are interpolated from the environment-driven
# CATALOG and RAW_SCHEMA variables.
spark.sql(f"""
CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.`{RAW_SCHEMA}`
""")

DataFrame[]

In [11]:
# With raw schema added, we now have both `raw` and `default`
# (re-lists the catalog's schemas to confirm that the schema exists)
spark.sql(f"SHOW SCHEMAS FROM {CATALOG};").show()

+---------+
|namespace|
+---------+
|  default|
|      raw|
+---------+



## Create tables under raw schema

In [15]:
# Create tables under this raw schema
# DDL for the Delta table `customers` in <CATALOG>.<RAW_SCHEMA>.
# - IF NOT EXISTS makes the cell re-runnable without failing on an existing table.
# - LOCATION places the table data under STORAGE_FOLDER/<catalog>/<schema>/customers.
# - TBLPROPERTIES tags the table with a 'domain' property.
create_customers = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.{RAW_SCHEMA}.customers (
    customer_id STRING,
    gender STRING,
    first_name STRING,
    last_name STRING,
    email STRING,
    yob INTEGER,
    phone_number STRING,
    job STRING,
    address STRING,
    first_transaction TIMESTAMP,
    membership STRING,
    last_processed_ts TIMESTAMP
)
USING DELTA
LOCATION '{STORAGE_FOLDER}/{CATALOG}/{RAW_SCHEMA}/customers'
COMMENT 'This table stores customer personal information'
TBLPROPERTIES ('domain' = 'customer')
"""

# Execute the SQL query
spark.sql(create_customers)

DataFrame[]

In [17]:
# DDL for the Delta table `staffs` in <CATALOG>.<RAW_SCHEMA>.
# IF NOT EXISTS makes the cell re-runnable; the table data is placed under
# STORAGE_FOLDER/<catalog>/<schema>/staffs and tagged with domain 'staff'.
create_staffs = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.{RAW_SCHEMA}.staffs (
    staff_id STRING,
    gender STRING,
    first_name STRING,
    last_name STRING,
    store_id STRING,
    last_processed_ts TIMESTAMP
)
USING DELTA
LOCATION '{STORAGE_FOLDER}/{CATALOG}/{RAW_SCHEMA}/staffs'
COMMENT 'This table stores staff information'
TBLPROPERTIES ('domain' = 'staff')
"""

# Execute the SQL query
spark.sql(create_staffs)

DataFrame[]

In [18]:
# DDL for the Delta table `stores` in <CATALOG>.<RAW_SCHEMA>.
# IF NOT EXISTS makes the cell re-runnable; the table data is placed under
# STORAGE_FOLDER/<catalog>/<schema>/stores and tagged with domain 'store'.
# NOTE(review): this table declares no store identifier column, while
# `staffs.store_id` and `transactions.store` look like references to a store —
# confirm which column is the intended join key.
create_stores = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.{RAW_SCHEMA}.stores (
    name STRING,
    address STRING,
    phone STRING,
    email STRING,
    last_processed_ts TIMESTAMP
)
USING DELTA
LOCATION '{STORAGE_FOLDER}/{CATALOG}/{RAW_SCHEMA}/stores'
COMMENT 'This table stores store information'
TBLPROPERTIES ('domain' = 'store')
"""

# Execute the SQL query
spark.sql(create_stores)

DataFrame[]

In [19]:
# DDL for the Delta table `products` in <CATALOG>.<RAW_SCHEMA>.
# IF NOT EXISTS makes the cell re-runnable; the table data is placed under
# STORAGE_FOLDER/<catalog>/<schema>/products and tagged with domain 'product'.
# NOTE(review): unit_price is an integer type (LONG) — presumably prices are
# stored in whole currency units; confirm against the source data.
create_products = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.{RAW_SCHEMA}.products (
    product_id STRING,
    category STRING,
    product_name STRING,
    unit_price LONG,
    last_processed_ts TIMESTAMP
)
USING DELTA
LOCATION '{STORAGE_FOLDER}/{CATALOG}/{RAW_SCHEMA}/products'
COMMENT 'This table stores product information'
TBLPROPERTIES ('domain' = 'product')
"""

# Execute the SQL query
spark.sql(create_products)

DataFrame[]

In [20]:
# DDL for the Delta table `transactions` in <CATALOG>.<RAW_SCHEMA>.
# IF NOT EXISTS makes the cell re-runnable; the table data is placed under
# STORAGE_FOLDER/<catalog>/<schema>/transactions and tagged with domain 'transaction'.
# NOTE(review): utc_dt is declared STRING rather than TIMESTAMP — presumably the
# raw value is kept unparsed at this layer; confirm.
create_transactions = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.{RAW_SCHEMA}.transactions (
    transaction_id STRING,
    item_id STRING,
    item_order INT,
    store STRING,
    customer_id STRING,
    staff_id STRING,
    quantity INT,
    utc_dt STRING,
    last_processed_ts TIMESTAMP
)
USING DELTA
LOCATION '{STORAGE_FOLDER}/{CATALOG}/{RAW_SCHEMA}/transactions'
COMMENT 'This table stores transaction information'
TBLPROPERTIES ('domain' = 'transaction')
"""

# Execute the SQL query
spark.sql(create_transactions)

DataFrame[]

## Create volumes under raw schema

In [5]:
%%sh

# Run the Unity Catalog CLI from its installation directory.
# `${VAR:?message}` aborts the script with a clear error when the variable is
# unset or empty, instead of silently changing to $HOME and failing later.
cd "${UC_HOME:?UC_HOME must be set}" || exit 1

# Define common paths and names
# The same guard is applied here: with an unset variable the path would collapse
# to something like "//volumes" and mkdir -p would create folders under "/".
VOLUME_BASE_PATH="${STORAGE_FOLDER:?STORAGE_FOLDER must be set}/${CATALOG:?CATALOG must be set}/${RAW_SCHEMA:?RAW_SCHEMA must be set}/volumes"

# Create folder for storing volumes
mkdir -p "${VOLUME_BASE_PATH}/txt_files"
mkdir -p "${VOLUME_BASE_PATH}/json_files"

# Function to create volume
#   $1 - volume name (registered as <CATALOG>.<RAW_SCHEMA>.<name>)
#   $2 - storage location of the volume
#   $3 - free-text comment stored with the volume
# When the CLI call fails, a short notice is printed instead of aborting the cell.
create_volume() {
    local volume_name="$1"
    local storage_path="$2"
    local comment="$3"

    bin/uc volume create \
        --full_name "${CATALOG}.${RAW_SCHEMA}.${volume_name}" \
        --storage_location "${storage_path}" \
        --comment "${comment}" \
    || echo "Volume ${volume_name} has already existed"
}

# Create volumes
create_volume "json_files" "${VOLUME_BASE_PATH}/json_files" "This volume is used to store json files"
create_volume "txt_files" "${VOLUME_BASE_PATH}/txt_files" "This volume is used to store text files"

Exception in thread "main" java.lang.RuntimeException: io.unitycatalog.client.ApiException: createVolume call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Volume already exists: unity.raw.json_files"}
.unitycatalog.cli.UnityCatalogCli.main(UnityCatalogCli.java:171)
piException: createVolume call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Volume already exists: unity.raw.json_files"}
VolumesApi.java:77).client.api.VolumesApi.getApiException(
va:117)unitycatalog.client.api.VolumesApi.createVolumeWithHttpInfo(VolumesApi.ja
	at io.unitycatalog.client.api.VolumesApi.createVolume(VolumesApi.java:95)
og.cli.VolumeCli.createVolume(VolumeCli.java:74)
java:40)nitycatalog.cli.VolumeCli.handle(VolumeCli.
	at io.unitycatalog.cli.UnityCatalogCli.main(UnityCatalo

Volume json_files has already existed


Exception in thread "main" java.lang.RuntimeException: io.unitycatalog.client.ApiException: createVolume call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Volume already exists: unity.raw.txt_files"}
	at io.unitycatalog.cli.UnityCatalogCli.main(UnityCatalogCli.java:171)
ient.ApiException: createVolume call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Volume already exists: unity.raw.txt_files"}
tion(VolumesApi.java:77)nt.api.VolumesApi.getApiExcep
pi.java:117)catalog.client.api.VolumesApi.createVolumeWithHttpInfo(VolumesA
	at io.unitycatalog.client.api.VolumesApi.createVolume(VolumesApi.java:95)
catalog.cli.VolumeCli.createVolume(VolumeCli.java:74)
eCli.java:40)atalog.cli.VolumeCli.handle(Volum
	at io.unitycatalog.cli.UnityCatalogCli.main(UnityCatalo

Volume txt_files has already existed


## Create function(s) under raw schema

In [38]:
%%sh

# Run the Unity Catalog CLI from its installation directory; abort with a clear
# error when UC_HOME is unset or empty instead of silently changing to $HOME.
cd "${UC_HOME:?UC_HOME must be set}" || exit 1

# Create a python function to hash text
# The fallback must be chained with `||` (run only when the CLI fails). A single
# `|` would pipe the CLI output into echo, which prints the message on every
# run — even when the function was created successfully — and discards the
# CLI's normal output.
bin/uc function create \
    --full_name "${CATALOG}.${RAW_SCHEMA}.hash_text" \
    --data_type STRING \
    --input_params "input_text STRING" \
    --comment "This function is used for hashing text" \
    --language "python" \
    --def "import uuid\nreturn str(uuid.uuid5(uuid.NAMESPACE_DNS, input_text))" \
    || echo "Function already existed"

Function already existed


Exception in thread "main" java.lang.RuntimeException: io.unitycatalog.client.ApiException: createFunction call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Function already exists: hash_text"}
atalog.cli.UnityCatalogCli.main(UnityCatalogCli.java:171)
tion: createFunction call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Function already exists: hash_text"}
.java:76)itycatalog.client.api.FunctionsApi.getApiException(FunctionsApi
16) io.unitycatalog.client.api.FunctionsApi.createFunctionWithHttpInfo(FunctionsApi.java:1
	at io.unitycatalog.client.api.FunctionsApi.createFunction(FunctionsApi.java:94)
alog.cli.FunctionCli.createFunction(FunctionCli.java:93)
FunctionCli.java:36)cli.FunctionCli.handle(
	at io.unitycatalog.cli.UnityCatalogCli.main(UnityCata

In [57]:
%%sh

# Run the Unity Catalog CLI from its installation directory; abort with a clear
# error when UC_HOME is unset or empty instead of silently changing to $HOME.
cd "${UC_HOME:?UC_HOME must be set}" || exit 1

# Create a function that concatenates first and last name into a full name
# The fallback must be chained with `||` (run only when the CLI fails). A single
# `|` would pipe the CLI output into echo, which prints the message on every
# run — even when the function was created successfully — and discards the
# CLI's normal output.
bin/uc function create \
    --full_name "${CATALOG}.${RAW_SCHEMA}.customer_full_name" \
    --data_type STRING \
    --input_params "first_name STRING, last_name STRING" \
    --comment "This function is used for concatenating into customer full name" \
    --def "return first_name + ' ' + last_name" \
    || echo "Function already existed"

Function already existed


Exception in thread "main" java.lang.RuntimeException: io.unitycatalog.client.ApiException: createFunction call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Function already exists: customer_full_name"}
io.unitycatalog.cli.UnityCatalogCli.main(UnityCatalogCli.java:171)
.ApiException: createFunction call failed with: 409 - {"error_code":"ALREADY_EXISTS","details":[{"reason":"ALREADY_EXISTS","metadata":{},"@type":"google.rpc.ErrorInfo"}],"stack_trace":null,"message":"Function already exists: customer_full_name"}
ption(FunctionsApi.java:76)api.FunctionsApi.getApiExce
unctionsApi.java:116)lient.api.FunctionsApi.createFunctionWithHttpInfo(F
4)t io.unitycatalog.client.api.FunctionsApi.createFunction(FunctionsApi.java:9
	at io.unitycatalog.cli.FunctionCli.createFunction(FunctionCli.java:93)
unctionCli.handle(FunctionCli.java:36)
java:139)itycatalog.cli.UnityCatalogCli.main(U

In [55]:
%%sh

# Run the Unity Catalog CLI from its installation directory; abort with a clear
# error when UC_HOME is unset or empty instead of silently changing to $HOME.
cd "${UC_HOME:?UC_HOME must be set}" || exit 1

# Call the customer_full_name function with sample first/last name arguments
bin/uc function call \
    --full_name "${CATALOG}.${RAW_SCHEMA}.customer_full_name" \
    --input_params "Adam, West"

"Adam West"

