#### Day 8

In [0]:
%sql
-- Create the top-level catalog for this project (sort of like a domain boundary).
-- IF NOT EXISTS makes the cell safe to re-run.
CREATE CATALOG IF NOT EXISTS ecommerce_catalog;

In [0]:
%sql
-- Make ecommerce_catalog the current catalog so the unqualified schema
-- names used in the following cells resolve inside it.
USE CATALOG ecommerce_catalog;


Create Schemas (Bronze, Silver, Gold) - Mirroring the medallion architecture

In [0]:
%sql
-- One schema per medallion layer. The names are unqualified, so they are
-- created in the current catalog; IF NOT EXISTS keeps the cell re-runnable.
CREATE SCHEMA IF NOT EXISTS bronze;
CREATE SCHEMA IF NOT EXISTS silver;
CREATE SCHEMA IF NOT EXISTS gold;

In [0]:
%sql
-- Verify that the schemas have been created: bronze, silver and gold
-- should appear in the listing for the current catalog.

SHOW SCHEMAS;


databaseName
bronze
default
gold
information_schema
silver


In [0]:
%sql
-- List the available catalogs; ecommerce_catalog should be among them.
SHOW CATALOGS;


catalog
ecommerce_catalog
samples
system
workspace


In [0]:
%sql
-- List the volumes defined in the workspace.ecommerce schema
-- (this shows the volumes themselves, not the files inside them).

SHOW VOLUMES IN workspace.ecommerce;


database,volume_name
ecommerce,ecommerce_data


In [0]:
%sql
-- List the files and folders stored at the volume path
-- (raw CSV exports and previously written Delta folders).

LIST '/Volumes/workspace/ecommerce/ecommerce_data';


path,name,size,modification_time
/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv,2019-Nov.csv,9006762395,1767973029000
/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv,2019-Oct.csv,5668612855,1767973179000
/Volumes/workspace/ecommerce/ecommerce_data/df_oct_delta/,df_oct_delta/,0,1768673000920
/Volumes/workspace/ecommerce/ecommerce_data/df_oct_delta_day_5/,df_oct_delta_day_5/,0,1768673000920
/Volumes/workspace/ecommerce/ecommerce_data/df_oct_delta_day_6/,df_oct_delta_day_6/,0,1768673000920


Use catalogs & schema

In [0]:
%sql
-- Set the current catalog and schema so that unqualified table names
-- resolve to ecommerce_catalog.bronze.
USE CATALOG ecommerce_catalog;
USE SCHEMA bronze;


Create Tables by Writing Data - First Bronze

In [0]:
# Load the raw October 2019 export from the volume. The first row is treated
# as the header and column types are inferred from the data.
df_oct = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

# Persist the raw events as the Bronze Delta table, replacing any existing
# contents of ecommerce_catalog.bronze.events.
bronze_writer = df_oct.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable("ecommerce_catalog.bronze.events")


Follow the same procedure for Silver: read the Bronze table, clean and de-duplicate it, then write the result as a Silver table

In [0]:
from pyspark.sql.functions import col

# ==========================
# 1) Read Bronze table
# ==========================
df_bronze = spark.table("ecommerce_catalog.bronze.events")

# ==========================
# 2) Clean & Validate Data
# (same as Day 6)
# ==========================
# Keep only rows that carry the core event fields; a row without a
# timestamp, an event type or a user cannot be attributed to anything.
df_silver = df_bronze.filter(
    col("event_time").isNotNull()
    & col("event_type").isNotNull()
    & col("user_id").isNotNull()
)

# Fill null text fields with an explicit placeholder so that grouping by
# brand / category_code downstream does not produce a NULL bucket.
df_silver = df_silver.na.fill({
    "brand": "Not_Available",
    "category_code": "Not_Available"
})

# ==========================
# 3) Drop Duplicates
# ==========================
# Remove exact duplicate rows only. De-duplicating on just
# (user_session, event_time) would also discard *distinct* events that share
# a session and timestamp (e.g. a cart and a purchase in the same second, or
# every event whose user_session is NULL), keeping an arbitrary survivor and
# under-counting the Gold metrics built on top of this table.
df_silver = df_silver.dropDuplicates()

# ==========================
# 4) Write Silver table
# ==========================
# Overwrite so the cell is re-runnable and always reflects the current Bronze data.
df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce_catalog.silver.events")


Follow the same process for Gold

In [0]:
from pyspark.sql import functions as F

# --- Input: cleaned Silver events -------------------------------------------
silver = spark.table("ecommerce_catalog.silver.events")

event_type = F.col("event_type")
session_id = F.col("user_session")

# --- Per-category metrics (same as Day 6) -----------------------------------
# One distinct-session counter per funnel stage. when() without otherwise()
# yields NULL for non-matching rows, and countDistinct ignores NULLs.
session_counts = [
    F.countDistinct(F.when(event_type == stage, session_id)).alias(metric)
    for stage, metric in (
        ("view", "unique_views"),
        ("cart", "unique_carts"),
        ("purchase", "unique_purchases"),
    )
]

# Revenue only counts the price of purchase events.
revenue = F.sum(F.when(event_type == "purchase", F.col("price"))).alias("total_revenue")

# Purchases as a percentage of carts, rounded to two decimals.
purchase_pct_of_carts = F.round(
    F.col("unique_purchases") / F.col("unique_carts") * 100, 2
)

category_perf = (
    silver.groupBy("category_code")
    .agg(*session_counts, revenue)
    .withColumn(
        "cart_to_purchase_ratio",
        # Guard against division by zero: categories with no carts report 0.
        F.when(F.col("unique_carts") > 0, purchase_pct_of_carts).otherwise(0),
    )
)

# --- Output: Gold table ------------------------------------------------------
gold_writer = category_perf.write.format("delta").mode("overwrite")
gold_writer.saveAsTable("ecommerce_catalog.gold.category_performance")


Check if your data tables are inside your catalog/schemas

We did this using:
.saveAsTable("ecommerce_catalog.bronze.events")
.saveAsTable("ecommerce_catalog.silver.events")
.saveAsTable("ecommerce_catalog.gold.category_performance")

Verify that each table is listed under its schema:

In [0]:
%sql
-- The Bronze schema should list the events table written from the raw CSV.
SHOW TABLES IN ecommerce_catalog.bronze;


database,tableName,isTemporary
bronze,events,False


In [0]:
%sql
-- The Silver schema should list the cleaned events table.
SHOW TABLES IN ecommerce_catalog.silver;

database,tableName,isTemporary
silver,events,False


In [0]:
%sql
-- The Gold schema should list the category_performance aggregate table.
SHOW TABLES IN ecommerce_catalog.gold;

database,tableName,isTemporary
gold,category_performance,False


Set Up Permissions

Example: Read-only access to _Silver_ for me

Identify current user

In [0]:
%sql
-- Look up the principal name of the user running this notebook; it is the
-- grantee used in the GRANT statements that follow.
SELECT current_user();


current_user()
maisondemaitre+databricks@hotmail.com


In [0]:
%sql
-- Read-only access to everything in the Silver schema.
-- In Unity Catalog, SELECT is only usable when the grantee also holds
-- USE CATALOG on the parent catalog and USE SCHEMA on the parent schema,
-- so grant those alongside it. The usage privileges expose no data by themselves.
GRANT USE CATALOG
ON CATALOG ecommerce_catalog
TO `maisondemaitre+databricks@hotmail.com`;

GRANT USE SCHEMA
ON SCHEMA ecommerce_catalog.silver
TO `maisondemaitre+databricks@hotmail.com`;

-- SELECT on the schema is inherited by every current and future table/view in it.
GRANT SELECT
ON SCHEMA ecommerce_catalog.silver
TO `maisondemaitre+databricks@hotmail.com`;


Grant Gold read-only access to me

In [0]:
%sql
-- Read-only access to everything in the Gold schema.
-- In Unity Catalog, SELECT is only usable when the grantee also holds
-- USE CATALOG on the parent catalog and USE SCHEMA on the parent schema,
-- so grant those alongside it. The usage privileges expose no data by themselves.
GRANT USE CATALOG
ON CATALOG ecommerce_catalog
TO `maisondemaitre+databricks@hotmail.com`;

GRANT USE SCHEMA
ON SCHEMA ecommerce_catalog.gold
TO `maisondemaitre+databricks@hotmail.com`;

-- SELECT on the schema is inherited by every current and future table/view in it.
GRANT SELECT
ON SCHEMA ecommerce_catalog.gold
TO `maisondemaitre+databricks@hotmail.com`;


Create Views for Controlled Access (hide raw columns)

In [0]:
%sql
-- Curated, reader-facing projection of the Gold category table: it exposes
-- the purchase and revenue metrics and leaves out the remaining columns.
-- OR REPLACE keeps the cell re-runnable.
CREATE OR REPLACE VIEW ecommerce_catalog.gold.v_category_summary AS
SELECT
    cp.category_code,
    cp.unique_purchases,
    cp.total_revenue,
    cp.cart_to_purchase_ratio
FROM ecommerce_catalog.gold.category_performance AS cp;


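Grant SELECT on the view only, so that a reader sees the curated columns without needing a grant on the underlying table. Note that the schema-level SELECT on gold granted earlier already covers both this view and its base table; for genuinely view-only access, grant on the view alone.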

In [0]:
%sql
-- View-level read access to the curated category summary.
-- In Unity Catalog, SELECT on a view is only usable when the grantee also
-- holds USE CATALOG on the parent catalog and USE SCHEMA on the parent
-- schema, so grant those alongside it. The usage privileges expose no data.
-- NOTE: a principal that also holds SELECT on the whole gold schema can still
-- query the base table directly; for view-only access, grant SELECT on the
-- view alone.
GRANT USE CATALOG
ON CATALOG ecommerce_catalog
TO `maisondemaitre+databricks@hotmail.com`;

GRANT USE SCHEMA
ON SCHEMA ecommerce_catalog.gold
TO `maisondemaitre+databricks@hotmail.com`;

GRANT SELECT
ON VIEW ecommerce_catalog.gold.v_category_summary
TO `maisondemaitre+databricks@hotmail.com`;
