In [1]:
# Import necessary libraries
from delta.tables import DeltaTable # Ensure this is imported for Delta operations
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col,
    lower,
    monotonically_increasing_id,
    row_number,
    trim,
)

StatementMeta(, e2a4c304-5dd3-4d05-814d-070c306484b7, 3, Finished, Available, Finished)

In [2]:
# Load the Silver-layer product table that feeds the dim_product build.
df_product_raw = spark.read.table("Silver_Data.product.silver_product_data")

StatementMeta(, e2a4c304-5dd3-4d05-814d-070c306484b7, 4, Finished, Available, Finished)

In [3]:
# -------------------------------------------
# Data Transformation to create dim_product
# -------------------------------------------

# Select the dim_product columns from the raw product data and cast each one
# to the type expected by the dim_product schema. No columns are renamed.

# The surrogate key is numbered over one global ordering by the natural key.
# monotonically_increasing_id() only guarantees unique, increasing values: it
# encodes the partition index in the upper bits, so the ids are sparse and
# change whenever the partitioning changes. row_number() gives the dense
# 1, 2, 3, ... sequence the key is meant to have.
# NOTE(review): a window without partitionBy moves every row to a single
# partition; this assumes the product dimension is small enough for that,
# and that product_id is unique (ties would be ordered arbitrarily) - confirm.
product_key_window = Window.orderBy(col("product_id").cast("string"))

df_product_clean = df_product_raw.select(
    # Surrogate key: product_key, dense and starting at 1.
    # row_number() returns an int; cast to long so the column keeps the type
    # the previous implementation produced.
    row_number().over(product_key_window).cast("long").alias("product_key"),

    # Natural Key: product_id (VARCHAR) - direct mapping
    col("product_id").cast("string"),

    # Product dimensions (DECIMAL): precision 10, scale 4
    col("product_length").cast("decimal(10,4)"),
    col("product_depth").cast("decimal(10,4)"),
    col("product_width").cast("decimal(10,4)"),

    # Product cluster identifier: cluster_id (VARCHAR)
    col("cluster_id").cast("string"),

    # Product hierarchy levels 1 to 5 (VARCHAR)
    col("hierarchy1_id").cast("string"),
    col("hierarchy2_id").cast("string"),
    col("hierarchy3_id").cast("string"),
    col("hierarchy4_id").cast("string"),
    col("hierarchy5_id").cast("string"),
)

StatementMeta(, e2a4c304-5dd3-4d05-814d-070c306484b7, 5, Finished, Available, Finished)

In [4]:
# Inspect the transformed DataFrame: column names and types first,
# then a five-row preview of the data.
print("Schema of dim_product DataFrame:")
df_product_clean.printSchema()

print("Sample of dim_product data:")
df_product_clean.show(n=5)

StatementMeta(, e2a4c304-5dd3-4d05-814d-070c306484b7, 6, Finished, Available, Finished)

Schema of dim_product DataFrame:
root
 |-- product_key: long (nullable = false)
 |-- product_id: string (nullable = true)
 |-- product_length: decimal(10,4) (nullable = true)
 |-- product_depth: decimal(10,4) (nullable = true)
 |-- product_width: decimal(10,4) (nullable = true)
 |-- cluster_id: string (nullable = true)
 |-- hierarchy1_id: string (nullable = true)
 |-- hierarchy2_id: string (nullable = true)
 |-- hierarchy3_id: string (nullable = true)
 |-- hierarchy4_id: string (nullable = true)
 |-- hierarchy5_id: string (nullable = true)

Sample of dim_product data:
+-----------+----------+--------------+-------------+-------------+----------+-------------+-------------+-------------+-------------+-------------+
|product_key|product_id|product_length|product_depth|product_width|cluster_id|hierarchy1_id|hierarchy2_id|hierarchy3_id|hierarchy4_id|hierarchy5_id|
+-----------+----------+--------------+-------------+-------------+----------+-------------+-------------+-------------+-------

In [6]:
# -------------------------------------------
# Write the cleaned data to the Gold layer
# -------------------------------------------
# mode("overwrite") replaces the table's existing contents on each run.
(
    df_product_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("Gold_Data.dim_product.dim_product")
)

StatementMeta(, e2a4c304-5dd3-4d05-814d-070c306484b7, 8, Finished, Available, Finished)

In [8]:
%%sql
-- Preview up to 10 rows of the Gold dim_product table.
SELECT *
FROM Gold_Data.dim_product.dim_product
LIMIT 10

StatementMeta(, e2a4c304-5dd3-4d05-814d-070c306484b7, 10, Finished, Available, Finished)

<Spark SQL result set with 10 rows and 11 fields>