In [8]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, trim, monotonically_increasing_id, sha2, concat_ws, trim 
from delta.tables import DeltaTable 

StatementMeta(, 79f650c9-c483-487e-8e47-84a532b9086a, 10, Finished, Available, Finished)

In [9]:
# Initialize Spark Session (if not already initialized by Fabric)
spark = SparkSession.builder.getOrCreate()

StatementMeta(, 79f650c9-c483-487e-8e47-84a532b9086a, 11, Finished, Available, Finished)

In [10]:
# -------------------------------------------
# Leitura dos dados da camada Silver (sales)
# -------------------------------------------
df_sales_raw = spark.table("Silver_Data.sales.silver_sales_data")

# -------------------------------------------
# Data Transformation to create dim_promotion
# -------------------------------------------

# Select relevant promotion columns from the sales_raw data
# and ensure they are trimmed and lowercased for consistency before deduping.
df_promotion_distinct = df_sales_raw.select(
    trim(lower(col("promo_type_1"))).alias("promo_type_1"),
    trim(lower(col("promo_bin_1"))).alias("promo_bin_1"),
    trim(lower(col("promo_type_2"))).alias("promo_type_2"),
    trim(lower(col("promo_bin_2"))).alias("promo_bin_2"),
    col("promo_discount_2").cast("decimal(10,4)").alias("promo_discount_2"), # Assuming DECIMAL(10,4)
    trim(lower(col("promo_discount_type_2"))).alias("promo_discount_type_2")
).distinct() # Get unique combinations of these promotion attributes

# Generate a surrogate key for dim_promotion.
df_promotion_clean = df_promotion_distinct.withColumn(
    "promotion_key",
    (monotonically_increasing_id() + 1).cast("int") # Add 1 to start from 1
)

# Reorder columns to match the desired schema and ensure correct data types
df_promotion_clean = df_promotion_clean.select(
    col("promotion_key"),
    col("promo_type_1").cast("string"),
    col("promo_bin_1").cast("string"),
    col("promo_type_2").cast("string"),
    col("promo_bin_2").cast("string"),
    col("promo_discount_2").cast("decimal(10,4)"),
    col("promo_discount_type_2").cast("string")
)

StatementMeta(, 79f650c9-c483-487e-8e47-84a532b9086a, 12, Finished, Available, Finished)

In [2]:
# Write data to gold layer
df_promotion_clean.write.format("delta").mode("overwrite").saveAsTable("Gold_Data.dim_promotion.dim_promotion")

StatementMeta(, 8432a4c4-e649-49fa-9fef-fba2890035e2, -1, SessionError, , SessionError)

In [12]:
# # Display schema and a sample of the cleaned data
# print("Schema of dim_promotion DataFrame:")
# df_promotion_clean.printSchema()
# print("Sample of dim_promotion data:")
# df_promotion_clean.show(100, truncate=False) # truncate=False to see full string values

StatementMeta(, 79f650c9-c483-487e-8e47-84a532b9086a, 14, Finished, Available, Finished)

Schema of dim_promotion DataFrame:
root
 |-- promotion_key: integer (nullable = false)
 |-- promo_type_1: string (nullable = true)
 |-- promo_bin_1: string (nullable = true)
 |-- promo_type_2: string (nullable = true)
 |-- promo_bin_2: string (nullable = true)
 |-- promo_discount_2: decimal(10,4) (nullable = true)
 |-- promo_discount_type_2: string (nullable = true)

Sample of dim_promotion data:
+-------------+------------+-----------+------------+-----------+----------------+---------------------+
|promotion_key|promo_type_1|promo_bin_1|promo_type_2|promo_bin_2|promo_discount_2|promo_discount_type_2|
+-------------+------------+-----------+------------+-----------+----------------+---------------------+
|1            |pr17        |veryhigh   |pr03        |na         |0.0000          |na                   |
|2            |pr14        |na         |pr02        |verylow    |20.0000         |pr02                 |
|3            |pr11        |verylow    |pr03        |na         |0.0000    