In [23]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    sum,
    avg,
    when,
    trim,
    lower,
    date_format,
    coalesce,
    lit,
    max
)
from delta.tables import DeltaTable # Ensure this is imported for Delta operations


# Reuse the Spark session that is already active (Fabric normally provides one);
# getOrCreate() only builds a new session when none exists.
spark = (
    SparkSession.builder
    .getOrCreate()
)

StatementMeta(, e7f24ce9-8427-49e4-b28d-7f23932b936f, 25, Finished, Available, Finished)

In [24]:
# -------------------------------------------
# Source tables
# -------------------------------------------
# Sales rows are read from the Silver layer.
df_sales_raw = spark.table("Silver_Data.sales.silver_sales_data")

# The four dimension tables are read from the Gold layer, in the order listed.
(
    df_dim_date,
    df_dim_store,
    df_dim_product,
    df_dim_promotion,
) = (
    spark.table(table_name)
    for table_name in (
        "Gold_Data.dim_date.dim_date",
        "Gold_Data.dim_stores.dim_stores",
        "Gold_Data.dim_product.dim_product",
        "Gold_Data.dim_promotion.dim_promotion",
    )
)

# -------------------------------------------
# Build fact_sales from the Silver sales rows
# -------------------------------------------

# 1. Prepare the sales rows for the dimension lookups:
#    - each natural key gets a lower-cased, trimmed copy named "<column>_lower";
#    - 'date' is cast to DATE as 'full_date_format' for the dim_date lookup.
df_sales_prepared = df_sales_raw
for natural_key in (
    "product_id",
    "store_id",
    "promo_type_1",
    "promo_bin_1",
    "promo_type_2",
    "promo_bin_2",
    "promo_discount_type_2",
):
    df_sales_prepared = df_sales_prepared.withColumn(
        natural_key + "_lower", trim(lower(col(natural_key)))
    )
df_sales_prepared = df_sales_prepared.withColumn("full_date_format", col("date").cast("date"))


# 2. Look up date_id in dim_date.
# The sales date (cast to DATE as 'full_date_format') is compared with dim_date.full_date.
# A left join keeps sales rows whose date has no match in the dimension.
date_match = df_sales_prepared["full_date_format"] == df_dim_date["full_date"]
df_fact = (
    df_sales_prepared
    .join(df_dim_date, on=date_match, how="left")
    .select(df_sales_prepared["*"], df_dim_date["date_id"])
)


# 3. Look up store_key in dim_stores using the normalized store id.
# NOTE(review): assumes dim_stores.store_id is already lower-cased and trimmed - confirm.
store_match = df_fact["store_id_lower"] == df_dim_store["store_id"]
df_fact = (
    df_fact
    .join(df_dim_store, on=store_match, how="left")
    .select(df_fact["*"], df_dim_store["store_key"])
)


# 4. Look up product_key in dim_product using the normalized product id.
# NOTE(review): assumes dim_product.product_id is already lower-cased and trimmed - confirm.
product_match = df_fact["product_id_lower"] == df_dim_product["product_id"]
df_fact = (
    df_fact
    .join(df_dim_product, on=product_match, how="left")
    .select(df_fact["*"], df_dim_product["product_key"])
)


# 5. Look up promotion_key in dim_promotion.
# A promotion is identified by the full combination of its attributes, so every attribute
# takes part in the join condition.
# The comparison is null-safe (SQL <=>): with a plain ==, NULL never equals NULL, so a sales
# row with any missing promotion attribute would find no match even when the dimension holds
# a row with exactly the same (NULL) attribute. For non-NULL values the result is identical
# to ==.
# NOTE(review): assumes dim_promotion stores missing attributes as NULL and holds at most one
# row per attribute combination - confirm against the dimension build.
promotion_match = (
    df_fact["promo_type_1_lower"].eqNullSafe(df_dim_promotion["promo_type_1"])
    & df_fact["promo_bin_1_lower"].eqNullSafe(df_dim_promotion["promo_bin_1"])
    & df_fact["promo_type_2_lower"].eqNullSafe(df_dim_promotion["promo_type_2"])
    & df_fact["promo_bin_2_lower"].eqNullSafe(df_dim_promotion["promo_bin_2"])
    & df_fact["promo_discount_2"].eqNullSafe(df_dim_promotion["promo_discount_2"])
    & df_fact["promo_discount_type_2_lower"].eqNullSafe(df_dim_promotion["promo_discount_type_2"])
)
df_fact = (
    df_fact
    .join(df_dim_promotion, on=promotion_match, how="left")
    .select(df_fact["*"], df_dim_promotion["promotion_key"])
)

# Rows that found no match in a dimension have a NULL key after the left joins.
# Replace those NULLs with -1 (the "unknown member" convention) and store every key as int.
for key_column in ("date_id", "store_key", "product_key", "promotion_key"):
    df_fact = df_fact.withColumn(
        key_column,
        coalesce(col(key_column), lit(-1)).cast("int"),
    )


# 6. Aggregate the measures at the grain date x store x product x promotion.
df_fact_sales = (
    df_fact
    .groupBy("date_id", "store_key", "product_key", "promotion_key")
    .agg(
        # Units sold: sum of 'sales' over the rows of the group.
        sum("sales").alias("quantities_sold"),
        # Revenue: sum of 'revenue' over the rows of the group.
        sum("revenue").alias("revenue"),
        # Price: plain (unweighted) mean of the row-level 'price'.
        # This is NOT SUM(revenue) / SUM(sales); the two differ when a group has several rows.
        avg("price").alias("price"),
        # Stock: the largest 'stock' value in the group.
        # NOTE(review): presumably 'stock' is an end-of-day snapshot with one row per group,
        # in which case max() simply returns that value - confirm.
        max("stock").alias("stock_quantity"),
        # Promotion flag: max() over the boolean is true when at least one row of the group
        # has the flag set.
        # NOTE(review): only the promo type 1 weekly flag feeds this column - confirm that is
        # the intended definition of "promotion active".
        max(col("is_promo_type_1_active_this_week").cast("boolean")).alias("is_promotion_active"),
    )
)

# Final projection: fixes the column order and the data type of each fact table column.
fact_sales_schema = [
    ("date_id", "int"),
    ("store_key", "int"),
    ("product_key", "int"),
    ("promotion_key", "int"),
    ("quantities_sold", "decimal(18,2)"),  # unit counts may be fractional
    ("revenue", "decimal(18,2)"),          # adjust precision as needed
    ("price", "decimal(18,4)"),            # keeps four decimals for the averaged price
    ("stock_quantity", "int"),             # assumes stock is a whole number
    ("is_promotion_active", "boolean"),
]
df_fact_sales = df_fact_sales.select(
    *[col(column_name).cast(column_type) for column_name, column_type in fact_sales_schema]
)

StatementMeta(, e7f24ce9-8427-49e4-b28d-7f23932b936f, 26, Finished, Available, Finished)

In [25]:
# Optional preview, currently disabled: uncomment the lines below to print the schema and a 5-row sample of df_fact_sales.
# print("Schema of fact_sales DataFrame:")
# df_fact_sales.printSchema()
# print("Sample of fact_sales data:")
# df_fact_sales.show(5)

StatementMeta(, e7f24ce9-8427-49e4-b28d-7f23932b936f, 27, Finished, Available, Finished)

Schema of fact_sales DataFrame:
root
 |-- date_id: integer (nullable = false)
 |-- store_key: integer (nullable = false)
 |-- product_key: integer (nullable = false)
 |-- promotion_key: integer (nullable = false)
 |-- quantities_sold: decimal(18,2) (nullable = true)
 |-- revenue: decimal(18,2) (nullable = true)
 |-- price: decimal(18,4) (nullable = true)
 |-- stock_quantity: integer (nullable = true)
 |-- is_promotion_active: boolean (nullable = true)

Sample of fact_sales data:
+--------+---------+-----------+-------------+---------------+-------+--------+--------------+-------------------+
| date_id|store_key|product_key|promotion_key|quantities_sold|revenue|   price|stock_quantity|is_promotion_active|
+--------+---------+-----------+-------------+---------------+-------+--------+--------------+-------------------+
|20191013|        1|         12|           10|           0.00|   0.00|119.9000|             0|              false|
|20190507|        1|         40|           10|          

In [26]:
# -------------------------------------------
# Write the fact table to the Gold layer
# -------------------------------------------
# "overwrite" mode replaces the existing contents of the Delta table on every run.
(
    df_fact_sales.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("Gold_Data.fact_sales.fact_sales")
)

StatementMeta(, e7f24ce9-8427-49e4-b28d-7f23932b936f, 28, Finished, Available, Finished)

# Quick sanity check: fact rows that fell back to the -1 (unknown) dimension key

In [27]:
# %%sql
# SELECT * FROM Gold_Data.fact_sales.fact_sales WHERE promotion_key = -1 OR date_id = -1 OR store_key = -1 OR product_key = -1

StatementMeta(, e7f24ce9-8427-49e4-b28d-7f23932b936f, 29, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 9 fields>