In [8]:
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, trim, monotonically_increasing_id, concat, lit, row_number
from pyspark.sql.window import Window

# -------------------------------------------
# Load the source data from the Silver layer
# -------------------------------------------
# NOTE(review): `spark` is not created in the visible cells - presumably the
# notebook runtime injects the active SparkSession; confirm.
silver_cities_table = "Silver_Data.cities.silver_cities_data"
df_cities_silver = spark.read.table(silver_cities_table)

StatementMeta(, ee0a703e-e8f0-4841-8c1c-30c5d3f1aeed, 10, Finished, Available, Finished)

In [9]:
# -------------------------------------------
# Data transformation to create dim_store
# -------------------------------------------

# Project the Silver columns onto the dim_store schema: every attribute is cast
# to string, columns are renamed where the dimension uses a different name, and
# a surrogate key is attached.

# The surrogate key is assigned by ranking rows on the natural key (store_id).
# monotonically_increasing_id() only guarantees unique, increasing values: they
# are not consecutive and depend on how the data is partitioned, so a full
# reload could hand the same store a different key. row_number() over an
# ordered window yields consecutive keys 1..N instead.
# NOTE(review): keys are only reproducible if store_id is unique and non-null
# in the Silver table - confirm.
# NOTE(review): a window without partitionBy pulls all rows into a single
# partition; presumably fine for a small store dimension - confirm row count.
store_key_window = Window.orderBy(col("store_id"))

df_stores_clean = df_cities_silver.select(
    # Surrogate key: consecutive, starting at 1. Cast to long so the column
    # keeps the type produced by the previous implementation.
    row_number().over(store_key_window).cast("long").alias("store_key"),

    # Natural key of the store - same name as in Silver
    col("store_id").cast("string"),

    # Type of the store - same name as in Silver
    col("storetype_id").cast("string"),

    # Size category of the store - same name as in Silver
    col("store_size").cast("string"),

    # City identifier - Silver column city_id_old is exposed as city_id
    col("city_id_old").cast("string").alias("city_id"),

    # Country - Silver column country_id is exposed as country_name
    col("country_id").cast("string").alias("country_name"),

    # City - Silver column city_code is exposed as city_name
    col("city_code").cast("string").alias("city_name"),
)

# Optional inspection of the result (kept disabled; uncomment to use):
# print("Schema of dim_store DataFrame:")
# df_stores_clean.printSchema()
# print("Sample of dim_store data:")
# df_stores_clean.show(5)

StatementMeta(, ee0a703e-e8f0-4841-8c1c-30c5d3f1aeed, 11, Finished, Available, Finished)

Schema of dim_store DataFrame:
root
 |-- store_key: long (nullable = false)
 |-- store_id: string (nullable = true)
 |-- storetype_id: string (nullable = true)
 |-- store_size: string (nullable = true)
 |-- city_id: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- city_name: string (nullable = true)

Sample of dim_store data:
+---------+--------+------------+----------+-------+------------+---------+
|store_key|store_id|storetype_id|store_size|city_id|country_name|city_name|
+---------+--------+------------+----------+-------+------------+---------+
|        1|   s0002|        st04|      39.0|   C007|      turkey|    adana|
|        2|   s0003|        st03|      17.0|   C014|      turkey| istanbul|
|        3|   s0005|        st04|      19.0|   C001|      turkey|  denizli|
|        4|   s0007|        st03|      16.0|   C014|      turkey| istanbul|
|        5|   s0010|        st04|      17.0|   C014|      turkey| istanbul|
+---------+--------+------------+------

In [11]:
# -------------------------------------------
# Write the cleaned data to the Gold layer
# -------------------------------------------
# Delta format with "overwrite" mode: each run replaces the table contents.
gold_writer = df_stores_clean.write.format("delta").mode("overwrite")
gold_writer.saveAsTable("Gold_Data.dim_stores.dim_stores")

StatementMeta(, ee0a703e-e8f0-4841-8c1c-30c5d3f1aeed, 13, Finished, Available, Finished)