## Variables de entorno

In [0]:
# Remove every widget currently defined in this notebook; the widgets this
# notebook uses are created again in a later cell.
dbutils.widgets.removeAll()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F

In [0]:
# Declare the notebook's text widgets as (name, default value) pairs:
# the catalog plus the source and destination schemas.
for _widget_name, _widget_default in (
    ("CATALOGO", "principal"),
    ("ESQUEMA_SOURCE", "silver"),
    ("ESQUEMA_DEST", "golden"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

In [0]:
# Read the current widget values; they are used below to build the
# three-part table names "<catalogo>.<esquema>.<table>".
catalogo, esquema_source, esquema_dest = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOGO", "ESQUEMA_SOURCE", "ESQUEMA_DEST")
)


### Creación de modelo estrella

Para este proyecto usaré un pequeño modelo estrella (simulando mi ambiente laboral).

###### Dim_products

In [0]:
# Product dimension: a projection of the source products table in which
# ProductID is exposed as ProductKey; the other columns keep their names.
dim_products = spark.table(f"{catalogo}.{esquema_source}.products").selectExpr(
    "ProductID AS ProductKey",
    "ProductName",
    "QuantityPerUnit",
    "UnitPrice",
)

# Replace the destination table's contents with the freshly built dimension.
(
    dim_products.write
    .mode("overwrite")
    .saveAsTable(f"{catalogo}.{esquema_dest}.dim_products")
)


###### Dim_customers

In [0]:
# Customer dimension: the distinct CustomerID values found in the source
# orders table, exposed under the name CustomerKey.
dim_customers = (
    spark.table(f"{catalogo}.{esquema_source}.orders")
    .selectExpr("CustomerID AS CustomerKey")
    .distinct()
)

# Replace the destination table's contents with the freshly built dimension.
(
    dim_customers.write
    .mode("overwrite")
    .saveAsTable(f"{catalogo}.{esquema_dest}.dim_customers")
)


###### Dim_dates

In [0]:
df_orders = spark.table(f"{catalogo}.{esquema_source}.orders")

# Date dimension keyed by DateKey (OrderDate rendered as yyyyMMdd and cast to
# int), which is the same expression fact_sales uses for its DateKey.
#
# Rows are grouped by DateKey instead of being de-duplicated on the raw
# OrderDate value: if OrderDate carries a time-of-day component, several
# distinct OrderDate values map to the same DateKey, and a dimension with
# repeated keys would multiply fact rows when joined. min() picks one
# representative OrderDate per key deterministically.
# NOTE(review): OrderDate's type is not visible here; when it is a plain date
# this yields the same rows as de-duplicating on OrderDate.
dim_dates = (
    df_orders
    .select(
        "OrderDate",
        F.date_format("OrderDate", "yyyyMMdd").cast("int").alias("DateKey"),
    )
    .groupBy("DateKey")
    .agg(F.min("OrderDate").alias("OrderDate"))
    # Keep the original column order: OrderDate, DateKey, Year, Month.
    .select(
        "OrderDate",
        "DateKey",
        F.year("OrderDate").alias("Year"),
        F.month("OrderDate").alias("Month"),
    )
)

# Replace the destination table's contents with the freshly built dimension.
dim_dates.write.mode("overwrite").saveAsTable(f"{catalogo}.{esquema_dest}.dim_dates")


###### Dim_country


In [0]:
# Country dimension: the distinct ShipCountry values found in the source
# orders table, exposed under the name CountryKey.
dim_country = (
    spark.table(f"{catalogo}.{esquema_source}.orders")
    .selectExpr("ShipCountry AS CountryKey")
    .distinct()
)

# Replace the destination table's contents with the freshly built dimension.
(
    dim_country.write
    .mode("overwrite")
    .saveAsTable(f"{catalogo}.{esquema_dest}.dim_country")
)


###### fact_sales


In [0]:
# Source tables, aliased so columns can be referenced unambiguously after the
# join ("od" = order_details, "o" = orders).
df_od = spark.table(f"{catalogo}.{esquema_source}.order_details").alias("od")
df_o = spark.table(f"{catalogo}.{esquema_source}.orders").alias("o")
# products is no longer joined into fact_sales: no product column is selected
# (ProductKey comes from od.ProductID), so the join only added cost and would
# have duplicated fact rows if ProductID were ever repeated in products.
# The name is still defined in case other cells reference df_p.
df_p = spark.table(f"{catalogo}.{esquema_source}.products").alias("p")

# Sales fact table built from the order_details rows, enriched with the
# order-level attributes needed to reach the dimensions.
# NOTE(review): assumes OrderID is unique in orders, so the join does not
# multiply order_details rows -- confirm against the silver table.
fact_sales = (
    df_od
    # Left join: keep every order_details row even without a matching order.
    .join(df_o, "OrderID", "left")
    .select(
        # Foreign keys towards the dimensions
        F.col("od.OrderID").alias("OrderKey"),
        F.col("o.CustomerID").alias("CustomerKey"),
        # Same expression dim_dates uses to build its DateKey.
        F.date_format("o.OrderDate", "yyyyMMdd").cast("int").alias("DateKey"),
        F.col("od.ProductID").alias("ProductKey"),
        F.col("o.ShipCountry").alias("CountryKey"),

        # Metrics
        F.col("od.Quantity"),
        F.col("od.UnitPrice"),
        F.col("od.Discount"),
        # Line amount: Quantity * UnitPrice * (1 - Discount).
        (
            F.col("od.Quantity") * F.col("od.UnitPrice") * (1 - F.col("od.Discount"))
        ).alias("LineTotal"),
    )
)

# Replace the destination table's contents with the freshly built fact table.
fact_sales.write.mode("overwrite").saveAsTable(f"{catalogo}.{esquema_dest}.fact_sales")
