In [0]:
# Remove the gold fact table if it is present; IF EXISTS avoids an error when it is absent.
fact_episodes_drop_stmt = "DROP TABLE IF EXISTS workspace.gold.fact_episodes"
spark.sql(fact_episodes_drop_stmt)

In [0]:
# DDL for the gold-layer time dimension. Because of IF NOT EXISTS the
# statement leaves an already existing table untouched.
dim_time_ddl = """
CREATE TABLE IF NOT EXISTS workspace.gold.dim_time (
  date_key BIGINT,
  date TIMESTAMP,
  year INT,
  quarter INT,
  month INT,
  month_name STRING,
  day INT,
  day_name STRING,
  week_of_year BIGINT,
  is_weekend BOOLEAN

)
"""
spark.sql(dim_time_ddl)



In [0]:
import pandas as pd

def _to_naive_datetime(values):
    """Parse ``values`` into a timezone-naive ``datetime64`` Series.

    Entries that cannot be parsed become ``NaT``. Parsing with ``utc=True``
    and then dropping the timezone gives every input the same dtype, so a
    tz-naive ``airdate`` can be combined with an offset-bearing ``airstamp``
    without the result degrading to ``object`` dtype (which has no ``.dt``
    accessor). Inputs that are already naive keep their wall-clock value;
    offset-bearing inputs are expressed as UTC wall-clock time.
    """
    return pd.to_datetime(values, errors="coerce", utc=True).dt.tz_localize(None)


# Read the silver table into pandas (this collects the whole table on the driver).
df = spark.table("workspace.silver.tvmaze").toPandas()

# Work with a fixed set of columns; reindex creates any missing column filled with NaN.
fact = df.reindex(
    columns=[
        "episode_id",
        "show_id",
        "episode_name",
        "season",
        "episode_number",
        "episode_runtime",
        "episode_rating",
        "airdate",
        "airstamp",
        "show_network_id",
        "show_webchannel_id",
    ]
)

# Air date: use airdate and, where it is null, fall back to airstamp.
# Both sides are normalised to tz-naive datetimes first so the combined
# Series always supports the .dt accessor used below. A date taken from an
# offset-bearing airstamp is therefore the UTC calendar date.
dates = _to_naive_datetime(fact["airdate"]).combine_first(
    _to_naive_datetime(fact["airstamp"])
)

# Add the derived columns and keep only the final fact columns.
fact = (
    fact.assign(
        # yyyymmdd integer key; rows without any usable date get <NA>.
        date_key=dates.dt.strftime("%Y%m%d").astype("Int64"),
        # Prefer the broadcast network id, falling back to the web channel id.
        network_id=fact["show_network_id"].combine_first(fact["show_webchannel_id"]),
    )
    .reindex(
        columns=[
            "episode_id",
            "show_id",
            "episode_name",
            "season",
            "episode_number",
            "episode_runtime",
            "episode_rating",
            "date_key",
            "network_id",
        ]
    )
    # One row per episode: rows without an id are dropped and the first
    # occurrence of each id is kept.
    .dropna(subset=["episode_id"])
    .drop_duplicates(subset=["episode_id"])
)

# Convert back to a Spark DataFrame for the write step.
df_spark = spark.createDataFrame(fact)


In [0]:

# Overwrite mode is used so the existing data is replaced completely.
(
    df_spark.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable("workspace.gold.fact_episodes")
)