In [2]:
import datetime as dt
import logging
from pyspark.sql import SparkSession, functions as F
from utils.connections.sql_server_connector import SQLServerConnector
from utils.trackers import setup_logging
from src.innova.config.config import etl_config

In [3]:
# Build the job logger from the logs path declared in the ETL configuration
# and record the start of the run.
logs_path = etl_config["paths"]["logs_path"]
logger = setup_logging(logs_path)
logger.info("▶ populate_dim_date_2023 iniciado")

# getOrCreate() returns the already-active Spark session when there is one,
# otherwise it starts a new session under this application name.
session_builder = SparkSession.builder.appName("populate_dim_date_2023")
spark = session_builder.getOrCreate()

In [4]:
# Fully qualified name of the target table: <schema>.dim_date.
DWH_SCHEMA: str = "dwh_innova"
TABLE_NAME: str = f"{DWH_SCHEMA}.dim_date"

# Inclusive calendar range to generate. These are `datetime.date` objects,
# not strings: downstream code subtracts them to obtain the number of days,
# which would fail for `str` values.
DATE_START: dt.date = dt.date(2023, 1, 1)
DATE_END: dt.date = dt.date(2023, 12, 31)

In [5]:
# Number of calendar days in the inclusive range DATE_START..DATE_END.
ndays = (DATE_END - DATE_START).days + 1

# Calendar date for each generated id: DATE_START shifted forward by `id` days.
full_date_col = F.expr(f"date_add('{DATE_START}', cast(id as int))")

# Derive every dimension attribute from the date expression in one projection;
# the aliases are listed in the column order the dataframe exposes.
df_dates = spark.range(ndays).select(
    # Integer surrogate key in yyyyMMdd form, e.g. 20230101.
    F.date_format(full_date_col, "yyyyMMdd").cast("int").alias("date_id"),
    full_date_col.alias("full_date"),
    F.dayofmonth(full_date_col).cast("tinyint").alias("day_of_month"),
    F.month(full_date_col).cast("tinyint").alias("month"),
    F.date_format(full_date_col, "MMMM").alias("month_name"),
    F.quarter(full_date_col).cast("tinyint").alias("quarter"),
    F.year(full_date_col).cast("smallint").alias("year"),
    # weekofyear uses ISO-8601 week numbering, so the first days of January
    # can belong to the last week of the previous year (2023-01-01 -> week 52).
    F.weekofyear(full_date_col).cast("tinyint").alias("week_of_year"),
    # dayofweek numbers days from 1 (Sunday) to 7 (Saturday).
    F.dayofweek(full_date_col).isin([1, 7]).cast("boolean").alias("is_weekend"),
)

# count() runs a Spark job; the result is only used for the log line.
row_count = df_dates.count()
logger.info(f"Generadas {row_count:,} filas para 2023")

In [6]:
# Preview the generated calendar rows (first 20 rows, long values truncated).
df_dates.show(n=20, truncate=True)

+--------+----------+------------+-----+----------+-------+----+------------+----------+
| date_id| full_date|day_of_month|month|month_name|quarter|year|week_of_year|is_weekend|
+--------+----------+------------+-----+----------+-------+----+------------+----------+
|20230101|2023-01-01|           1|    1|   January|      1|2023|          52|      true|
|20230102|2023-01-02|           2|    1|   January|      1|2023|           1|     false|
|20230103|2023-01-03|           3|    1|   January|      1|2023|           1|     false|
|20230104|2023-01-04|           4|    1|   January|      1|2023|           1|     false|
|20230105|2023-01-05|           5|    1|   January|      1|2023|           1|     false|
|20230106|2023-01-06|           6|    1|   January|      1|2023|           1|     false|
|20230107|2023-01-07|           7|    1|   January|      1|2023|           1|      true|
|20230108|2023-01-08|           8|    1|   January|      1|2023|           1|      true|
|20230109|2023-01-09|

In [7]:

# Append the generated calendar rows to the warehouse table.
# NOTE(review): the load uses mode="append" and the TRUNCATE below is disabled,
# so re-running this cell presumably inserts the same dates a second time
# (or fails on a key constraint) — confirm against the dim_date DDL before re-running.
write_options = {"batchsize": 1000}

with SQLServerConnector(spark, logger=logger) as sql_conn:
    # sql_conn.execute_sql(f"TRUNCATE TABLE {TABLE_NAME}")
    sql_conn.write_dataframe(
        df_dates,
        table=TABLE_NAME,
        mode="append",
        options=write_options,
    )

# Reached only when the write above did not raise.
logger.info("✅ dim_date 2023 cargada")
spark.stop()
