In [3]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    date_add,
    col,
    date_format,
    dayofweek,
    dayofmonth,
    dayofyear,
    weekofyear,
    month,
    year,
    quarter,
    when,
    expr
)
from pyspark.sql.functions import coalesce
from datetime import date, timedelta

# Initialize Spark Session (if not already initialized by Fabric)
spark = SparkSession.builder.getOrCreate()

# Define Lakehouse and Schema details
lakehouse_name = "Gold_Data"
schema_name = "dim_date"
table_name = "dim_date"

#  start and end dates for your dimension table
start_date = date(2000, 1, 1) #  Start date
end_date = date(2030, 12, 31) # End date

# Generate a list of dates
delta = end_date - start_date
dates = [start_date + timedelta(days=i) for i in range(delta.days + 1)]

# Create a Spark DataFrame from the list of dates
initial_df = spark.createDataFrame([(d,) for d in dates], ["full_date"])

# Cast full_date to DATE type
initial_df = initial_df.withColumn("full_date", col("full_date").cast("date"))

# Generate the dim_date columns based on the provided schema
initial_df = initial_df.withColumn("date_id", date_format(col("full_date"), "yyyyMMdd").cast("int")) \
       .withColumn("day_of_week", dayofweek(col("full_date"))) \
       .withColumn("day_name", date_format(col("full_date"), "EEEE")) \
       .withColumn("day_of_month", dayofmonth(col("full_date"))) \
       .withColumn("day_of_year", dayofyear(col("full_date"))) \
       .withColumn("week_of_year", weekofyear(col("full_date"))) \
       .withColumn("month", month(col("full_date"))) \
       .withColumn("month_name", date_format(col("full_date"), "MMMM")) \
       .withColumn("quarter", quarter(col("full_date"))) \
       .withColumn("year", year(col("full_date"))) \
       .withColumn("is_weekend", \
                   when((dayofweek(col("full_date")) == 1) | (dayofweek(col("full_date")) == 7), True) \
                   .otherwise(False)) \
       .withColumn("is_holiday", expr("false").cast("boolean"))

StatementMeta(, 3a9ac553-50c9-4666-affb-67b1361a5c36, 5, Finished, Available, Finished)

In [4]:
# Turkish retail-related holidays for 2017, 2018, and 2019
holidays_turkey = [
    # 2017 Holidays
    ('2017-01-01', 'New Year\'s Day'),
    ('2017-04-23', 'National Sovereignty and Children\'s Day'),
    ('2017-05-01', 'Labor and Solidarity Day'),
    ('2017-05-19', 'Commemoration of Atatürk, Youth and Sports Day'),
    ('2017-06-25', 'Ramadan Feast - Day 1'),
    ('2017-06-26', 'Ramadan Feast - Day 2'),
    ('2017-06-27', 'Ramadan Feast - Day 3'),
    ('2017-07-15', 'Democracy and National Unity Day'),
    ('2017-08-30', 'Victory Day'),
    ('2017-09-01', 'Sacrifice Feast - Day 1'),
    ('2017-09-02', 'Sacrifice Feast - Day 2'),
    ('2017-09-03', 'Sacrifice Feast - Day 3'),
    ('2017-09-04', 'Sacrifice Feast - Day 4'),
    ('2017-10-29', 'Republic Day'),

    # 2018 Holidays
    ('2018-01-01', 'New Year\'s Day'),
    ('2018-04-23', 'National Sovereignty and Children\'s Day'),
    ('2018-05-01', 'Labor and Solidarity Day'),
    ('2018-05-19', 'Commemoration of Atatürk, Youth and Sports Day'),
    ('2018-06-15', 'Ramadan Feast - Day 1'),
    ('2018-06-16', 'Ramadan Feast - Day 2'),
    ('2018-06-17', 'Ramadan Feast - Day 3'),
    ('2018-07-15', 'Democracy and National Unity Day'),
    ('2018-08-21', 'Sacrifice Feast - Day 1'),
    ('2018-08-22', 'Sacrifice Feast - Day 2'),
    ('2018-08-23', 'Sacrifice Feast - Day 3'),
    ('2018-08-24', 'Sacrifice Feast - Day 4'),
    ('2018-08-30', 'Victory Day'),
    ('2018-10-29', 'Republic Day'),

    # 2019 Holidays
    ('2019-01-01', 'New Year\'s Day'),
    ('2019-04-23', 'National Sovereignty and Children\'s Day'),
    ('2019-05-01', 'Labor and Solidarity Day'),
    ('2019-05-19', 'Commemoration of Atatürk, Youth and Sports Day'),
    ('2019-06-04', 'Ramadan Feast - Day 1'),
    ('2019-06-05', 'Ramadan Feast - Day 2'),
    ('2019-06-06', 'Ramadan Feast - Day 3'),
    ('2019-07-15', 'Democracy and National Unity Day'),
    ('2019-08-11', 'Sacrifice Feast - Day 1'),
    ('2019-08-12', 'Sacrifice Feast - Day 2'),
    ('2019-08-13', 'Sacrifice Feast - Day 3'),
    ('2019-08-14', 'Sacrifice Feast - Day 4'),
    ('2019-08-30', 'Victory Day'),
    ('2019-10-29', 'Republic Day')
]


StatementMeta(, 3a9ac553-50c9-4666-affb-67b1361a5c36, 6, Finished, Available, Finished)

In [5]:
from pyspark.sql.functions import lit, coalesce, sum, when, col, expr

# 1. Build holidays_df with a unique flag name to avoid ambiguity
holidays_df = (
    spark.createDataFrame(holidays_turkey, ["full_date", "holiday_name"])
         .withColumn("full_date", col("full_date").cast("date"))
         .withColumn("holiday_flag", lit(True))      # renamed flag
)

# 2. Merge into dim_date and finalise `is_holiday`
dim_date = (
    initial_df.join(holidays_df.select("full_date", "holiday_flag"), on="full_date", how="left")
      .withColumn("is_holiday", coalesce(col("holiday_flag"), lit(False)))  # True where holiday, else False
      .drop("holiday_flag")
)


StatementMeta(, 3a9ac553-50c9-4666-affb-67b1361a5c36, 7, Finished, Available, Finished)

In [6]:
# Calculate business_day_offset_5_date
# This is a bit more complex as it requires checking for weekends.
# We'll use a UDF or an iterative approach for more robust business day calculations if needed.

dim_date = dim_date.withColumn("business_day_offset_5_date", date_add(col("full_date"), -5))

# Calculate week_start_date (Monday) and week_end_date (Sunday)
# Spark's dayofweek returns 1 for Sunday, 2 for Monday, ..., 7 for Saturday.
# To get Monday: (day_of_week - 2 + 7) % 7 days prior to get to Monday.
dim_date = dim_date.withColumn("week_start_date",
                   expr("date_sub(full_date, (dayofweek(full_date) + 5) % 7)")) \
       .withColumn("week_end_date",
                   expr("date_add(full_date, (7 - dayofweek(full_date)))"))





StatementMeta(, 3a9ac553-50c9-4666-affb-67b1361a5c36, 8, Finished, Available, Finished)

In [7]:
# --- Calculate Weekly Aggregated Features (Holidays and Weekends) ---

# First, create a temporary DataFrame with weekly summaries
weekly_summary_df = dim_date \
    .groupBy("week_start_date") \
    .agg(
        # Count the number of holidays in each week
        sum(when(col("is_holiday") == True, 1).otherwise(0)).alias("num_holidays_in_week"),
        # Create a boolean flag if the week has at least one holiday
        (sum(when(col("is_holiday") == True, 1).otherwise(0)) > 0).alias("has_holiday_in_week"),
        # Count the number of weekend days in each week
        sum(when(col("is_weekend") == True, 1).otherwise(0)).alias("num_weekend_days_in_week")
    )

# Join these weekly summaries back to the main dim_date DataFrame
# We join on 'week_start_date' which effectively broadcasts the weekly summary to each day within that week
dim_date = dim_date.join(weekly_summary_df, on="week_start_date", how="left")

# --- Now, include these new columns in your final select statement ---
dim_date = dim_date.select(
    col("date_id").cast("int"),
    col("full_date").cast("date"),
    col("day_of_week").cast("int"),
    col("day_name").cast("string"),
    col("day_of_month").cast("int"),
    col("day_of_year").cast("int"),
    col("week_of_year").cast("int"),
    col("month").cast("int"),
    col("month_name").cast("string"),
    col("quarter").cast("int"),
    col("year").cast("int"),
    col("is_weekend").cast("boolean"),
    col("is_holiday").cast("boolean"),
    col("business_day_offset_5_date").cast("date"),
    col("week_start_date").cast("date"),
    col("week_end_date").cast("date"),
    # New columns added below:
    col("num_holidays_in_week").cast("int"),
    col("has_holiday_in_week").cast("boolean"),
    col("num_weekend_days_in_week").cast("int")
)

# Display a sample of the generated data 
print("Sample of the generated dim_date data:")
dim_date.show(5)
dim_date.printSchema()


StatementMeta(, 3a9ac553-50c9-4666-affb-67b1361a5c36, 9, Finished, Available, Finished)

Sample of the generated dim_date data:
+--------+----------+-----------+---------+------------+-----------+------------+-----+----------+-------+----+----------+----------+--------------------------+---------------+-------------+--------------------+-------------------+------------------------+
| date_id| full_date|day_of_week| day_name|day_of_month|day_of_year|week_of_year|month|month_name|quarter|year|is_weekend|is_holiday|business_day_offset_5_date|week_start_date|week_end_date|num_holidays_in_week|has_holiday_in_week|num_weekend_days_in_week|
+--------+----------+-----------+---------+------------+-----------+------------+-----+----------+-------+----+----------+----------+--------------------------+---------------+-------------+--------------------+-------------------+------------------------+
|20000106|2000-01-06|          5| Thursday|           6|          6|           1|    1|   January|      1|2000|     false|     false|                2000-01-01|     2000-01-03|   2000-01-08|

In [8]:
# This will create or overwrite the table 'dim_date' within the 'dim_date' schema in your Lakehouse
dim_date.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable(f"`{lakehouse_name}`.`{schema_name}`.`{table_name}`")

print(f"dim_date table successfully created/updated in Lakehouse '{lakehouse_name}', Schema '{schema_name}'.")

StatementMeta(, 3a9ac553-50c9-4666-affb-67b1361a5c36, 10, Finished, Available, Finished)

dim_date table successfully created/updated in Lakehouse 'Gold_Data', Schema 'dim_date'.
