In [1]:
from pyspark.sql.functions import col, concat_ws, to_timestamp, coalesce, lit, when
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, LongType, IntegerType
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.window import Window

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 3, Finished, Available, Finished)

### Popular Times (fact_popular_times)

In [2]:
# Load the staging table and every dimension table that is joined into the fact table below.
stg_popular_times = spark.read.table("stg_popular_times")
dim_date = spark.read.table("dim_date")
dim_events = spark.read.table("dim_events")
dim_park_attributes = spark.read.table("dim_park_attributes")
dim_weather = spark.read.table("dim_weather")
dim_hour = spark.read.table("dim_hour")
dim_park_amenities_wide = spark.read.table("dim_park_amenities_wide")

# Preview only: apply the limit in Spark so just three rows are collected to the
# driver, instead of converting the whole table to pandas and slicing afterwards.
display(stg_popular_times.limit(3).toPandas())

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 74faa77c-d030-4254-b0c9-4fd197a3372c)

In [3]:
# Start the fact table: staging rows enriched with calendar attributes from dim_date.
# Left join, so staging rows without a matching Date are kept (their date columns are NULL).
date_join_condition = stg_popular_times["Date"] == dim_date["Date"]
date_attribute_cols = [
    dim_date[name]
    for name in ("DateKey", "MonthName", "DayName", "IsWeekend", "IsHoliday")
]
fact_popular_times = (
    stg_popular_times
    .join(dim_date, date_join_condition, "left")
    .select(stg_popular_times["*"], *date_attribute_cols)
)


StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 5, Finished, Available, Finished)

In [4]:
# Preview only: limit in Spark first so three rows (not the whole joined table) reach the driver.
display(fact_popular_times.limit(3).toPandas())

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, fe67a9ba-c2a6-4b4a-a5be-8729fdda5f2d)

In [5]:
# Attach the hour dimension's key and label to every fact row.
# Left join, so rows whose Hour has no dim_hour match are kept with NULL HourKey/HourLabel.
hour_join_condition = fact_popular_times["Hour"] == dim_hour["Hour"]
fact_popular_times = (
    fact_popular_times
    .join(dim_hour, hour_join_condition, "left")
    .select(
        fact_popular_times["*"],
        dim_hour["HourKey"],
        dim_hour["HourLabel"],
    )
)

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 7, Finished, Available, Finished)

In [6]:
# Join dim_events: match each fact row to events at the same place whose
# [StartTime, EndTime] window contains the row's DateTime.
# NOTE(review): a fact row that falls inside several overlapping events gets one
# output row per event - possibly a source of the duplicates handled later; confirm.
event_join_condition = (
    (fact_popular_times["PlaceID"] == dim_events["PlaceID"])
    & (fact_popular_times["DateTime"] >= dim_events["StartTime"])
    & (fact_popular_times["DateTime"] <= dim_events["EndTime"])
)
# Both derived columns read the raw (pre-coalesce) EventKey coming out of the join.
event_key = F.coalesce(F.col("EventKey"), F.lit(-1)).alias("EventKey")  # -1 = no event
has_event = F.when(F.col("EventKey").isNull(), 0).otherwise(1).alias("HasEvent")
fact_popular_times = (
    fact_popular_times
    .join(dim_events, event_join_condition, "left")
    .select(fact_popular_times["*"], event_key, has_event)
)

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 8, Finished, Available, Finished)

In [7]:
# Preview only: limit in Spark first so three rows (not the whole joined table) reach the driver.
display(fact_popular_times.limit(3).toPandas())

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5fd5d4db-caeb-44d3-9024-ee602a6daa8a)

In [8]:
# Attach weather observations by exact timestamp match (DateTime == Datetime).
# Left join; rows without a weather match get WeatherKey = -1 and NULL weather attributes.
weather_join_condition = fact_popular_times["DateTime"] == dim_weather["Datetime"]
weather_key = F.coalesce(F.col("WeatherKey"), F.lit(-1)).alias("WeatherKey")
fact_popular_times = (
    fact_popular_times
    .join(dim_weather, weather_join_condition, "left")
    .select(
        fact_popular_times["*"],
        weather_key,
        F.col("Temp"),
        F.col("IsRaining"),
        F.col("IsSnowing"),
    )
)

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 10, Finished, Available, Finished)

In [9]:
# Preview only: limit in Spark first so three rows (not the whole joined table) reach the driver.
display(fact_popular_times.limit(3).toPandas())

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 7a0f953d-bd0e-449d-a9dd-b44d45d3a4e3)

In [10]:
# Attach park descriptive attributes by place id.
# Left join; rows without a park match get ParkKey = -1 and NULL park attributes.
park_join_condition = fact_popular_times["PlaceId"] == dim_park_attributes["PlaceID"]
park_key = F.coalesce(F.col("ParkKey"), F.lit(-1)).alias("ParkKey")
fact_popular_times = (
    fact_popular_times
    .join(dim_park_attributes, park_join_condition, "left")
    .select(
        fact_popular_times["*"],
        park_key,
        F.col("ParkName"),
        F.col("Classification"),
        F.col("MaintenanceArea"),
    )
)

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 12, Finished, Available, Finished)

In [11]:
# Preview only: limit in Spark first so three rows (not the whole joined table) reach the driver.
display(fact_popular_times.limit(3).toPandas())

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5e686d07-e98a-4e17-82db-35c7c4095d14)

In [12]:
# Attach the amenity key from the wide amenities dimension by place id (left join).
# NOTE(review): unlike EventKey/WeatherKey/ParkKey in the earlier joins, AmenityKey is
# not coalesced to -1, so unmatched rows keep a NULL key - confirm this is intended.
amenity_join_condition = (
    fact_popular_times["PlaceId"] == dim_park_amenities_wide["PlaceID"]
)
fact_popular_times = (
    fact_popular_times
    .join(dim_park_amenities_wide, amenity_join_condition, "left")
    .select(fact_popular_times["*"], F.col("AmenityKey"))
)

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 14, Finished, Available, Finished)

In [13]:
# Preview only: limit in Spark first so three rows (not the whole joined table) reach the driver.
display(fact_popular_times.limit(3).toPandas())

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4d23b218-52e9-47c6-a919-0a4e45e77e16)

In [14]:
# Check for duplicates on the natural key (PlaceId, DateKey, Hour) and, if any exist,
# keep exactly one row per key.
natural_key = ["PlaceId", "DateKey", "Hour"]
dup_check = (
    fact_popular_times
    .groupBy(*natural_key)
    .agg(F.count("*").alias("row_count"))
    .filter("row_count > 1")
    .cache()  # evaluated by both count() and show() below; cache so the joins run once for it
)
if dup_check.count() > 0:
    print("Found duplicate rows in fact table:")
    dup_check.show()
    # Deduplicate deterministically. dropDuplicates() on a key subset keeps an arbitrary
    # row per key, so the saved table could differ between runs; row_number() over an
    # ordered window always keeps the same row.
    # NOTE(review): the tie-break order (lowest EventKey, then WeatherKey, ParkKey,
    # AmenityKey) is a deterministic but arbitrary choice - confirm which row should win.
    # Rows that tie on all four keys are still picked arbitrarily among themselves.
    dedup_window = Window.partitionBy(*natural_key).orderBy(
        "EventKey", "WeatherKey", "ParkKey", "AmenityKey"
    )
    initial_count = fact_popular_times.count()
    fact_popular_times = (
        fact_popular_times
        .withColumn("_row_num", F.row_number().over(dedup_window))
        .filter(F.col("_row_num") == 1)
        .drop("_row_num")
    )
    final_count = fact_popular_times.count()
    print(f"\nDropped {initial_count - final_count} duplicate rows (from {initial_count} to {final_count}).")
else:
    print("No duplicate rows in fact table.")
# Release the cached duplicate summary; dup_check stays usable and is recomputed on demand.
dup_check.unpersist()

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 16, Finished, Available, Finished)

Found duplicate rows in fact table:
+--------------------+--------+----+---------+
|             PlaceId| DateKey|Hour|row_count|
+--------------------+--------+----+---------+
|ChIJo-QmrYxxhlQRF...|20240714|   8|        2|
|ChIJo-QmrYxxhlQRF...|20240609|  13|        2|
|ChIJ_V-LvhZ0hlQRP...|20240908|  11|        2|
|ChIJnVFsezVyhlQRf...|20240811|  10|        2|
|ChIJdT38dmRxhlQRq...|20240928|   9|        2|
|ChIJdT38dmRxhlQRq...|20240818|  15|        2|
|ChIJdT38dmRxhlQRq...|20240804|  20|        2|
|ChIJdT38dmRxhlQRq...|20240928|  12|        2|
|ChIJo-QmrYxxhlQRF...|20241012|  21|        2|
|ChIJo-QmrYxxhlQRF...|20240714|  18|        2|
|ChIJdT38dmRxhlQRq...|20240818|  18|        2|
|ChIJdT38dmRxhlQRq...|20240928|  14|        2|
|ChIJ_V-LvhZ0hlQRP...|20240908|  18|        2|
|ChIJdT38dmRxhlQRq...|20240804|  19|        2|
|ChIJdT38dmRxhlQRq...|20240818|  16|        2|
|ChIJo-QmrYxxhlQRF...|20240907|   7|        2|
|ChIJnVFsezVyhlQRf...|20240811|  11|        2|
|ChIJo-QmrYxxhlQRF...|20

In [15]:
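# NOTE(review): disabled alternative to the dedup cell above, kept for reference only.
# Caveat: its window orders by PlaceId, which is also a partition column, so
# row_number() would still pick an arbitrary row within each duplicate group.
# # Check for duplicates based on natural key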
# dup_check = fact_popular_times.groupBy("PlaceId", "DateKey", "Hour").agg(F.count("*").alias("row_count")).filter("row_count > 1")
# if dup_check.count() > 0:
#     print("Found duplicate rows in fact table:")
#     dup_check.show()
#     # Deduplicate by keeping first
#     window = Window.partitionBy("PlaceId", "DateKey", "Hour").orderBy("PlaceId")
#     fact_popular_times = fact_popular_times.withColumn("row_num", F.row_number().over(window)).filter(col("row_num") == 1).drop("row_num")
#     print("Deduplicated fact table.")
# else:
#     print("No duplicate rows in fact table.")


StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 17, Finished, Available, Finished)

In [16]:
# Check for unmatched dimension keys (-1 is the placeholder written by the joins above).
# All three counts come from ONE aggregation, so the joined fact table is evaluated once
# rather than once per filter().count() action. count() ignores the NULLs that when()
# yields for non-matching rows, and returns 0 (not NULL) when nothing matches.
unmatched_counts = fact_popular_times.agg(
    F.count(when(col("EventKey") == -1, 1)).alias("unmatched_events"),
    F.count(when(col("WeatherKey") == -1, 1)).alias("unmatched_weather"),
    F.count(when(col("ParkKey") == -1, 1)).alias("unmatched_park_attributes"),
).first()
unmatched_events = unmatched_counts["unmatched_events"]
unmatched_weather = unmatched_counts["unmatched_weather"]
unmatched_park_attributes = unmatched_counts["unmatched_park_attributes"]

print(f"EventKey -1 count (no matching event): {unmatched_events}")
print(f"WeatherKey -1 count (no matching weather): {unmatched_weather}")
print(f"ParkKey -1 count (no matching park): {unmatched_park_attributes}")

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 18, Finished, Available, Finished)

EventKey -1 count (no matching event): 444359
WeatherKey -1 count (no matching weather): 1820
ParkKey -1 count (no matching park): 25752


In [17]:
# Persist the finished fact table as a Delta table, replacing any previous contents.
fact_writer = fact_popular_times.write.format("delta").mode("overwrite")
fact_writer.saveAsTable("fact_popular_times")

StatementMeta(, f38157da-12e2-45d7-878d-f778f8836a09, 19, Finished, Available, Finished)