In [57]:
from pyspark.sql.functions import col, concat_ws, to_timestamp, coalesce, lit, when
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, LongType, IntegerType
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.window import Window

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 59, Finished, Available, Finished)

### Popular Times (fact_observational_study)

In [58]:
# Load the staging table and every dimension used to build the fact table
stg_observational_study = spark.read.table("stg_observational_study")
dim_date = spark.read.table("dim_date")
dim_events = spark.read.table("dim_events")
dim_park_attributes = spark.read.table("dim_park_attributes")
dim_weather = spark.read.table("dim_weather")
dim_hour = spark.read.table("dim_hour")

# Preview only: limit in Spark first so just 3 rows are collected to the driver
# (toPandas() on the full table would pull every row before head() trims it).
display(stg_observational_study.limit(3).toPandas())

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 60, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 050eea89-1bdb-4bc7-acce-18d00d2d363d)

In [59]:
# Start the fact table: staged observations enriched with calendar attributes.
# Left join, so observations without a dim_date match are kept (with null date columns).
fact_observational_study = (
    stg_observational_study
    .join(
        dim_date,
        on=stg_observational_study["Date"] == dim_date["Date"],
        how="left",
    )
    .select(
        stg_observational_study["*"],
        *[
            dim_date[name]
            for name in ("DateKey", "Month", "DayOfWeek", "IsWeekend", "IsHoliday")
        ],
    )
)

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 61, Finished, Available, Finished)

In [60]:
# Preview only: limit in Spark first so just 3 rows are collected to the driver
display(fact_observational_study.limit(3).toPandas())

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 62, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e41ad8ba-f02d-45fc-abb9-358526a13796)

In [61]:
# Add the hour surrogate key and label from dim_hour.
# Left join, so rows whose Hour has no dim_hour match are kept (with null hour columns).
fact_observational_study = (
    fact_observational_study
    .join(
        dim_hour,
        on=fact_observational_study["Hour"] == dim_hour["Hour"],
        how="left",
    )
    .select(
        fact_observational_study["*"],
        *[dim_hour[name] for name in ("HourKey", "HourLabel")],
    )
)

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 63, Finished, Available, Finished)

In [62]:
# Join dim_events: an event matches when it is at the same place and the
# observation's DateTime falls inside [StartTime, EndTime] (both ends inclusive).
fact_observational_study = (
    fact_observational_study
    .join(
        dim_events,
        on=(
            (fact_observational_study["PlaceID"] == dim_events["PlaceID"])
            & (fact_observational_study["DateTime"] >= dim_events["StartTime"])
            & (fact_observational_study["DateTime"] <= dim_events["EndTime"])
        ),
        how="left",
    )
    .select(
        fact_observational_study["*"],
        # -1 marks "no matching event"
        coalesce(col("EventKey"), lit(-1)).alias("EventKey"),
        # 1/0 flag derived from whether the left join found an event
        when(col("EventKey").isNotNull(), 1).otherwise(0).alias("HasEvent"),
    )
)

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 64, Finished, Available, Finished)

In [63]:
# Preview only: limit in Spark first so just 3 rows are collected to the driver
display(fact_observational_study.limit(3).toPandas())

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 65, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 41070056-513a-49d8-8fa4-62fecd3fb45b)

In [64]:
# Attach weather readings by exact timestamp match against dim_weather.
fact_observational_study = (
    fact_observational_study
    .join(
        dim_weather,
        on=fact_observational_study["DateTime"] == dim_weather["Datetime"],
        how="left",
    )
    .select(
        fact_observational_study["*"],
        # -1 marks "no matching weather row"
        coalesce(col("WeatherKey"), lit(-1)).alias("WeatherKey"),
        col("Temp"),
        col("IsRaining"),
        col("IsSnowing"),
    )
)

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 66, Finished, Available, Finished)

In [65]:
# Preview only: limit in Spark first so just 3 rows are collected to the driver
display(fact_observational_study.limit(3).toPandas())

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 67, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e9ef7bf1-36b3-4845-bb04-18398e6f7dde)

In [66]:
# Attach park attributes by PlaceID from dim_park_attributes.
fact_observational_study = (
    fact_observational_study
    .join(
        dim_park_attributes,
        on=fact_observational_study["PlaceID"] == dim_park_attributes["PlaceID"],
        how="left",
    )
    .select(
        fact_observational_study["*"],
        # -1 marks "no matching park attributes row"
        coalesce(col("ParkKey"), lit(-1)).alias("ParkKey"),
        col("ParkSize"),
        col("PopHa"),
        col("BusStopHa"),
        col("Score"),
        col("Classification"),
    )
)

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 68, Finished, Available, Finished)

In [67]:
# Preview only: limit in Spark first so just 3 rows are collected to the driver
display(fact_observational_study.limit(3).toPandas())

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 69, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1f50001e-ca8d-4ea3-a955-e3c8276ad88d)

In [68]:
# Check for duplicates based on natural key
natural_key = ["ParkName", "DateKey", "Hour"]
dup_check = (
    fact_observational_study
    .groupBy(*natural_key)
    .agg(F.count("*").alias("row_count"))
    .filter("row_count > 1")
)
if dup_check.count() > 0:
    print("Found duplicate rows in fact table:")
    dup_check.show()
    initial_count = fact_observational_study.count()
    # dropDuplicates() keeps an arbitrary row per key, so the surviving duplicate could
    # differ between runs. Rank rows inside each natural key with an explicit ordering
    # and keep rank 1 so the choice is reproducible.
    # NOTE(review): descending key order is meant to prefer rows whose dimension keys
    # resolved over the -1 "unmatched" sentinel; this assumes real surrogate keys are
    # greater than -1 -- confirm. Rows that tie on all three keys are still picked
    # arbitrarily among themselves.
    keep_order = Window.partitionBy(*natural_key).orderBy(
        col("EventKey").desc(),
        col("WeatherKey").desc(),
        col("ParkKey").desc(),
    )
    fact_observational_study = (
        fact_observational_study
        .withColumn("_dup_rank", F.row_number().over(keep_order))
        .filter(col("_dup_rank") == 1)
        .drop("_dup_rank")
    )
    final_count = fact_observational_study.count()
    print(f"\nDropped {initial_count - final_count} duplicate rows (from {initial_count} to {final_count}).")
else:
    print("No duplicate rows in fact table.")

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 70, Finished, Available, Finished)

Found duplicate rows in fact table:
+------------+--------+----+---------+
|    ParkName| DateKey|Hour|row_count|
+------------+--------+----+---------+
|Tisdall Park|20250207|  12|        2|
+------------+--------+----+---------+


Dropped 1 duplicate rows (from 341 to 340).


In [69]:
# Check for unmatched dimension keys
# A single aggregation pass replaces three separate filter().count() actions, each of
# which would re-run the whole join lineage. count() over when(...) counts only rows
# where the condition holds (when() without otherwise() yields null elsewhere), and it
# returns 0 rather than null when nothing matches.
unmatched = fact_observational_study.agg(
    F.count(when(col("EventKey") == -1, 1)).alias("events"),
    F.count(when(col("WeatherKey") == -1, 1)).alias("weather"),
    F.count(when(col("ParkKey") == -1, 1)).alias("park_attributes"),
).first()
unmatched_events = unmatched["events"]
unmatched_weather = unmatched["weather"]
unmatched_park_attributes = unmatched["park_attributes"]

print(f"EventKey -1 count (no matching event): {unmatched_events}")
print(f"WeatherKey -1 count (no matching weather): {unmatched_weather}")
print(f"ParkKey -1 count (no matching park): {unmatched_park_attributes}")

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 71, Finished, Available, Finished)

EventKey -1 count (no matching event): 340
WeatherKey -1 count (no matching weather): 0
ParkKey -1 count (no matching park): 91


In [70]:
# Persist the fact table as a Delta table; "overwrite" mode replaces existing contents.
(
    fact_observational_study.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("fact_observational_study")
)

StatementMeta(, 936fd94b-8075-4687-931e-a3e251e9d600, 72, Finished, Available, Finished)