In [1]:
from pyspark.sql.functions import col, concat_ws, to_timestamp, coalesce, lit, when
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, LongType, IntegerType
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.window import Window

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 3, Finished, Available, Finished)

### **Live Times (fact_live_times)**

In [2]:
# Load data
stg_live_times = spark.read.table("stg_live_times")
dim_date = spark.read.table("dim_date")
dim_events = spark.read.table("dim_events")
dim_park_attributes = spark.read.table("dim_park_attributes")
dim_weather = spark.read.table("dim_weather")
dim_hour = spark.read.table("dim_hour")
dim_park_amenities_wide = spark.read.table("dim_park_amenities_wide")

display(stg_live_times.toPandas().head(3))

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c3b7cf70-be56-4819-a139-6a79ff8c175b)

In [3]:
from pyspark.sql.functions import min, max, to_date

# Ensure DateTime is cast to date if needed
print("Date Range in dim_date:")
dim_date.select(
    min(to_date("Date")).alias("MinDate"),
    max(to_date("Date")).alias("MaxDate")
).show()

# Ensure DateTime is cast to date if needed
print("Date Range in dim_weather:")
dim_weather.select(
    min(to_date("DateTime")).alias("MinDate"),
    max(to_date("DateTime")).alias("MaxDate")
).show()

# Ensure DateTime is cast to date if needed
print("Date Range in stg_live_times:")
stg_live_times.select(
    min(to_date("DateTimeCollected")).alias("MinDate"),
    max(to_date("DateTimeCollected")).alias("MaxDate")
).show()

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 5, Finished, Available, Finished)

Date Range in dim_date:
+----------+----------+
|   MinDate|   MaxDate|
+----------+----------+
|2024-04-30|2030-12-31|
+----------+----------+

Date Range in dim_weather:
+----------+----------+
|   MinDate|   MaxDate|
+----------+----------+
|2024-05-01|2025-05-22|
+----------+----------+

Date Range in stg_live_times:
+----------+----------+
|   MinDate|   MaxDate|
+----------+----------+
|2024-04-30|2025-05-15|
+----------+----------+



In [5]:
from pyspark.sql.functions import to_date

# Join stg_live_times + dim_date
fact_live_times = stg_live_times.join(
    dim_date,
    to_date(stg_live_times["DateTimeCollected"]) == dim_date["Date"],
    "left"
).select(
    stg_live_times["*"],
    dim_date["DateKey"],
    dim_date["IsWeekend"],
    dim_date["IsHoliday"],
    dim_date["DayOfWeek"]
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 7, Finished, Available, Finished)

In [6]:
# Join fact_live_times + dim_hour
fact_live_times = fact_live_times.join(
    dim_hour,
    fact_live_times["Hour"] == dim_hour["Hour"],
    "left"
).select(
    fact_live_times["*"],
    dim_hour["HourKey"],
    dim_hour["HourLabel"]
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 8, Finished, Available, Finished)

In [7]:
# Join dim_weather
fact_live_times = fact_live_times.join(
    dim_weather,
    fact_live_times["Datetime"] == dim_weather["Datetime"],
    how="left"
).withColumn(
    "WeatherKey",
    coalesce(col("WeatherKey"), lit(-1))
).select(
    fact_live_times["*"],
    col("WeatherKey"),
    col("Temp"),
    col("IsRaining"),
    col("IsSnowing")
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 9, Finished, Available, Finished)

In [8]:
# Join dim_park_attributes
fact_live_times = fact_live_times.join(
    dim_park_attributes,
    fact_live_times["PlaceId"] == dim_park_attributes["PlaceID"],
    "left"
).withColumn(
    "ParkKey",
    coalesce(col("ParkKey"), lit(-1))
).select(
    fact_live_times["*"],
    col("ParkKey"),
    col("ParkName"),
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 10, Finished, Available, Finished)

In [9]:
# Join dim_park_amenities_wide
fact_live_times = fact_live_times.join(
    dim_park_amenities_wide,
    fact_live_times["PlaceId"] == dim_park_amenities_wide["PlaceID"],
    how="left"
).select(
    fact_live_times["*"], 
    col("AmenityKey")
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 11, Finished, Available, Finished)

In [10]:
# Join dim_events
fact_live_times = fact_live_times.join(
    dim_events,
    (fact_live_times["PlaceID"] == dim_events["PlaceID"]) &
    (fact_live_times["DateTime"] >= dim_events["StartTime"]) &
    (fact_live_times["DateTime"] <= dim_events["EndTime"]),
    "left"
).withColumn(
    "HasEvent",
    when(col("EventKey").isNotNull(), 1).otherwise(0)
).withColumn(
    "EventKey",
    coalesce(col("EventKey"), lit(-1))
).select(
    fact_live_times["*"],
    col("EventKey"),
    col("HasEvent")
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 12, Finished, Available, Finished)

In [11]:
from pyspark.sql.functions import col

# Check number of Event
fact_live_times.groupBy("HasEvent").count().show()

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 13, Finished, Available, Finished)

+--------+------+
|HasEvent| count|
+--------+------+
|       1|  1730|
|       0|384803|
+--------+------+



In [12]:
# Check for duplicates based on natural key
dup_check = fact_live_times.groupBy("PlaceId", "DateKey", "Hour").agg(F.count("*").alias("row_count")).filter("row_count > 1")
if dup_check.count() > 0:
    print("Found duplicate rows in fact table:")
    dup_check.show()
    # Deduplicate by keeping first
    initial_count = fact_live_times.count()
    fact_popular_times = fact_live_times.dropDuplicates(["PlaceId", "DateKey", "Hour"])
    final_count = fact_popular_times.count()
    print(f"\nDropped {initial_count - final_count} duplicate rows (from {initial_count} to {final_count}).")
else:
    print("No duplicate rows in fact table.")

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 14, Finished, Available, Finished)

Found duplicate rows in fact table:
+--------------------+--------+----+---------+
|             PlaceId| DateKey|Hour|row_count|
+--------------------+--------+----+---------+
|ChIJdT38dmRxhlQRq...|20240818|  15|        2|
|ChIJdT38dmRxhlQRq...|20240804|  20|        2|
|ChIJF391QrBzhlQRw...|20250424|   7|        2|
|ChIJdT38dmRxhlQRq...|20240928|   9|        2|
|ChIJpcthL9tzhlQRc...|20250424|   7|        2|
|ChIJqe3omWVxhlQRn...|20250424|   7|        2|
|ChIJo-QmrYxxhlQRF...|20241012|  21|        2|
|ChIJo-QmrYxxhlQRF...|20240714|  18|        2|
|ChIJdT38dmRxhlQRq...|20240928|  12|        2|
|ChIJBzAe6bV2hlQRz...|20250424|   7|        2|
|ChIJnVFsezVyhlQRf...|20240811|  10|        2|
|ChIJMVTzBthzhlQRW...|20250424|   7|        2|
|ChIJ_V-LvhZ0hlQRP...|20240908|  11|        2|
|ChIJLQnMgZJyhlQRn...|20250424|   7|        2|
|ChIJP2cAOltxhlQRO...|20250424|   7|        2|
|ChIJP8TmK392hlQRs...|20250424|   7|        2|
|ChIJq74VcLx2hlQRq...|20250424|   7|        2|
|ChIJo-QmrYxxhlQRF...|20

In [14]:
# Check for unmatched dimension keys
unmatched_events = fact_live_times.filter(col("EventKey") == -1).count()
unmatched_weather = fact_live_times.filter(col("WeatherKey") == -1).count()
unmatched_park_attributes = fact_live_times.filter(col("ParkKey") == -1).count()

print(f"EventKey -1 count (no matching event): {unmatched_events}")
print(f"WeatherKey -1 count (no matching weather): {unmatched_weather}")
print(f"ParkKey -1 count (no matching park): {unmatched_park_attributes}")

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 16, Finished, Available, Finished)

EventKey -1 count (no matching event): 384803
WeatherKey -1 count (no matching weather): 1598
ParkKey -1 count (no matching park): 24814


In [15]:
from pyspark.sql.functions import crc32, concat_ws, col

fact_live_times = fact_live_times.withColumn(
    "FactLiveKey",
    crc32(concat_ws("_", col("ParkKey").cast("string"), col("DateKey").cast("string"), col("HourKey").cast("string")))
)

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 17, Finished, Available, Finished)

In [17]:
# Save fact table
fact_live_times.write.format("delta").mode("overwrite").saveAsTable("fact_live_times")

StatementMeta(, 0e5bcb59-e88c-4a5b-99f5-cdc4ac04c1cc, 19, Finished, Available, Finished)