In [2]:
from pyspark.sql.functions import crc32, col, lit, when, sha2, concat_ws, count, row_number, format_string, min as spark_min
from pyspark.sql.types import StringType, DoubleType, LongType, IntegerType
import pandas as pd
import builtins  

StatementMeta(, e651ca1b-3861-41e5-83a8-851536b15ca9, 4, Finished, Available, Finished)

In [5]:
def deduplicate(df, key_col):
    # Check for duplicates
    dups = df.groupBy(key_col).agg(count("*").alias("count")).filter(col("count") > 1)
    dup_count = dups.count()

    if dup_count > 0:
        print(f"Found {dup_count} duplicate {key_col} values. Deduplicating using dropDuplicates...")
        df = df.dropDuplicates([key_col])
    else:
        print(f"No duplicate {key_col} values found.")

    return df

StatementMeta(, 8dad4b53-917c-4ced-b196-81aa67d6d332, 7, Finished, Available, Finished)

### Park Amenities - Wide (dim_park_amenities_wide)

In [7]:
# Load data
stg_park_amenities_wide = spark.read.table("stg_park_amenities_wide")

# Create AmenityKey using ParkName + Amenity (handles NULL PlaceID cases)
dim_park_amenities_wide = stg_park_amenities_wide.withColumn(
    "AmenityKey", crc32(concat_ws("||", col("ParkName"), col("PlaceID")))
)

# Reorder for readability
dim_park_amenities_wide = dim_park_amenities_wide.select(
    "AmenityKey", *[c for c in stg_park_amenities_wide.columns]
)

# Check for duplicate keys
dim_park_amenities_wide = deduplicate(dim_park_amenities_wide, "AmenityKey")

# Write to Delta table
dim_park_amenities_wide.write.format("delta").mode("overwrite").saveAsTable("dim_park_amenities_wide")

display(dim_park_amenities_wide.toPandas().head(3))

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 9, Finished, Available, Finished)

No duplicate AmenityKey values found.


SynapseWidget(Synapse.DataFrame, fc9a29d4-e3ef-440f-892e-a72f0cf470fb)

### Park Attributes (dim_park_attributes)

In [None]:
# Load data
stg_park_attributes = spark.read.table("stg_park_attributes")

# Drop duplicates based on PlaceID to ensure 1 row per park
dim_park_attributes = stg_park_attributes.dropDuplicates(["PlaceID"])

# Create ParkKey using crc32 (numeric surrogate key)
dim_park_attributes = dim_park_attributes.withColumn(
    "ParkKey", crc32(col("PlaceID"))
)

# Reorder columns for readability (ParkKey first)
dim_park_attributes = dim_park_attributes.select(
    "ParkKey", *[c for c in dim_park_attributes.columns if c != "ParkKey"]
)

In [13]:
display(dim_park_attributes.toPandas().head(3))

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e6701ca0-cd1d-4047-81ad-7d53582043fe)

In [14]:
# Create a DataFrame for the Unknown (-1) row
unknown_row = spark.createDataFrame(
    [
        (-1, "Unknown", "Unknown", None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
    ],
    dim_park_attributes.schema
)

# Append the Unknown row
dim_park_attributes = dim_park_attributes.unionByName(unknown_row)

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 16, Finished, Available, Finished)

In [15]:
# Check for duplicate keys
dim_park_attributes = deduplicate(dim_park_attributes, "ParkKey")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 17, Finished, Available, Finished)

No duplicate ParkKey values found.


In [16]:
# Write to Delta table
dim_park_attributes.write.format("delta").mode("overwrite").saveAsTable("dim_park_attributes")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 18, Finished, Available, Finished)

In [17]:
display(dim_park_attributes.toPandas().head(3))

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 19, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d53b02d5-6450-49c0-8b30-4d844abf0132)

### Park Amenities (dim_park_amenities)

In [2]:
from pyspark.sql.functions import col, concat_ws, crc32, coalesce, lit

# Load tables
stg_park_amenities = spark.read.table("stg_park_amenities").alias("amenities")
dim_park_attributes = spark.read.table("dim_park_attributes").alias("attributes")

# Add AmenityKey
dim_park_amenities = stg_park_amenities.withColumn(
    "AmenityKey", crc32(concat_ws("||", col("ParkName"), col("Amenity")))
)

# Join using qualified names to avoid ambiguity
dim_park_amenities = dim_park_amenities.alias("amenities").join(
    dim_park_attributes.select(col("PlaceID").alias("attr_PlaceID"), col("ParkKey")),
    col("amenities.PlaceID") == col("attr_PlaceID"),
    how="left"
).withColumn(
    "ParkKey", coalesce(col("ParkKey"), lit(-1))
)

# Reorder columns
final_cols = ["AmenityKey", "ParkKey"] + stg_park_amenities.columns
dim_park_amenities = dim_park_amenities.select(*final_cols)

StatementMeta(, 8dad4b53-917c-4ced-b196-81aa67d6d332, 4, Finished, Available, Finished)

In [8]:
# # Load data
# stg_park_amenities = spark.read.table("stg_park_amenities")

# # Create AmenityKey using ParkName + Amenity (handles NULL PlaceID cases)
# dim_park_amenities = stg_park_amenities.withColumn(
#     "AmenityKey", crc32(concat_ws("||", col("ParkName"), col("Amenity")))
# )

# # Reorder for readability
# dim_park_amenities = dim_park_amenities.select(
#     "AmenityKey", *[c for c in stg_park_amenities.columns]
# )

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 10, Finished, Available, Finished)

In [6]:
# Check for duplicate keys
dim_park_amenities = deduplicate(dim_park_amenities, "AmenityKey")

StatementMeta(, 8dad4b53-917c-4ced-b196-81aa67d6d332, 8, Finished, Available, Finished)

No duplicate AmenityKey values found.


In [8]:
# Write to Delta table
dim_park_amenities.write.format("delta").mode("overwrite").saveAsTable("dim_park_amenities")

StatementMeta(, 8dad4b53-917c-4ced-b196-81aa67d6d332, 10, Finished, Available, Finished)

In [7]:
display(dim_park_amenities.toPandas().head(3))

StatementMeta(, 8dad4b53-917c-4ced-b196-81aa67d6d332, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 9b17feb6-1448-4206-9ba9-a591289b2c6f)

## Amenity Slicer

In [3]:
# Read the dim_park_amenities table
dim_park_amenities = spark.read.table("dim_park_amenities").filter(col("ParkKey") != -1)

# Select distinct Amenity values
dim_amenity_slicer = dim_park_amenities.select("Amenity").distinct().orderBy("Amenity")

# Optional: Drop nulls if needed
dim_amenity_slicer = dim_amenity_slicer.filter(col("Amenity").isNotNull())

# Save as Delta Table
dim_amenity_slicer.write.format("delta").mode("overwrite").saveAsTable("dim_amenity_slicer")

StatementMeta(, e651ca1b-3861-41e5-83a8-851536b15ca9, 5, Finished, Available, Finished)

### Date (dim_date)

In [16]:
# Load data
stg_live_times = spark.read.table("stg_live_times")
stg_popular_times = spark.read.table("stg_popular_times")

# Get minimum datetime from each table
min_live = stg_live_times.select(spark_min("Datetime").alias("min_time")).collect()[0]["min_time"]
min_popular = stg_popular_times.select(spark_min("Datetime").alias("min_time")).collect()[0]["min_time"]

# Get the earliest of the two
start_date = builtins.min(min_live, min_popular).date()  # convert to datetime.date for compatibility

# Create the date range
date_range = pd.date_range(start=start_date, end="2030-12-31", freq="D")
df = pd.DataFrame({"Date": date_range})

StatementMeta(, 4830c86d-677f-4fdf-be1b-0677ec6fe41a, 18, Finished, Available, Finished)

In [18]:
# Create the date dimension DataFrame
#date_range = pd.date_range(start="2024-01-01", end="2030-12-31", freq="D")
#df = pd.DataFrame({"Date": date_range})
df["DateKey"] = df["Date"].dt.strftime("%Y%m%d").astype(int)
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["MonthName"] = df["Date"].dt.month_name()
df["MonthAbbr"] = df["Date"].dt.strftime("%b")
df["Day"] = df["Date"].dt.day
df["DayName"] = df["Date"].dt.day_name()
df["DayOfWeek"] = df["Date"].dt.dayofweek + 1 # to start at 1
df["WeekNumber"] = df["Date"].dt.isocalendar().week
df["Quarter"] = df["Date"].dt.quarter
df["IsWeekend"] = df["DayName"].isin(["Saturday", "Sunday"]).astype(int)

# Add current indicator
today = pd.Timestamp.today().normalize()             # midnight today
df["CurrentInd"] = (df["Date"] <= today).astype(int) # 1 = past/today, 0 = future

# Convert to Spark
dim_date = spark.createDataFrame(df)

# Load stg_holidays
stg_holidays = spark.read.table("stg_holidays")

# Ensure holiday column is date type
stg_holidays = stg_holidays.withColumn("Holidays", col("Holidays").cast("date"))

# Join to add IsHoliday
dim_date = dim_date.join(
    stg_holidays.select(col("Holidays").alias("HolidayDate")),
    dim_date["Date"] == col("HolidayDate"),
    how="left"
).withColumn(
    "IsHoliday",
    when(col("HolidayDate").isNotNull(), lit(1)).otherwise(lit(0))
).drop("HolidayDate")

StatementMeta(, 4830c86d-677f-4fdf-be1b-0677ec6fe41a, 20, Finished, Available, Finished)

  [UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION] uint32 is not supported in conversion to Arrow.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


In [19]:
# Check for duplicate keys
dim_date = deduplicate(dim_date, "DateKey")

StatementMeta(, 4830c86d-677f-4fdf-be1b-0677ec6fe41a, 21, Finished, Available, Finished)

No duplicate DateKey values found.


In [20]:
# Save to table
dim_date.write.format("delta").mode("overwrite").saveAsTable("dim_date")

StatementMeta(, 4830c86d-677f-4fdf-be1b-0677ec6fe41a, 22, Finished, Available, Finished)

In [21]:
display(dim_date.toPandas().head(3))

StatementMeta(, 4830c86d-677f-4fdf-be1b-0677ec6fe41a, 23, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 9c419672-6da9-4ddd-9491-d1a486028e44)

### Hour (dim_hour)

In [22]:
# Create base hour DataFrame (0 to 23)
df_hour = pd.DataFrame({"Hour": range(24)})
df_hour["HourKey"] = df_hour["Hour"]  # Same as Hour
df_hour["HourLabel"] = df_hour["Hour"].apply(lambda x: f"{x:02d}:00")

# Add period labels
def get_period(hour):
    if 0 <= hour < 6:
        return "Night"
    elif 6 <= hour < 12:
        return "Morning"
    elif 12 <= hour < 18:
        return "Afternoon"
    else:
        return "Evening"

df_hour["HourPeriod"] = df_hour["Hour"].apply(get_period)

# Convert to Spark DataFrame
dim_hour = spark.createDataFrame(df_hour)

# Deduplicate HourKey just in case
dim_hour = dim_hour.dropDuplicates(["HourKey"])

# Save as Delta table
dim_hour.write.format("delta").mode("overwrite").saveAsTable("dim_hour")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 24, Finished, Available, Finished)

In [23]:
display(dim_hour.toPandas().head(3))

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 25, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 05049680-c9d2-4cda-98a2-62fdffe36c5f)

### Event (dim_event)

In [24]:
# Load data
stg_events = spark.read.table("stg_events")

# Generate EventKey
dim_events = stg_events.withColumn(
    "EventKey",
    crc32(concat_ws("_", "ParkName", "StartTime", "EndTime"))
)

# Rearrange columns
dim_events = dim_events.select(
    "EventKey", "ParkName", "PlaceID", "StartTime", "EndTime", "EventName"
)

# Create -1 "Unknown Event" 
no_event_row = spark.createDataFrame([
    (-1, "Unknown", "Unknown", None, None, "No Event")
], dim_events.schema)

# Union with main dim_events
dim_events = dim_events.unionByName(no_event_row)

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 26, Finished, Available, Finished)

In [25]:
# Check for duplicate keys
dim_events = deduplicate(dim_events, "EventKey")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 27, Finished, Available, Finished)

No duplicate EventKey values found.


In [26]:
# Save to table
dim_events.write.format("delta").mode("overwrite").saveAsTable("dim_events")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 28, Finished, Available, Finished)

In [27]:
display(dim_events.toPandas().head(3))

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 29, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1b53dc93-30c5-4c88-b393-d6f9a2841fa3)

### Weather (dim_weather)

In [28]:
# Load data
stg_weather = spark.read.table("stg_weather_merged")
display(stg_weather.toPandas().head(3))

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 30, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 95702187-b9ba-4586-9eae-13dbbcdc226a)

In [29]:
# Create WeatherKey using crc32 (numeric surrogate key)
dim_weather = stg_weather.withColumn(
    "WeatherKey", crc32(col("Datetime").cast("string"))
)

# Rearrange columns
other_cols = [c for c in dim_weather.columns if c != "WeatherKey"]
dim_weather = dim_weather.select(["WeatherKey"] + other_cols)

# Add -1 row for unknown weather
unknown_row = spark.createDataFrame([
    (-1, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
], dim_weather.schema)

dim_weather = dim_weather.unionByName(unknown_row)

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 31, Finished, Available, Finished)

In [30]:
# Check for duplicate keys
dim_weather = deduplicate(dim_weather, "WeatherKey")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 32, Finished, Available, Finished)

No duplicate WeatherKey values found.


In [31]:
# Save as Delta table
dim_weather.write.format("delta").mode("overwrite").saveAsTable("dim_weather")

StatementMeta(, cfe31aee-a7f7-4092-b1b4-8dffd561c8cf, 33, Finished, Available, Finished)