In [268]:
from pyspark.sql.functions import col, trim, initcap, to_timestamp, count, when, split, trim, from_utc_timestamp, date_format, hour, to_date, month, concat_ws, lpad, lit, countDistinct, substring
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, LongType, IntegerType
from pyspark.sql.functions import coalesce
from datetime import datetime, timedelta
import re

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 270, Finished, Available, Finished)

In [241]:
# Function to report null counts for each column
def report_nulls(df):
    """Print the number of null values found in each column of ``df``.

    One null-counting aggregate is built per column and all of them are
    evaluated in a single ``select``. The one-row result is collected to
    pandas and transposed so that every source column becomes a row under
    the heading 'Count of nulls'. Nothing is returned.
    """
    null_aggregates = []
    for name in df.columns:
        # when() without otherwise() yields null for non-matching rows and
        # count() ignores nulls, so only rows where the column IS NULL are tallied.
        null_marker = when(col(name).isNull(), name)
        null_aggregates.append(count(null_marker).alias(name))

    summary = df.select(null_aggregates).toPandas().T
    summary.columns = ['Count of nulls']
    print(f"\nNull counts:\n{summary}")

# Function to drop duplicates and report counts
def drop_duplicates(df):
    """Remove fully identical rows from ``df`` and print how many were dropped.

    The row count is taken before and after ``dropDuplicates()`` (two
    separate count actions) to build the printed report.

    Returns:
        The de-duplicated DataFrame.
    """
    rows_before = df.count()
    deduplicated = df.dropDuplicates()
    rows_after = deduplicated.count()
    dropped = rows_before - rows_after
    print(f"\nDropped {dropped} duplicate rows (from {rows_before} to {rows_after})")
    return deduplicated

# Function to drop rows where ALL of the specified columns are null + report counts
def drop_na_columns(df, cols):
    """Drop rows in which ALL of the given columns are null, and report the count.

    Args:
        df: Spark DataFrame to filter.
        cols: A single column name or an iterable of column names. A row is
            removed only when every listed column is null; rows with at
            least one non-null value among them are kept.

    Returns:
        The filtered DataFrame. When ``cols`` is empty there is no predicate
        to apply, so ``df`` is returned unchanged.
    """
    cols = [cols] if isinstance(cols, str) else list(cols)
    if not cols:
        # No columns means no predicate can be built; skip filtering rather than fail.
        print("\nRemoved 0 rows: no columns were specified")
        return df

    initial_count = df.count()

    # Build the "all columns are null" predicate from Column objects instead of
    # SQL text, so names with spaces or reserved words need no manual quoting.
    all_null = col(cols[0]).isNull()
    for name in cols[1:]:
        all_null = all_null & col(name).isNull()

    df = df.filter(~all_null)  # keep every row that has at least one non-null value
    final_count = df.count()
    rows_removed = initial_count - final_count
    col_list_str = ", ".join(cols)
    print(f"\nRemoved {rows_removed} rows where all of these columns are null: {col_list_str}")
    return df

# Function to trim string columns
def trim_string_columns(df, cols=None):
    """Trim spaces from both ends of the values in string columns.

    Args:
        df: Spark DataFrame to clean.
        cols: Names of the columns to trim. When omitted, every column whose
            schema type is ``StringType`` is trimmed.

    Returns:
        The DataFrame with the trimmed columns; the list of trimmed columns
        is printed.
    """
    if cols is None:
        cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StringType)
        ]

    trimmed = df
    for name in cols:
        trimmed = trimmed.withColumn(name, trim(col(name)))

    print(f"\nTrimmed strings in columns: {cols}")
    return trimmed

# Function to rename columns to PascalCase
def rename_columns(df):
    """Rename every column of ``df`` to PascalCase and return the result.

    A name is broken into words at each capital letter and at '/', '-', '_'
    or whitespace. Every word is capitalized (first character upper-cased,
    the rest lower-cased) and the words are joined without a separator, e.g.
    'park_name' -> 'ParkName', 'placeID' -> 'PlaceID'. Because every capital
    starts a new word, an all-caps name such as 'PARK_NAME' becomes 'PARKNAME'.
    """
    def _pascal_case(name):
        # Zero-width match in front of every capital except one at the very
        # start of the name: splits camelCase / PascalCase into words.
        with_breaks = re.sub(r'(?<!^)(?=[A-Z])', ' ', name)
        for separator in ("/", "-", "_"):
            with_breaks = with_breaks.replace(separator, " ")
        return ''.join(word.capitalize() for word in with_breaks.split())

    renamed = df
    for original_name in df.columns:
        renamed = renamed.withColumnRenamed(original_name, _pascal_case(original_name))
    print("\nRenamed columns to PascalCase\n")
    return renamed

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 243, Finished, Available, Finished)

### Parks ID

In [184]:
# Load the raw Parks ID table
stg_parks_id = spark.table("raw_parks_id")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 186, Finished, Available, Finished)

In [186]:
# Discard the scrape timestamp column
stg_parks_id = stg_parks_id.drop("scrapedAt")

# Cleaning pipeline: null report -> de-duplicate -> drop rows with a null placeID -> trim text
report_nulls(stg_parks_id)
stg_parks_id = drop_na_columns(drop_duplicates(stg_parks_id), ["placeID"])
stg_parks_id = trim_string_columns(
    stg_parks_id,
    cols=["Title", "URL", "placeID", "description", "categoryName"],
)

# Give Title and URL explicit names first, then PascalCase every column
stg_parks_id = rename_columns(
    stg_parks_id
    .withColumnRenamed("Title", "ParkName")
    .withColumnRenamed("URL", "GoogleURL")
)

# Show the resulting schema
stg_parks_id.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 188, Finished, Available, Finished)


Null counts:
              Count of nulls
Title                      0
URL                        0
placeId                    0
description               28
categoryName               0

Dropped 0 duplicate rows (from 68 to 68)

Removed 0 rows where all of these columns are null: placeID

Trimmed strings in columns: ['Title', 'URL', 'placeID', 'description', 'categoryName']

Renamed columns to PascalCase

root
 |-- ParkName: string (nullable = true)
 |-- GoogleURL: string (nullable = true)
 |-- PlaceID: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- CategoryName: string (nullable = true)



In [188]:
# Persist as a Delta table, replacing any existing contents
stg_parks_id.write.saveAsTable("stg_parks_id", format="delta", mode="overwrite")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 190, Finished, Available, Finished)

### Events

In [189]:
# Load the raw events table
stg_events = spark.table("raw_events")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 191, Finished, Available, Finished)

In [191]:
# Discard the GIS identifier columns
stg_events = stg_events.drop("OBJECTID", "GISParkID")

# Cleaning pipeline: null report -> de-duplicate -> drop rows where park_name,
# start_time and end_time are all null -> trim text
report_nulls(stg_events)
stg_events = drop_na_columns(
    drop_duplicates(stg_events),
    ["park_name", "start_time", "end_time"],
)
stg_events = trim_string_columns(stg_events, cols=["park_name", "event_name"])

# Attach PlaceID by matching the event's park_name to ParkName in stg_parks_id.
# The join is inner, so events whose park name has no match are dropped.
park_name_matches = stg_events["park_name"] == stg_parks_id["ParkName"]
events_with_parks = stg_events.join(stg_parks_id, park_name_matches, "inner")
stg_events = events_with_parks.select(stg_events["*"], stg_parks_id["PlaceID"])

# PascalCase every column
stg_events = rename_columns(stg_events)

# Show the resulting schema
stg_events.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 193, Finished, Available, Finished)


Null counts:
            Count of nulls
park_name                0
start_time               0
end_time                 0
event_name               0

Dropped 9 duplicate rows (from 652 to 643)

Removed 0 rows where all of these columns are null: park_name, start_time, end_time

Trimmed strings in columns: ['park_name', 'event_name']

Renamed columns to PascalCase

root
 |-- ParkName: string (nullable = true)
 |-- StartTime: timestamp (nullable = true)
 |-- EndTime: timestamp (nullable = true)
 |-- EventName: string (nullable = true)
 |-- PlaceID: string (nullable = true)



In [194]:
# Persist as a Delta table, replacing any existing contents
stg_events.write.saveAsTable("stg_events", format="delta", mode="overwrite")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 196, Finished, Available, Finished)

### Holidays

In [195]:
# Load the raw holidays table
stg_holidays = spark.table("raw_holidays")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 197, Finished, Available, Finished)

In [197]:
# Parse the holiday strings into timestamps using the pattern M/d/yyyy
holiday_timestamp = to_timestamp(col("holidays"), "M/d/yyyy")
stg_holidays = stg_holidays.withColumn("holidays", holiday_timestamp)

# Cleaning pipeline: null report -> de-duplicate -> drop null holidays -> PascalCase
report_nulls(stg_holidays)
stg_holidays = rename_columns(
    drop_na_columns(drop_duplicates(stg_holidays), ["holidays"])
)

# Show the resulting schema
stg_holidays.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 199, Finished, Available, Finished)


Null counts:
          Count of nulls
holidays               0

Dropped 0 duplicate rows (from 22 to 22)

Removed 0 rows where all of these columns are null: holidays

Renamed columns to PascalCase

root
 |-- Holidays: timestamp (nullable = true)



In [199]:
# Persist as a Delta table, replacing any existing contents
stg_holidays.write.saveAsTable("stg_holidays", format="delta", mode="overwrite")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 201, Finished, Available, Finished)

### Amenities

In [200]:
# Load the raw park amenities table
stg_park_amenities = spark.table("raw_park_amenities")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 202, Finished, Available, Finished)

In [202]:
# Discard the GIS identifier columns
stg_park_amenities = stg_park_amenities.drop("OBJECTID", "GISParkID")

# Cleaning pipeline: null report -> de-duplicate -> drop rows with a null Park -> trim text
report_nulls(stg_park_amenities)
stg_park_amenities = drop_na_columns(drop_duplicates(stg_park_amenities), ["Park"])
stg_park_amenities = trim_string_columns(stg_park_amenities, cols=["placeID", "Park"])

# Rename Park explicitly, then PascalCase every column
stg_park_amenities = rename_columns(
    stg_park_amenities.withColumnRenamed("Park", "ParkName")
)

# Narrow every LongType column to IntegerType
long_columns = [
    f.name for f in stg_park_amenities.schema.fields if isinstance(f.dataType, LongType)
]
for name in long_columns:
    stg_park_amenities = stg_park_amenities.withColumn(name, col(name).cast(IntegerType()))

# Show the resulting schema
stg_park_amenities.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 204, Finished, Available, Finished)


Null counts:
                    Count of nulls
Park                             0
placeID                        192
Score                            0
Volleyball                       0
Cricket                          0
Tennis                           0
Skateboard                       0
RunningTrack                     0
BMX                              0
Pickleball                       0
Parkour                          0
MultiUseSportCourt               0
Lacrosse                         0
DiscGolf                         0
Basketball                       0
Playground                       0
SprayPark                        0
CommunityGarden                  0
OLA                              0
Synturf                          0
ClassA1                          0
ClassB1                          0
ClassB2                          0
ClassC                           0
BallHockey                       0
BaseballDiamond                  0
Washroom                         0
ClassD

In [203]:
# Build stg_park_amenities_wide: the amenity columns stay as separate columns,
# only Score and MaintenanceArea are removed
stg_park_amenities_wide = stg_park_amenities.drop("Score", "MaintenanceArea")

# Preview the first five rows
preview_rows = stg_park_amenities_wide.head(5)
display(preview_rows)

# Persist as a Delta table, replacing any existing contents
stg_park_amenities_wide.write.saveAsTable(
    "stg_park_amenities_wide", format="delta", mode="overwrite"
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 205, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c727a634-ae17-43d6-9713-6e94620436df)

In [204]:
# Build stg_park_amenities_long by unpivoting the wide table in pandas

# Collect the Spark DataFrame to the driver as a pandas DataFrame
pdf = stg_park_amenities.toPandas()

# Every column that is not an identifier becomes an (Amenity, AmenityCount) pair
id_columns = ["ParkName", "PlaceID", "MaintenanceArea", "Score"]
df_long = pdf.melt(id_vars=id_columns, var_name="Amenity", value_name="AmenityCount")

# Keep only rows with a positive count, then drop the two descriptive columns
has_amenity = df_long["AmenityCount"] > 0
df_long = df_long[has_amenity].copy()
stg_park_amenities_long = df_long.drop(columns=["MaintenanceArea", "Score"])

# Preview the first five rows
display(stg_park_amenities_long.head(5))

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 206, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f025cc78-b690-470d-a7b5-b68a0023c187)

In [205]:
# Build stg_park_amenities_summary: one row per distinct combination of the
# identifier columns left in df_long once the amenity columns are removed.
# NOTE(review): df_long only holds rows with AmenityCount > 0, so a park with no
# positive amenity count never reaches this summary - confirm that is intended.
stg_park_amenities_summary = (
    df_long
    .drop(columns=["Amenity", "AmenityCount"])
    .drop_duplicates()
)

# Preview the first five rows and the (rows, columns) shape
display(stg_park_amenities_summary.head(5))
display(stg_park_amenities_summary.shape)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 207, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 0dfef0c0-5f2f-457a-bcb3-489b29fc2c44)

(173, 4)

In [206]:
# Turn both pandas results back into Spark DataFrames
stg_park_amenities_long = spark.createDataFrame(stg_park_amenities_long)
stg_park_amenities_summary = spark.createDataFrame(stg_park_amenities_summary)

# Persist both as Delta tables (the long table is stored under the name stg_park_amenities)
stg_park_amenities_long.write.saveAsTable(
    "stg_park_amenities", format="delta", mode="overwrite"
)
stg_park_amenities_summary.write.saveAsTable(
    "stg_park_amenities_summary", format="delta", mode="overwrite"
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 208, Finished, Available, Finished)

### Classifications

In [207]:
# Load the raw park classifications table
stg_parks_classifications = spark.table("raw_parks_classifications")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 209, Finished, Available, Finished)

In [209]:
# Cast the numeric measure columns to DoubleType
numeric_cols = ["parkSize", "Pop_ha", "Bus_Stop_ha"]
for numeric_col in numeric_cols:
    stg_parks_classifications = stg_parks_classifications.withColumn(
        numeric_col, col(numeric_col).cast(DoubleType())
    )

# Cleaning pipeline: null report -> de-duplicate -> drop rows with a null park_name
# -> trim text -> PascalCase
report_nulls(stg_parks_classifications)
stg_parks_classifications = drop_na_columns(
    drop_duplicates(stg_parks_classifications), ["park_name"]
)
stg_parks_classifications = rename_columns(
    trim_string_columns(
        stg_parks_classifications,
        cols=["park_name", "Classification", "placeID"],
    )
)

# Show the resulting schema
stg_parks_classifications.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 211, Finished, Available, Finished)


Null counts:
                Count of nulls
park_name                    0
Classification               0
placeID                    188
parkSize                     0
Pop_ha                       2
Bus_Stop_ha                  2

Dropped 0 duplicate rows (from 254 to 254)

Removed 0 rows where all of these columns are null: park_name

Trimmed strings in columns: ['park_name', 'Classification', 'placeID']

Renamed columns to PascalCase

root
 |-- ParkName: string (nullable = true)
 |-- Classification: string (nullable = true)
 |-- PlaceID: string (nullable = true)
 |-- ParkSize: double (nullable = true)
 |-- PopHa: double (nullable = true)
 |-- BusStopHa: double (nullable = true)



In [211]:
# Persist as a Delta table, replacing any existing contents
stg_parks_classifications.write.saveAsTable(
    "stg_parks_classifications", format="delta", mode="overwrite"
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 213, Finished, Available, Finished)

### Location

In [212]:
# Load the raw park locations table
stg_parks_location = spark.table("raw_parks_location")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 214, Finished, Available, Finished)

In [214]:
# geo_point_2d is split on ","; item 0 becomes Latitude and item 1 becomes Longitude
geo_parts = split(col("geo_point_2d"), ",")

# Drop PARK_ID, cast AREA_HA to double and add the two coordinate columns
stg_parks_location = (
    stg_parks_location
    .drop("PARK_ID")
    .withColumn("AREA_HA", col("AREA_HA").cast(DoubleType()))
    .withColumn("Latitude", trim(geo_parts.getItem(0)).cast(DoubleType()))
    .withColumn("Longitude", trim(geo_parts.getItem(1)).cast(DoubleType()))
)

# Cleaning pipeline: null report -> de-duplicate -> drop rows with a null PARK_NAME -> trim text
report_nulls(stg_parks_location)
stg_parks_location = drop_na_columns(drop_duplicates(stg_parks_location), ["PARK_NAME"])
stg_parks_location = trim_string_columns(
    stg_parks_location, cols=["PARK_NAME", "PARK_URL", "Geom"]
)

# Rename the all-caps columns explicitly, then PascalCase the rest
stg_parks_location = rename_columns(
    stg_parks_location
    .withColumnRenamed("PARK_NAME", "ParkName")
    .withColumnRenamed("AREA_HA", "AreaHa")
    .withColumnRenamed("PARK_URL", "ParkFinderURL")
)

# Show the resulting schema
stg_parks_location.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 216, Finished, Available, Finished)


Null counts:
              Count of nulls
PARK_NAME                  1
AREA_HA                    5
PARK_URL                  12
Geom                       0
geo_point_2d               0
Latitude                   0
Longitude                  0

Dropped 0 duplicate rows (from 254 to 254)

Removed 1 rows where all of these columns are null: PARK_NAME

Trimmed strings in columns: ['PARK_NAME', 'PARK_URL', 'Geom']

Renamed columns to PascalCase

root
 |-- ParkName: string (nullable = true)
 |-- AreaHa: double (nullable = true)
 |-- ParkFinderURL: string (nullable = true)
 |-- Geom: string (nullable = true)
 |-- GeoPoint2d: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [216]:
# Persist as a Delta table, replacing any existing contents
stg_parks_location.write.saveAsTable("stg_parks_location", format="delta", mode="overwrite")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 218, Finished, Available, Finished)

### Park Observational Study

In [285]:
# Load the raw observational study records and their park ID lookup
raw_parks_observational_study = spark.table("raw_parks_observational_study")
raw_parks_observational_study_id = spark.table("raw_parks_observational_study_id")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 287, Finished, Available, Finished)

In [286]:
# Alias both inputs so their columns can be told apart after the join
obs = raw_parks_observational_study.alias("obs")
obs_id = raw_parks_observational_study_id.alias("obs_id")

# Inner join on park name: observations whose park_selection has no matching
# ParkName in the lookup are dropped
park_match = obs["park_selection"] == obs_id["ParkName"]
joined_df = obs.join(obs_id, on=park_match, how="inner")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 288, Finished, Available, Finished)

In [287]:
# Preview three joined rows. limit() runs in Spark, so only those rows are
# collected to the driver instead of converting the whole DataFrame to pandas.
display(joined_df.limit(3).toPandas())

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 289, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 83e77a66-7b64-4fe6-b2d2-ae8a8f3976fd)

In [288]:
# Discard the audit columns
stg_observational_study = joined_df.drop("OBJECTID", "created_user", "last_edited_user")

# Parse datetime_pst: its first 19 characters are read as yyyy-MM-dd'T'HH:mm:ss,
# then Date and Hour are derived from the parsed timestamp. The helper column
# and the source columns that are no longer needed are dropped at the end.
stg_observational_study = (
    stg_observational_study
    .withColumn("datetime_clean", substring("datetime_pst", 1, 19))
    .withColumn("DateTime", to_timestamp("datetime_clean", "yyyy-MM-dd'T'HH:mm:ss"))
    .withColumn("Date", to_date(col("DateTime")))
    .withColumn("Hour", hour(col("DateTime")))
    .drop("datetime_clean", "datetime_pst", "park_selection")
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 290, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8fff2e81-09ab-414e-b8e4-e136e19b226f)

In [289]:
# Columns cast to IntegerType
integer_cols = [
    "count_male_infant_toddler", "count_female_infant_toddler", "count_nb_infant_toddler",
    "count_male_child", "count_female_child", "count_nb_child",
    "count_male_teen_young_adult", "count_female_teen_young_adult", "count_nb_teen_young_adult",
    "count_male_adult", "count_female_adult", "count_nb_adult",
    "count_male_senior", "count_female_senior", "count_nb_senior",
    "dogs_present",
    "count_all",
    "dogs_present_noleash",
]
# Columns cast to DoubleType
double_cols = ["x", "y"]

# Apply the integer casts first, then the double casts
for target_type, names in ((IntegerType(), integer_cols), (DoubleType(), double_cols)):
    for name in names:
        stg_observational_study = stg_observational_study.withColumn(
            name, col(name).cast(target_type)
        )

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 291, Finished, Available, Finished)

In [290]:
# Report nulls, then de-duplicate and PascalCase the columns
report_nulls(stg_observational_study)
stg_observational_study = rename_columns(drop_duplicates(stg_observational_study))

# Show the resulting schema
stg_observational_study.printSchema()

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 292, Finished, Available, Finished)


Null counts:
                               Count of nulls
count_male_infant_toddler                   0
count_female_infant_toddler                 0
count_nb_infant_toddler                     0
count_male_child                            0
count_female_child                          0
count_nb_child                              0
count_male_teen_young_adult                 0
count_female_teen_young_adult               0
count_nb_teen_young_adult                   0
count_male_adult                            0
count_female_adult                          0
count_nb_adult                              0
count_male_senior                           0
count_female_senior                         0
count_nb_senior                             0
dogs_present                                0
active_level_percent                       12
additional_notes                           65
count_all                                   0
x                                           0
y                   

In [291]:
# Put the key columns first and keep every remaining column in its current order.
# The comprehension variable is deliberately not called `col`, which is the name
# of the pyspark.sql.functions helper imported at the top of the notebook.
priority_cols = ["ParkName", "PlaceID", "DateTime", "Date", "Hour", "CountAll"]
other_cols = [name for name in stg_observational_study.columns if name not in priority_cols]
final_cols = priority_cols + other_cols
stg_observational_study = stg_observational_study.select(final_cols)

# Preview three rows. limit() runs in Spark, so only those rows are collected
# to the driver instead of converting the whole table to pandas.
display(stg_observational_study.limit(3).toPandas())

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 293, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3d63ce55-f44f-4431-9fd3-1cc049e02766)

In [292]:
# Persist as a Delta table, replacing any existing contents
stg_observational_study.write.saveAsTable(
    "stg_observational_study", format="delta", mode="overwrite"
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 294, Finished, Available, Finished)

### Park Attributes (Merged)

In [217]:
# Reload the staged tables that are merged into the park attributes table
stg_parks_classifications = spark.table("stg_parks_classifications")
stg_parks_id = spark.table("stg_parks_id")
stg_park_location = spark.table("stg_parks_location")
stg_park_amenities_summary = spark.table("stg_park_amenities_summary")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 219, Finished, Available, Finished)

In [218]:
# Alias both inputs so same-named columns can be referenced unambiguously
amen = stg_park_amenities_summary.alias("amen")
cls = stg_parks_classifications.alias("cls")

# Inner join on ParkName; both PlaceID columns are present in the joined result
joined_df = amen.join(cls, amen["ParkName"] == cls["ParkName"], "inner")

# Resolve PlaceID into a uniquely named column: the amenities value when it
# is not null, otherwise the classifications value
joined_df = joined_df.withColumn(
    "ResolvedPlaceID", coalesce(amen["PlaceID"], cls["PlaceID"])
)

# Keep the resolved ID, every other amenities column, and the classification
# columns other than the duplicated join key
amen_cols = [f"amen.{c}" for c in amen.columns if c != "PlaceID"]
cls_cols = [f"cls.{c}" for c in cls.columns if c not in ("PlaceID", "ParkName")]
stg_park_attributes = (
    joined_df
    .select("ResolvedPlaceID", *amen_cols, *cls_cols)
    .withColumnRenamed("ResolvedPlaceID", "PlaceID")
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 220, Finished, Available, Finished)

In [220]:
# Alias both inputs so same-named columns can be referenced unambiguously
attr = stg_park_attributes.alias("attr")
loc = stg_park_location.alias("loc")

# Inner join on ParkName
joined_df = attr.join(loc, attr["ParkName"] == loc["ParkName"], "inner")

# PlaceID first, then the remaining attribute columns, then the location
# columns without their copy of the join key
attr_cols = [f"attr.{c}" for c in attr.columns if c != "PlaceID"]
loc_cols = [f"loc.{c}" for c in loc.columns if c != "ParkName"]
stg_park_attributes = joined_df.select(attr["PlaceID"], *attr_cols, *loc_cols)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 222, Finished, Available, Finished)

In [222]:
# Alias both inputs so same-named columns can be referenced unambiguously
attr = stg_park_attributes.alias("attr")
pid = stg_parks_id.alias("pid")

# Inner join on PlaceID
joined_df = attr.join(pid, attr["PlaceID"] == pid["PlaceID"], "inner")

# PlaceID and ParkName are taken from attr; the remaining columns of both
# sides follow, skipping each side's copy of those two columns
shared_keys = ("PlaceID", "ParkName")
attr_cols = [f"attr.{c}" for c in attr.columns if c not in shared_keys]
pid_cols = [f"pid.{c}" for c in pid.columns if c not in shared_keys]
stg_park_attributes = joined_df.select(
    attr["PlaceID"], attr["ParkName"], *attr_cols, *pid_cols
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 224, Finished, Available, Finished)

In [224]:
# Count rows per PlaceID and keep the IDs that occur more than once
rows_per_place = stg_park_attributes.groupBy("PlaceID").agg(count("*").alias("count"))
dups_df = rows_per_place.filter("count > 1")

# Show the repeated PlaceIDs with their row counts
display(dups_df.toPandas())

# Report how many PlaceIDs are repeated
num_dups = dups_df.count()
print(f"Number of PlaceIDs with duplicates: {num_dups}")


StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 226, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 0a1d1c54-008a-4f85-a262-3c77d5e90eb7)

Number of PlaceIDs with duplicates: 1


In [225]:
# Drop rows that are exact duplicates across every column.
# NOTE(review): the PlaceID check in the previous cell reported 1 PlaceID with more
# than one row, yet this step removed 0 rows. Those rows therefore differ in some
# other column, and the repeated PlaceID is still present when the table is saved.
# Confirm that is acceptable.
stg_park_attributes = drop_duplicates(stg_park_attributes)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 227, Finished, Available, Finished)


Dropped 0 duplicate rows (from 65 to 65)


In [226]:
# Persist as a Delta table, replacing any existing contents
stg_park_attributes.write.saveAsTable("stg_park_attributes", format="delta", mode="overwrite")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 228, Finished, Available, Finished)

### Popular Times

In [227]:
# Load the raw popular times table
stg_popular_times = spark.table("raw_popular_times")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 229, Finished, Available, Finished)

In [229]:
# Cleaning pipeline: null report -> de-duplicate -> drop rows where placeId and
# occupancyPercent are both null -> trim text
report_nulls(stg_popular_times)
stg_popular_times = drop_na_columns(
    drop_duplicates(stg_popular_times),
    ["placeId", "occupancyPercent"],
)
stg_popular_times = trim_string_columns(stg_popular_times, cols=["placeId", "DOW"])

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 231, Finished, Available, Finished)


Null counts:
                   Count of nulls
placeId                         0
DOW                             0
hour                            0
occupancyPercent                0
DateTimeCollected               0

Dropped 126 duplicate rows (from 52833588 to 52833462)

Removed 0 rows where all of these columns are null: placeId, occupancyPercent

Trimmed strings in columns: ['placeId', 'DOW']


In [230]:
# Collection time: parse the string to a timestamp, then shift it from UTC to
# Vancouver local time
collected_utc = to_timestamp('DateTimeCollected')
collected_local = from_utc_timestamp(collected_utc, 'America/Vancouver')

stg_popular_times = (
    stg_popular_times
    # Rename columns
    .withColumnRenamed('DOW', 'week_day')
    .withColumnRenamed('occupancyPercent', 'popular_times_percent')
    .withColumnRenamed('placeId', 'PlaceID')
    # Cast hour and percentage to IntegerType
    .withColumn("hour", col("hour").cast(IntegerType()))
    .withColumn("popular_times_percent", col("popular_times_percent").cast(IntegerType()))
    # Replace the raw collection time with the local timestamp
    .withColumn('DateTimeCollected', collected_local)
    # Full day name ('EEEE') and hour of the local collection time
    .withColumn('day_name', date_format('DateTimeCollected', 'EEEE'))
    .withColumn('hour_collected', hour('DateTimeCollected'))
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 232, Finished, Available, Finished)

In [231]:
## Retain greatest amount of data collected

# Keep only the rows whose local collection hour is 18
stg_popular_times = stg_popular_times.filter(col('hour_collected') == 18)

# Two-letter day codes and their full names; any other value is left unchanged
day_codes = [
    ('Fr', 'Friday'),
    ('Mo', 'Monday'),
    ('Tu', 'Tuesday'),
    ('We', 'Wednesday'),
    ('Th', 'Thursday'),
    ('Sa', 'Saturday'),
    ('Su', 'Sunday'),
]
first_code, first_name = day_codes[0]
week_day_full = when(col('week_day') == first_code, first_name)
for code, full_name in day_codes[1:]:
    week_day_full = week_day_full.when(col('week_day') == code, full_name)
week_day_full = week_day_full.otherwise(col('week_day'))

# 'HH:00' text built from the reported hour, left-padded to two digits
hour_text = concat_ws(':', lpad(col('hour').cast('string'), 2, '0'), lit('00'))

stg_popular_times = (
    stg_popular_times
    .withColumn('week_day', week_day_full)
    # Date part and month of the local collection time
    .withColumn('date', to_date('DateTimeCollected'))
    .withColumn('month', month('DateTimeCollected'))
    # Timestamp made of the collection date and the reported hour
    .withColumn(
        'date_time',
        to_timestamp(concat_ws(' ', col('date').cast('string'), hour_text)),
    )
    # Keep rows whose reported week day equals the collection day
    .filter(col('week_day') == col('day_name'))
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 233, Finished, Available, Finished)

In [232]:
# Categorize occupancy: <= 30 is 'Low', 31-70 is 'Medium', > 70 is 'High'.
# 'High' has its own explicit condition instead of otherwise(), so a null
# percentage (for example from a failed integer cast) produces a null
# category rather than being labelled 'High'.
occupancy_percent = col('popular_times_percent')
stg_popular_times = stg_popular_times.withColumn(
    'occupancy',
    when(occupancy_percent <= 30, 'Low')
    .when(occupancy_percent <= 70, 'Medium')
    .when(occupancy_percent > 70, 'High')
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 234, Finished, Available, Finished)

In [235]:
# Keep only the columns that go into the staged table.
# PlaceID is referenced with the exact casing it was given by the earlier rename
# ('placeId' -> 'PlaceID'), so the lookup does not depend on case-insensitive
# column resolution and the name matches the PlaceID column of the other staged tables.
stg_popular_times = stg_popular_times.select(
    'PlaceID', 'date_time', 'date', 'hour', 'popular_times_percent', 'occupancy'
)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 237, Finished, Available, Finished)

In [237]:
# Remove rows that are exact duplicates across the selected columns and print
# the before/after row counts
stg_popular_times = drop_duplicates(stg_popular_times)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 239, Finished, Available, Finished)


Dropped 0 duplicate rows (from 446118 to 446118)


In [238]:
# Rename every column to PascalCase (for example date_time -> DateTime)
stg_popular_times = rename_columns(stg_popular_times)

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 240, Finished, Available, Finished)


Renamed columns to PascalCase



In [239]:
# Persist as a Delta table, replacing any existing contents
stg_popular_times.write.saveAsTable("stg_popular_times", format="delta", mode="overwrite")

StatementMeta(, ec08d598-8668-4e7c-9d51-1a3c93ef3cc8, 241, Finished, Available, Finished)