# Melbourne Parking Data Cleaning & Processing

In [1]:
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Set base directory
base_dir = "./Datasets" 

sensors_df = pd.read_csv(f"{base_dir}/on-street-parking-bay-sensors.csv")
bays_df = pd.read_csv(f"{base_dir}/on-street-parking-bays.csv")
zones_df = pd.read_csv(f"{base_dir}/parking-zones-linked-to-street-segments.csv")
restrictions_df = pd.read_csv(f"{base_dir}/sign-plates-located-in-each-parking-zone.csv")

## 1. Sensor Data

**Purpose:** Prepare real-time sensor data for analysis

- Remove parking zones with null values
- Filter out sensor readings older than 30 minutes to ensure real-time relevance
- Validate that each KerbsideID is unique

In [3]:
# Make a copy
sensors_clean = sensors_df.copy()

# 1. Remove null zones
sensors_clean = sensors_clean[sensors_clean["Zone_Number"].notna()]

# 2. Remove data older than 30 mins
sensors_clean["Status_Timestamp"] = pd.to_datetime(sensors_clean["Status_Timestamp"])
latest_time = sensors_clean['Status_Timestamp'].max()
cutoff_time = latest_time - pd.Timedelta(minutes=30)

# cutoff_time = datetime.now() - pd.Timedelta(minutes=30)

sensors_clean = sensors_clean[sensors_clean["Status_Timestamp"] >= cutoff_time]

In [4]:
# Check if KerbsideIDs are unique after cleaning
kerbside_count = len(sensors_clean)
unique_kerbside_count = sensors_clean['KerbsideID'].nunique()

print(f"Total rows: {kerbside_count}")
print(f"Unique KerbsideIDs: {unique_kerbside_count}")
print(f"Are all KerbsideIDs unique? {kerbside_count == unique_kerbside_count}")

# If not unique, see the duplicates
if kerbside_count != unique_kerbside_count:
   duplicates = sensors_clean[sensors_clean.duplicated(subset=['KerbsideID'], keep=False)]
   print(f"Duplicate KerbsideIDs: {len(duplicates)}")

Total rows: 661
Unique KerbsideIDs: 661
Are all KerbsideIDs unique? True


In [5]:
sensors_clean.head()

Unnamed: 0,Lastupdated,Status_Timestamp,Zone_Number,Status_Description,KerbsideID,Location
450,2025-08-09T10:29:39+10:00,2025-08-09 10:28:08+10:00,7649.0,Present,50663,"-37.80908828360667, 144.97187134487461"
454,2025-08-09T10:29:39+10:00,2025-08-09 10:26:27+10:00,7649.0,Present,50669,"-37.80878490795517, 144.97173126985038"
455,2025-08-09T10:29:39+10:00,2025-08-09 10:17:32+10:00,7649.0,Present,50670,"-37.80873508250238, 144.97170828952787"
463,2025-08-09T10:29:39+10:00,2025-08-09 10:19:58+10:00,7558.0,Unoccupied,65183,"-37.810487086510605, 144.9687299278653"
464,2025-08-09T10:29:39+10:00,2025-08-09 10:09:06+10:00,7556.0,Present,65162,"-37.81057630472673, 144.9680868422238"


## 2. Clean Zones Data

**Purpose:** Map parking zones to street names

- Filter parking zones to match available sensor data
- Create street-to-zone mappings for location grouping
- Identify streets with unique parking zones from all total sensor zones

In [6]:
# Make a copy of the zones data and get street names
zones_clean = zones_df.copy()

# Just the zone numbers from cleaned sensor data
sensor_zones = sensors_clean['Zone_Number'].unique()
filtered_zones = zones_clean[zones_clean['ParkingZone'].isin(sensor_zones)]
street_zones_dict = filtered_zones.groupby('OnStreet')['ParkingZone'].apply(list).to_dict()

In [7]:
# Check result
print(f"Streets found: {len(street_zones_dict)}")
print("Street-zone mappings:")
for street, zones in list(street_zones_dict.items())[:10]:
   print(f"  {street}: {zones}")

Streets found: 52
Street-zone mappings:
  Albert Street: [7236, 7236, 7237, 7239, 7245, 7234]
  Aquitania Way: [7156, 7156]
  Blackwood Street: [7708]
  Bond Street: [7331, 7329]
  Bourke Street: [7340, 7332, 7335, 7579, 7178, 7173, 7333, 7345, 7334, 7339, 7336, 7343, 7320, 7170]
  Bromby Street: [7348, 7348, 7348]
  Cathedral Place: [7247]
  Church Street: [7350]
  Cobden Street: [7721, 7716, 7716, 7721]
  Collins Street: [7355, 7359, 7358, 7366, 7360, 7363, 7190, 7189, 7356, 7362, 7344]


In [8]:
# Count total unique zones across all streets
all_zones = set()
for zones in street_zones_dict.values():
   all_zones.update(zones)

total_unique_zones = len(all_zones)
print(f"Total unique zones across all streets: {total_unique_zones}")

# Compare with sensor data
sensor_zones_count = sensors_clean['Zone_Number'].nunique()
print(f"Zones in your sensor data: {sensor_zones_count}")

# List out the missing zones
missing_zones = set(sensors_clean['Zone_Number'].unique()) - set(all_zones)
print(f"Missing zones: {len(missing_zones)}")
print(missing_zones)

Total unique zones across all streets: 175
Zones in your sensor data: 199
Missing zones: 24
{7689.0, 7690.0, 7949.0, 7696.0, 7697.0, 7591.0, 7593.0, 7347.0, 7478.0, 7485.0, 7365.0, 7621.0, 7503.0, 7506.0, 7763.0, 7764.0, 7765.0, 7769.0, 7772.0, 7389.0, 7780.0, 7792.0, 7542.0, 7676.0}


## 3. Get Street Coordinates from Bays Data

**Purpose**: Get geographic coordinates for mapping

- Map Segment_IDs to street names from zones data
- Merge latitude/longitude coordinates from parking bays dataset
- **Calculate average coordinates per street name** (handling multiple segments per street)
- Results: X streets with representative coordinate points

In [9]:
# Get Segment_IDs for the streets
street_segments = filtered_zones["Segment_ID"].unique()

# Get lat/lon from parking bays using Segment_IDs
bays_clean = bays_df.copy()
street_locations = bays_clean[bays_clean["RoadSegmentID"].isin(street_segments)]

# Get average lat/lon for each segment
street_coordinates = street_locations.groupby('RoadSegmentID').agg({
    'Latitude': 'mean',
    'Longitude': 'mean'
}).reset_index()

# Link segments to street names and average by street
# Link segments to street names
segment_to_street = filtered_zones[['Segment_ID', 'OnStreet']].drop_duplicates()

# Merge coordinate data
street_coords = street_coordinates.merge(segment_to_street, left_on='RoadSegmentID', right_on='Segment_ID')

# Average coordinates by street name
street_avg_coords = street_coords.groupby('OnStreet').agg({
    'Latitude': 'mean',
    'Longitude': 'mean'
}).reset_index()

print(f"Got coordinates for {len(street_avg_coords)} streets")
print(street_avg_coords.head())

Got coordinates for 52 streets
           OnStreet   Latitude   Longitude
0     Albert Street -37.809793  144.977912
1     Aquitania Way -37.813803  144.942232
2  Blackwood Street -37.801301  144.955081
3       Bond Street -37.818115  144.962761
4     Bourke Street -37.815462  144.958880


## 4. Filter Restrictions Data

**Purpose:** Identify parkable zones and their time restrictions

- Filter restrictions to match sensor zone numbers
- Remove "Never Available" zones (Prohibited Parking/Disabled Parking)
- Extract parking time restrictions, days, and display information

In [10]:
# Get restrictions for your sensor zones
sensor_zone_numbers = sensors_clean["Zone_Number"].unique()

# Filter restrictions for your zones
zone_restrictions = restrictions_df[restrictions_df["ParkingZone"].isin(sensor_zone_numbers)].copy()

# Remove "Never Available" zones (PP, DP)
parkable_restrictions = zone_restrictions[~zone_restrictions["Restriction_Display"].str.contains("PP|DP", na=False)]

# Get the columns you need
restrictions_display = parkable_restrictions[[
    "ParkingZone", "Restriction_Days", "Time_Restrictions_Start",
    "Time_Restrictions_Finish", "Restriction_Display"
]].copy()

In [11]:
parkable_unique_zones = parkable_restrictions["ParkingZone"].nunique()
print(f"Unique parkable zones: {parkable_unique_zones}")

Unique parkable zones: 199
