This notebook analyzes City of Cape Town service alerts, both planned and unplanned, and summarizes them by service area.

In [23]:
import json
import pandas as pd
from pathlib import Path
import glob

# Directory holding the exported service-alert JSON files.
service_alerts_dir = Path('service-alerts-data')


def _latest_file(directory, pattern):
    """Return the greatest path in `directory` that matches the glob `pattern`.

    Paths are compared with `max`, i.e. by name ordering, so this selects the
    newest export only if the file-name suffix sorts chronologically
    (NOTE(review): assumes a sortable date/timestamp suffix -- confirm).

    Raises:
        FileNotFoundError: if no file matches, instead of the opaque
            "max() arg is an empty sequence" ValueError.
    """
    latest = max(directory.glob(pattern), default=None)
    if latest is None:
        raise FileNotFoundError(
            f"No files matching '{pattern}' found in '{directory}'"
        )
    return latest


def _load_json(path):
    """Parse the JSON document stored at `path`.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# Find the latest service alerts files
planned_file = _latest_file(service_alerts_dir, 'service_alerts_planned_*.json')
unplanned_file = _latest_file(service_alerts_dir, 'service_alerts_unplanned_*.json')

# Read the JSON files
planned_alerts = _load_json(planned_file)
unplanned_alerts = _load_json(unplanned_file)

# Convert to DataFrames
planned_df = pd.DataFrame(planned_alerts)
unplanned_df = pd.DataFrame(unplanned_alerts)

# Add a column to identify the alert type
planned_df['alert_type'] = 'Planned'
unplanned_df['alert_type'] = 'Unplanned'

# Combine the dataframes
all_alerts = pd.concat([planned_df, unplanned_df], ignore_index=True)

# Date/timestamp columns to profile, in display order.
DATE_COLUMNS = [
    'publish_date',
    'effective_date',  # Map display: effective_date >= today
    'expiry_date',
    'start_timestamp',
    'forecast_end_timestamp',
]

# For each column, print how many alerts share each distinct value,
# followed by a blank separator line.
for column in DATE_COLUMNS:
    print(all_alerts[column].value_counts())
    print()


publish_date
2025-06-06T22:00:00.000Z    9
2025-06-08T22:00:00.000Z    8
2025-06-07T22:00:00.000Z    4
2025-05-19T22:00:00.000Z    2
2024-07-14T22:00:00.000Z    2
Name: count, dtype: int64

effective_date
2025-06-06T22:00:00.000Z    9
2025-06-08T22:00:00.000Z    8
2025-06-07T22:00:00.000Z    4
2025-05-20T22:00:00.000Z    2
2024-07-14T22:00:00.000Z    2
Name: count, dtype: int64

expiry_date
2025-06-09T22:00:00.000Z    20
2025-06-11T22:00:00.000Z     2
2025-06-16T22:00:00.000Z     1
2027-04-15T22:00:00.000Z     1
2027-07-15T22:00:00.000Z     1
Name: count, dtype: int64

start_timestamp
2025-06-09T06:00:00.000Z    2
2025-06-07T16:20:00.000Z    2
2025-05-21T17:00:00.000Z    2
2025-06-08T22:00:00.000Z    1
2025-06-07T19:19:00.000Z    1
2024-07-15T15:00:00.000Z    1
2025-06-07T06:30:00.000Z    1
2025-06-07T09:00:00.000Z    1
2025-06-07T09:25:00.000Z    1
2025-06-07T10:02:00.000Z    1
2025-06-07T11:05:00.000Z    1
2025-06-07T17:13:00.000Z    1
2025-06-08T10:00:00.000Z    1
2025-06-07T23:00:0

Let's analyze the distribution of alerts across different service areas, split by planned vs unplanned:

In [24]:
# Build the per-service-area summary in one chain:
#   1. count non-null 'Id' values for every (service_area, alert_type) pair,
#      showing 0 where a combination has no alerts;
#   2. append a 'Total' column holding each row's sum across alert types;
#   3. order the rows from the most alerts to the fewest.
service_area_summary = (
    all_alerts
    .pivot_table(
        values='Id',
        index='service_area',
        columns='alert_type',
        aggfunc='count',
        fill_value=0,
    )
    .assign(Total=lambda counts: counts.sum(axis=1))
    .sort_values('Total', ascending=False)
)

# Header, rule and table are emitted one per line.
print("Service Alerts Summary:", "-" * 60, service_area_summary, sep="\n")

# Overall row counts, taken from the combined and the per-type frames.
print("\nTotal Alerts:", len(all_alerts))
print("Planned Alerts:", len(planned_df))
print("Unplanned Alerts:", len(unplanned_df))

Service Alerts Summary:
------------------------------------------------------------
alert_type                 Planned  Unplanned  Total
service_area                                        
Water & Sanitation               1         11     12
Electricity                      0          9      9
Drivers Licence Enquiries        0          2      2
Roads and Stormwater             2          0      2

Total Alerts: 25
Planned Alerts: 3
Unplanned Alerts: 22


Let's create a bar chart to visualize the distribution of alerts by service area: