In [None]:
import json
import pandas as pd
import glob
import os

# When the kernel starts inside the "notebooks_clean" folder, step up one
# level so the relative paths used by later cells resolve from its parent.
in_notebook_dir = os.path.split(os.getcwd())[1] == "notebooks_clean"
if in_notebook_dir:
    os.chdir("..")
print(f"Current Working Directory: {os.getcwd()}")

## **1. Process Incidents Data**
Flatten the nested JSON structure of incidents into a tabular format.

In [None]:
# Load the raw incidents file. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding (JSON is UTF-8).
with open("datasets/processed-incidents.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

all_incidents = []

# Each top-level entry maps an incident id to a group that carries a list of
# incident records ('incidents') and a list of sensors ('affected-sensors').
for incident_id, incident_data in json_data.items():
    # `or []` also covers an 'incidents' key that is present but null, which
    # would otherwise fail when iterated.
    incidents_list = incident_data.get('incidents') or []
    affected_sensors = incident_data.get('affected-sensors', [])

    for incident in incidents_list:
        # Shallow copy so the extra columns are not written back into json_data.
        incident_row = incident.copy()
        incident_row['incident_id'] = incident_id
        incident_row['affected_sensors'] = affected_sensors

        # Split the coordinate pair into two float columns. Records with a
        # missing or empty coordinate get no latitude/longitude keys.
        coordinate = incident_row.get('coordinate')
        if coordinate:
            incident_row['latitude'] = float(coordinate[0])
            incident_row['longitude'] = float(coordinate[1])

        all_incidents.append(incident_row)

# One row per incident record; persisted as CSV and previewed.
incidents_df = pd.DataFrame(all_incidents)
incidents_df.to_csv("datasets/incidents.csv", index=False)
print(f"Saved incidents.csv with {len(incidents_df)} records.")
incidents_df.head(3)

## **2. Process Traffic Counts (2018 & 2019)**
Aggregate traffic count JSON files from multiple directories.

In [None]:
# Column order of the output CSV. Passing it to the DataFrame constructor
# keeps the header present even when no records were collected.
traffic_columns = ['camera_name', 'img_jpg', 'inbound', 'outbound', 'timestamp']
traffic_data_list = []

# Process both 2018 and 2019 folders
for year in ['2018', '2019']:
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the resulting CSV reproducible across runs and machines.
    files = sorted(glob.glob(f"./trafficcounts/{year}/*.json"))
    print(f"Found {len(files)} files in {year}...")

    for file_path in files:
        # Explicit encoding so parsing does not depend on the platform default.
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # File layout as read here: {camera_name: {image_filename: record}}.
        for camera_name, image_dict in data.items():
            for img_filename, record in image_dict.items():
                # Missing fields become None (empty cells in the CSV).
                traffic_data_list.append({
                    'camera_name': camera_name,
                    'img_jpg': img_filename,
                    'inbound': record.get('inbound'),
                    'outbound': record.get('outbound'),
                    'timestamp': record.get('timestamp')
                })

# One row per (camera, image) record; persisted as CSV and previewed.
traffic_df = pd.DataFrame(traffic_data_list, columns=traffic_columns)
traffic_df.to_csv("datasets/traffic_dataset.csv", index=False)
print(f"Saved traffic_dataset.csv with {len(traffic_df)} records.")
traffic_df.head(3)

## **3. Process Graph Topology (Sensor Locations)**
Extract sensor metadata (latitude, longitude, camera name) from the graph JSON files.

In [None]:
# Column order of the output CSV. Passing it to the DataFrame constructor
# keeps the header present even when no sensors were found.
graph_columns = ["sensor_id", "latitude", "longitude", "camera_name"]
graph_rows = []

graph_files = [
    "./graphs/buffalogrove/buffalogrove-graph.json",
    "./graphs/gurnee/gurnee-graph.json"
]

for g_file in graph_files:
    # Explicit encoding so parsing does not depend on the platform default.
    with open(g_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # A file without a "sensor-dictionary" key contributes no rows.
    for sensor_id, info in data.get("sensor-dictionary", {}).items():
        # info structure: [[lat, long], "camera_name", ...]
        graph_rows.append({
            "sensor_id": sensor_id,
            "latitude": info[0][0],
            "longitude": info[0][1],
            "camera_name": info[1]
        })

# One row per sensor across all graph files; persisted as CSV and previewed.
graph_df = pd.DataFrame(graph_rows, columns=graph_columns)
graph_df.to_csv("datasets/graph_dataset.csv", index=False)
print(f"Saved graph_dataset.csv with {len(graph_df)} sensors.")
graph_df.head(3)