In [1]:
import os
from pathlib import Path

import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterra as rt
from matplotlib.colors import Normalize, PowerNorm

from measure_io import load_measure
from measure_io import create_measure

# global vars
CRS = 'EPSG:3310' # this is california albers

# file paths
# Root folder of the raw inputs. Defaults to the original author's local layout;
# set the PSPS_RAW_DATA_DIR environment variable to point the notebook at a
# different location without editing any cell.
RAW_DATA_DIR = Path(
    os.environ.get(
        "PSPS_RAW_DATA_DIR",
        "/Users/laurenwilner/Desktop/Desktop/epidemiology_PhD/01_data/raw/",
    )
)

In [2]:
# functions
def load_circuit(path):
    """Read one circuit layer and return a single geometry per circuit.

    Column names are lower-cased, all features that share a ``circuit_na``
    value are dissolved into one geometry, and the result is reprojected to
    the notebook-wide ``CRS``.

    Parameters
    ----------
    path : str or Path
        Vector file passed to ``geopandas.read_file``.

    Returns
    -------
    GeoDataFrame
        Columns ``circuit_na`` and ``geometry``, one row per circuit name.
    """
    raw_layer = gpd.read_file(path)
    lowercased = raw_layer.rename(columns=lambda name: name.lower())
    # dissolve() moves circuit_na into the index; reset_index() restores it as a column
    per_circuit = lowercased.dissolve("circuit_na")
    outlines = per_circuit["geometry"].reset_index()
    return outlines.to_crs(CRS)

def load_all_circuits():
    """Load the PG&E, SCE and SDG&E circuit polygons and stack them into one table.

    Each provider file is read through ``load_circuit``. The polygon layers are
    used for all three providers because SDG&E is only available as polygons
    (per the data notes, the polygons are the polylines with an 18 m buffer).

    Returns
    -------
    GeoDataFrame
        All circuits from the three providers, with a fresh 0..n-1 index.
    """
    circuit_dir = RAW_DATA_DIR / "psps_circuit_data/ICA_circuits"

    pge = load_circuit(circuit_dir / "PGE/PGE_circuits_polygons/PGE_circuits_polygons.shp")
    sce = load_circuit(circuit_dir / "SCE/SCE_circuits_polygon/SCE_circuits_polygon.shp")
    sdge = load_circuit(circuit_dir / "SDGE_circuits/SDGE_circuits.shp")

    # Each provider frame comes back indexed 0..n-1, so a plain concat would
    # repeat row labels; ignore_index gives the combined table unique labels.
    # NOTE(review): circuit names are assumed unique across providers -- a name
    # shared by two utilities would be treated as one circuit downstream; confirm.
    return pd.concat([pge, sce, sdge], ignore_index=True)

In [3]:
# load and format data

# ca census tracts, merged into one California outline
ca_ct = load_measure("ca_census_tracts").to_crs(CRS)
# union_all() is the replacement for the deprecated `unary_union` attribute
# (requires geopandas >= 1.0)
ca_shape = ca_ct.union_all()

# zcta boundaries: keep every ZCTA whose geometry intersects the California outline
zcta_boundaries = load_measure("us_zcta_boundaries").to_crs(CRS)
print("loaded boundaries")
zcta_ca = zcta_boundaries[zcta_boundaries.intersects(ca_shape)]

# psps data
psps = load_measure("us_circuit_psps_daily_plus_eDME")
print("loaded psps")

# load circuits
all_circuits = load_all_circuits()
print("loaded all circuits")

  ca_shape = ca_ct.unary_union


loaded boundaries
loaded psps
loaded all circuits


In [4]:
# Rebuild the California ZCTA subset (same steps as the load cell above) so it
# can optionally be written out as its own measure.
# union_all() is the replacement for the deprecated `unary_union` attribute
# (requires geopandas >= 1.0)
ca_shape = ca_ct.union_all()
zcta_boundaries = load_measure("us_zcta_boundaries").to_crs(CRS)
zcta_ca = zcta_boundaries[zcta_boundaries.intersects(ca_shape)]
# create_measure(zcta_ca, "ca_zcta_shp")

  ca_shape = ca_ct.unary_union


In [6]:
# Overlay circuits with ZCTAs (using overlay since the circuit:ZCTA ratio is smaller).
# The intersection yields one row per circuit-ZCTA piece, with the ZCTA columns
# joined onto the circuit columns.
circuits_zcta = gpd.overlay(
    all_circuits,
    zcta_ca,
    how="intersection",
    keep_geom_type=True,
)

In [10]:
# Number of customers and eDME customers impacted by each circuit-ZCTA overlap.

# Weight of each circuit-ZCTA piece = its share of the circuit's total overlaid
# area, so a circuit's weights sum to 1 across the ZCTAs it crosses.
circuits_zcta["intersect_area"] = circuits_zcta.area
circuits_zcta["circuit_area"] = circuits_zcta.groupby("circuit_na")["intersect_area"].transform("sum")
circuits_zcta["weight"] = circuits_zcta["intersect_area"] / circuits_zcta["circuit_area"]

# collapse psps to one row per event-circuit
count_cols = ["total_customers_impacted", "medical_baseline_customers_impacted"]
psps_collapse = psps[["circuit_name_ica", "psps_event_id", *count_cols]].drop_duplicates()

# Join only the columns the apportionment needs; this keeps the geometry and the
# remaining ZCTA attributes out of the many-to-many merge.
circuit_weights = circuits_zcta[["circuit_na", "zcta", "weight"]]

# Inner join: PSPS circuits with no matching geometry have no ZCTA to be
# assigned to (a left join would only add all-NaN rows that the groupby below
# discards anyway).
# NOTE(review): customers on such unmatched circuits drop out of the numerator.
customers_impacted_zcta = psps_collapse.merge(
    circuit_weights,
    how="inner",
    left_on="circuit_name_ica",
    right_on="circuit_na",
)

# multiply the weight by the customer counts to find the customers impacted for each ZCTA piece
customers_impacted_zcta[count_cols] = customers_impacted_zcta[count_cols].mul(
    customers_impacted_zcta["weight"], axis=0
)

# sum up the customers impacted for circuits that fall within the same zcta
customers_impacted_zcta = (
    customers_impacted_zcta
    .groupby(["zcta", "psps_event_id"])[count_cols]
    .sum()
    .reset_index()
)

# subset to rows with total_customers_impacted>0
customers_impacted_zcta = customers_impacted_zcta[customers_impacted_zcta["total_customers_impacted"] > 0]

# merge back on psps metadata
# take the min outage start, max outage end, and recalc duration
# NOTE(review): the window is computed per psps_event_id over all of the event's
# rows, so every ZCTA in an event gets the event-wide start/end rather than the
# window of the circuits that actually reach that ZCTA.
psps_summary = psps.groupby("psps_event_id").agg(
    outage_start=("outage_start", "min"),
    outage_end=("outage_end", "max"),
)
psps_summary["duration"] = psps_summary["outage_end"] - psps_summary["outage_start"]
customers_impacted_zcta = customers_impacted_zcta.merge(psps_summary, how="inner", on="psps_event_id")

customers_impacted_zcta

Unnamed: 0,zcta,psps_event_id,total_customers_impacted,medical_baseline_customers_impacted,outage_start,outage_end,duration
0,90265,SCE_2019/10/27,2273.000000,13.000000,2019-10-27 10:57:00,2019-11-04 01:28:00,7 days 14:31:00
1,90265,SCE_2020/11/29,455.000000,6.000000,2020-12-03 01:20:50,2020-12-05 02:30:00,2 days 01:09:10
2,90265,SCE_2020/12/04,2336.000000,19.000000,2020-12-07 13:29:00,2020-12-11 19:45:00,4 days 06:16:00
3,90265,SCE_2021/01/12,3050.465022,31.629761,2021-01-14 12:13:00,2021-01-22 02:01:00,7 days 13:48:00
4,90265,SCE_2021/11/21,1129.000000,20.000000,2021-11-21 13:41:00,2021-11-22 21:45:00,1 days 08:04:00
...,...,...,...,...,...,...,...
3638,96155,PG&E_2020/10/25,205.944517,4.271096,2020-10-25 17:00:00,2020-10-29 05:25:00,3 days 12:25:00
3639,96161,PG&E_2019/10/09,650.505891,3.428655,2019-10-09 07:09:00,2019-10-13 00:41:00,3 days 17:32:00
3640,96161,PG&E_2019/10/26,645.751591,3.825208,2019-10-26 23:00:00,2019-11-02 21:25:00,6 days 22:25:00
3641,96161,PG&E_2020/09/07,644.833546,9.169497,2020-09-07 11:25:00,2020-11-13 20:38:00,67 days 09:13:00


In [12]:
# Persist the ZCTA-by-event numerator table (customers and medical-baseline
# customers impacted) through the project's measure writer.
NUMERATOR_MEASURE = "ca_zcta_psps_eDME-customers_impacted"
create_measure(customers_impacted_zcta, NUMERATOR_MEASURE)

Writing data to  /Users/laurenwilner/Desktop/Desktop/epidemiology_PhD/01_data/clean/ca_zcta_psps_eDME-customers_impacted.parquet
