In [None]:
import os
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterra as rt

from measure_io import load_measure

# global vars
CRS = 'EPSG:3310' # this is california albers

# file paths
# Root folder of the raw inputs. It defaults to the original author's local
# folder; set the PSPS_RAW_DATA_DIR environment variable to point the notebook
# at a different location without editing this cell. An unset or empty variable
# falls back to the default.
RAW_DATA_DIR = Path(
    os.environ.get("PSPS_RAW_DATA_DIR")
    or "/Users/laurenwilner/Desktop/Desktop/epidemiology_PhD/data/raw/"
)

In [None]:
# functions
def load_circuit(path):
    """Read one circuit layer and reduce it to a single geometry per circuit.

    Parameters
    ----------
    path : path-like
        File handed to ``gpd.read_file``.

    Returns
    -------
    A table with a ``circuit_na`` column and a ``geometry`` column,
    reprojected to the notebook-wide ``CRS``.
    """
    features = gpd.read_file(path)
    # lower-case every column name so the circuit-name field is always
    # `circuit_na` (the PGE line layer, for example, spells it `Circuit_Na`)
    lowered = features.rename(columns=lambda name: name.lower())
    # merge all features that share a circuit name into one geometry
    per_circuit = lowered.dissolve("circuit_na")
    # drop every other attribute and move circuit_na from the index back to a column
    outlines = per_circuit["geometry"].reset_index()
    return outlines.to_crs(CRS)

def load_all_circuits():
    """Load the PGE, SCE and SDGE circuit polygons and stack them into one table.

    Every provider file goes through ``load_circuit``, which cleans the layer,
    drops the extra columns and unions the pieces of each circuit. The three
    results are then concatenated in the order PGE, SCE, SDGE. The
    concatenation keeps each provider's own row labels, so index values can
    repeat across providers.

    NOTE: polygon files are used because SDGE is only available as polygons.
    Polygons are polylines with an 18m buffer.
    """
    # paths are relative to RAW_DATA_DIR, one polygon layer per provider
    provider_files = (
        "2023.07.psps_data/ICA_circuits/PGE/PGE_circuits_polygons/PGE_circuits_polygons.shp",
        "2023.07.psps_data/ICA_circuits/SCE/SCE_circuits_polygon/SCE_circuits_polygon.shp",
        "2023.07.psps_data/ICA_circuits/SDGE_circuits/SDGE_circuits.shp",
    )
    per_provider = [load_circuit(RAW_DATA_DIR / rel_path) for rel_path in provider_files]
    return pd.concat(per_provider)
    

In [None]:
# load and format data
# Everything that has a geometry is put into the shared CRS here so the later
# spatial joins and overlays operate in one coordinate system. The print calls
# are progress markers for the slow loads.

# ca census tracts
ca_ct = load_measure("ca_census_tracts").to_crs(CRS)
# single geometry that is the union of all tracts; used just below to pick zctas
ca_shape = ca_ct.unary_union

# zcta boundaries
zcta_boundaries = load_measure("us_zcta_boundaries").to_crs(CRS)
print("loaded boundaries")
# keep every zcta that intersects the california outline (selected, not clipped)
zcta_ca = zcta_boundaries[zcta_boundaries.intersects(ca_shape)]

# households and businesses
# NOTE(review): zcta_hh, zcta_cbp and psps are loaded as-is and are not
# referenced by the other cells shown in this notebook yet.
zcta_hh = load_measure("us_zcta_households")
print("loaded households")
zcta_cbp = load_measure("us_zcta_cbp")
print("loaded businesses")

# psps data
psps = load_measure("us_circuit_psps_by_hr")
print("loaded psps")

# ca pop
# population raster; the no-data value is set to NaN (presumably so empty
# pixels can be dropped with notnull() after conversion to a geodataframe),
# then the raster is reprojected to CRS
ca_pop = rt.load_raster(RAW_DATA_DIR/ "CAPOP_2020_100m_TOTAL.tif").set_no_data_value(np.nan).to_crs(CRS)
print("loaded ca pop")

# load circuits
# one row per circuit for all three providers (see load_all_circuits)
all_circuits = load_all_circuits()
print("loaded all circuits")

In [None]:
# step 1: map to pixels
# plan:
#   - overlay circuits with zctas
#   - overlay pixels with zctas
#   - by zcta:
#       - turn raster data into a geodataframe
#       - subset to only pixels with population in them
#       - intersect this subset with the circuit polys
#       - proportionally split pixel pop if there is some of 2 circuits in a pixel
#       - that gives the pop for each segment of each circuit, which can be
#         aggregated to circuit_name-zcta
#   - end with df with zcta, geometry, pop, circuit_na

ca_gdf = (
    ca_pop.to_gdf()
    # keep only pixels that carry a population value
    .loc[lambda pixels: pixels["value"].notnull()]
    # the old row label becomes an explicit pixel identifier
    .reset_index()
    .rename(columns={"index": "pixel_id"})
    # join zcta onto pop pixel
    .sjoin(zcta_ca)
)

# TODO: join zcta onto the circuit df using an overlay
# TODO: loop over zcta and repeat the per-pixel weighting done in the next cell
# TODO: gather up the per-zcta dfs and concat them together
# TODO: end up with a df with zcta, circuit_na, pop and write it out as a clean
#       df to parquet (fix measure_io to do this)

In [None]:
# not all of our population is covered by these circuits.
#   ca pop covered by these circuits = 29399918
#       circuits.loc[~circuits["pixel_id"].duplicated(), "value"].sum()
#   ca pop over all pixels = 39538230
#       ca_gdf["value"].sum()
#       (this is the total over every pixel in ca_gdf, not the uncovered
#       remainder; the difference, ~10.1 million, is what is not covered)
#
# for any pixel with circuit coverage in this dataset we assume the pixel is
# entirely covered by the circuits in the dataset, i.e. coverage is decided
# pixel by pixel. pop_weight is therefore the relative size of each circuit
# segment within its pixel.
# note: there are also circuits that cover no people, which we already know.
circuits = all_circuits.overlay(ca_gdf, keep_geom_type=True).assign(
    # area of each circuit-by-pixel intersection piece
    intersect_area=lambda seg: seg.area,
    # total intersected area per pixel (sum over the pieces in that pixel)
    pixel_area=lambda seg: seg.groupby("pixel_id")["intersect_area"].transform("sum"),
    # share of the pixel's covered area that belongs to this piece
    pop_weight=lambda seg: seg["intersect_area"] / seg["pixel_area"],
    # pixel population split across pieces by that share
    intersect_population=lambda seg: seg["value"] * seg["pop_weight"],
)
circuit_pop = circuits.groupby("circuit_na")["intersect_population"].sum()

# attach the per-circuit totals back onto all_circuits as a new `pop` column.
# the callable receives all_circuits already indexed by circuit_na and returns
# the totals reindexed to that index, so the result behaves like a left merge:
# every circuit in all_circuits is kept, and circuits without any population
# get 0 instead of a missing value.
circuit_pop = (
    all_circuits
    .set_index("circuit_na")
    .assign(pop=lambda indexed: circuit_pop.reindex(indexed.index, fill_value=0))
    .reset_index()
)

In [None]:
# Questions 
# 1. how do we incorporate businesses with pixel pop?
# 2. for the numerator (customers impacted), how will we get from circuit to zcta? we can either:
#    a. proportionally split the circuit across all the zctas it is in, and say that that % of its people are out in each zcta
#    b. probabilistically assign the outage to a single zcta, e.g. the zcta that has the most of the circuit in it OR the most customers in it, etc
#       since outages are usually in clusters, this may be better?
# 3. union of polylines - is this ok? 

# Notes
# 1. using polygons not polylines. these are polylines with 18m buffer. 
# 2. not all of our pop is covered by these circuits
    # ca pop covered by these circuits = 29399918
    # ca pop over all pixels (ca_gdf["value"].sum()) = 39538230, so ~10.1 million people are not covered by these circuits
# 3. there are circuits that cover no people
# 4. potential source of bias:
    # most zctas have 1000+ pixels covering them
    # ~300 have fewer than 1000 pixels in them
    # for now we will assume that most zctas have sufficient pixel coverage

# Next steps: 
# finish zcta incorporation
# finish precommit stuff to put this on git 

In [None]:
# tracts whose geoid starts with 06001, reprojected to the population raster's own crs
alameda_ct = ca_ct.loc[ca_ct["geoid"].str.slice(stop=5).eq("06001")].to_crs(ca_pop.crs)

In [None]:
# population raster cut down to the alameda tracts, with tract outlines drawn on top
fig, ax = plt.subplots(figsize=(10, 10))
(
    ca_pop
    .clip(alameda_ct)
    .mask(alameda_ct)
    .plot(vmax=200, ax=ax)
)
alameda_ct.boundary.plot(linewidth=0.05, color="black", ax=ax)

In [None]:
# potential source of bias:
#   most zctas have 1000+ pixels covering them
#   ~300 have less than 1000 pixels in them
#   for now we will assume that most zctas have sufficient pixel coverage
# approximate pixel count per zcta: zcta area divided by 10000
# (presumably the area of one 100 m x 100 m raster cell - confirm against the raster)
pixels_per_zcta = zcta_ca.area.div(10000)
# histogram of only the zctas with fewer than 1000 pixels
pixels_per_zcta.loc[pixels_per_zcta.lt(1000)].hist()

In [None]:
# diagnostic for circuit line bits

# future: make a diagnostics.py and put all of these in there.
def examine_multiline_circuits(circuit_name="NORTH DUBLIN-VINEYARD"):
    """Plot the line pieces of a PGE circuit and report how fragmented circuits are.

    Parameters
    ----------
    circuit_name : str or None, default "NORTH DUBLIN-VINEYARD"
        Value of ``Circuit_Na`` whose line records are plotted. Pass ``None``
        to plot every circuit that is stored as more than one line record.

    Returns
    -------
    None. Shows one plot and prints, for every circuit, the share of the
    circuit's total length held by its longest line record, sorted ascending
    (circuits split into many comparable pieces come first).
    """
    # the constant is RAW_DATA_DIR; the lower-case spelling used previously is
    # not defined anywhere in this notebook and raised NameError
    pge_polyline = gpd.read_file(RAW_DATA_DIR/ "2023.07.psps_data/ICA_circuits/PGE/PGE_circuits_lines/PGE_circuits_lines.shp")
    if circuit_name is None:
        # every record whose circuit name occurs more than once
        mask = pge_polyline["Circuit_Na"].duplicated(keep=False)
    else:
        mask = pge_polyline["Circuit_Na"] == circuit_name
    # colour by the original row label so each line record is distinguishable
    pge_polyline[mask].sort_values("Circuit_Na").reset_index().plot("index", alpha=0.5)
    plt.show()

    # NOTE(review): lengths are measured in the layer's native crs; only the
    # ratio within a circuit is used below - confirm the layer is projected.
    pge_polyline["circuit_len"] = pge_polyline.length
    circuit_total_len = pge_polyline.groupby("Circuit_Na")["circuit_len"].transform("sum")
    pge_polyline["circuit_len_prop"] = pge_polyline["circuit_len"]/circuit_total_len
    print(pge_polyline.groupby("Circuit_Na")["circuit_len_prop"].max().sort_values())

examine_multiline_circuits()