# Tuning G* Calculations

Things to try:

- Using percentiles instead of aggregated burden score
- Using raw values instead of aggregated burden score
- Using a different weight that is distance based rather than border based?

In [19]:
# Load packages 
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns         
import numpy as np
import os
import libpysal as lps 
from libpysal.weights import W 
from esda.getisord import G_Local
from functions import state_gstar
import pprint

# Lift the column display limit so wide DataFrames are shown in full
pd.options.display.max_columns = None

In [2]:
# Import data
# Every input below is read from <base_dir>/data; an alternate location is kept commented out
base_dir = "/capstone/justice40"
# base_dir = "~/MEDS/justice40/data-exploration"

# Complete 2.0 USA file
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, avoiding the mixed-type DtypeWarning on load
usa_v2 = pd.read_csv(os.path.join(base_dir, "data", "usa_v2.csv"), low_memory=False)

# 2.0 communities files (from current CEJST website)
comm_v2 = pd.read_csv(os.path.join(base_dir, "data", "2.0-communities.csv"), low_memory=False)

# Version 2.0 shapefile data
v2 = pd.read_csv(os.path.join(base_dir, "data", "2.0-shapefile-codebook", "2.0-codebook.csv"))
v2_geo = gpd.read_file(os.path.join(base_dir, "data", "2.0-shapefile-codebook", "usa", "usa.shp"))

  usa_v2 = pd.read_csv(os.path.join(base_dir, "data", "usa_v2.csv"))
  comm_v2 = pd.read_csv(os.path.join(base_dir, "data", "2.0-communities.csv"))


In [4]:
# Keep only the columns whose dtype is not bool, then report the resulting (rows, columns)
usa_v2 = usa_v2.select_dtypes(exclude='bool')
usa_v2.shape

(74134, 179)

In [5]:
# Keep only the columns whose dtype is not bool, then report the resulting (rows, columns)
comm_v2 = comm_v2.select_dtypes(exclude='bool')
comm_v2.shape

(74134, 93)

In [16]:
# my_list = usa_v2.columns.to_list()
# pprint.pprint(my_list, compact=False) 

### Definitions
- Burden: One of the eight indicator categories. Also referred to as categories. 
    - Climate change
        - expected agriculture loss rate 
        - expected building loss rate 
        - expected population loss rate 
        - projected flood risk 
        - projected wildfire risk
    - Energy
        - energy cost 
        - PM2.5 in the air
    - Housing
        - historic underinvestment 
        - housing cost 
        - lack of green space 
        - lack of indoor plumbing 
        - lead paint
    - Health
        - asthma 
        - diabetes 
        - heart disease 
        - low life expectancy
    - Legacy pollution
        - abandoned mine land
        - Formerly Used Defense Sites 
        - proximity to hazardous waste facilities 
        - proximity to Superfund sites (National Priorities List (NPL)) 
        - proximity to Risk Management Plan (RMP) facilities
    - Transportation
        - diesel particulate matter exposure
        - transportation barriers 
        - traffic proximity and volume
    - Waste and Wastewater
        - underground storage tanks and releases
        - wastewater discharge
    - Workforce development
        - linguistic isolation
        - low median income 
        - poverty 
        - unemployment
- Category: Another way CEJST refers to the eight burdens.
- Criteria: The sub-categories that make up each of the eight burdens.

In [22]:
# Climate change criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
cc_columns = {
    'Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)': 'ag_loss',
    'Expected building loss rate (Natural Hazards Risk Index) (percentile)': 'building_loss',
    'Expected population loss rate (Natural Hazards Risk Index) (percentile)': 'population_loss',
    'Share of properties at risk of flood in 30 years (percentile)': 'flood_risk',
    'Share of properties at risk of fire in 30 years (percentile)': 'fire_risk',
}

# Pull the mapped columns (in the order listed) and apply the short names
cc = usa_v2[list(cc_columns)].rename(columns=cc_columns)
cc.head()

Unnamed: 0,ag_loss,building_loss,population_loss,flood_risk,fire_risk
0,0.215315,0.784638,0.614474,0.497477,0.841147
1,0.086538,0.916615,0.559042,0.885039,0.336647
2,0.083697,0.648245,0.634793,0.847378,0.336647
3,0.072682,0.900051,0.553442,0.826693,0.336647
4,,0.951725,0.55544,0.808244,0.336647


In [21]:
# Energy criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
energy_columns = {
    'Energy burden (percentile)': 'energy_burden',
    'PM2.5 in the air (percentile)': 'pm_25',
}

# Pull the mapped columns (in the order listed) and apply the short names
energy = usa_v2[list(energy_columns)].rename(columns=energy_columns)
energy.head()


Unnamed: 0,energy_burden,pm_25
0,0.86499,0.825669
1,0.970847,0.839065
2,0.932047,0.810017
3,0.977915,0.830914
4,0.989683,0.840601


In [24]:
# Housing criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
# NOTE(review): the Definitions list also names "historic underinvestment" and
# "lack of green space" under Housing; no columns for them are selected here — confirm intent.
housing_columns = {
    'Housing burden (percent) (percentile)': 'housing_burden',
    'Share of homes with no kitchen or indoor plumbing (percent) (percentile)': 'no_plumbing',
    'Percent pre-1960s housing (lead paint indicator) (percentile)': 'lead_paint',
}

# Pull the mapped columns (in the order listed) and apply the short names
housing = usa_v2[list(housing_columns)].rename(columns=housing_columns)
housing.head()

Unnamed: 0,housing_burden,no_plumbing,lead_paint
0,0.646637,0.215989,0.397913
1,0.342058,0.505139,0.846261
2,0.89959,0.591729,0.827268
3,0.875027,0.943543,0.83227
4,0.928449,0.673872,0.671988


In [25]:
# Health criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
health_columns = {
    'Current asthma among adults aged greater than or equal to 18 years (percentile)': 'asthma',
    'Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)': 'diabetes',
    'Coronary heart disease among adults aged greater than or equal to 18 years (percentile)': 'heart_disease',
    'Low life expectancy (percentile)': 'low_life_expectancy',
}

# Pull the mapped columns (in the order listed) and apply the short names
health = usa_v2[list(health_columns)].rename(columns=health_columns)
health.head()

Unnamed: 0,asthma,diabetes,heart_disease,low_life_expectancy
0,0.850988,0.965376,0.72648,0.970729
1,0.838593,0.987805,0.9218,0.960289
2,0.973547,0.989803,0.94228,0.954734
3,0.919545,0.957298,0.626278,0.866511
4,0.963833,0.996366,0.961202,0.990238


In [26]:
# Legacy pollution criteria: source column -> short name.
# The first two entries are "Is there at least one ...?" columns rather than percentile columns.
# NOTE(review): 'rmp_facilites' is misspelled (facilities); the key is left unchanged here
# because cells outside this view may already reference it — rename everywhere at once if fixing.
lp_columns = {
    'Is there at least one abandoned mine in this census tract?': 'abandoned_mines',
    'Is there at least one Formerly Used Defense Site (FUDS) in the tract?': 'defense_site',
    'Proximity to hazardous waste sites (percentile)': 'hazardous_waste',
    'Proximity to NPL sites (percentile)': 'npl_sites',
    'Proximity to Risk Management Plan (RMP) facilities (percentile)': 'rmp_facilites',
}

# Pull the mapped columns (in the order listed) and apply the short names
lp = usa_v2[list(lp_columns)].rename(columns=lp_columns)
lp.head()

Unnamed: 0,abandoned_mines,defense_site,hazardous_waste,npl_sites,rmp_facilites
0,,,0.48194,0.753109,0.900738
1,,,0.800192,0.865335,0.879948
2,,,0.736861,0.753366,0.37888
3,,,0.68533,0.606602,0.745918
4,,,0.731102,0.716462,0.86086


In [29]:
# Transportation criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
transport_columns = {
    'Diesel particulate matter exposure (percentile)': 'diesel_pm',
    'DOT Travel Barriers Score (percentile)': 'travel_barriers',
    'Traffic proximity and volume (percentile)': 'traffic_proximity',
}

# Pull the mapped columns (in the order listed) and apply the short names
transport = usa_v2[list(transport_columns)].rename(columns=transport_columns)
transport.head()

Unnamed: 0,diesel_pm,travel_barriers,traffic_proximity
0,0.840725,0.45332,0.391992
1,0.938048,0.37279,0.956325
2,0.765093,0.525417,0.617938
3,0.799755,0.751179,0.497279
4,0.866544,0.338863,0.490575


In [30]:
# Waste and wastewater criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
ww_columns = {
    'Leaky underground storage tanks (percentile)': 'leaky_storage_tanks',
    'Wastewater discharge (percentile)': 'wastewater_discharge',
}

# Pull the mapped columns (in the order listed) and apply the short names
ww = usa_v2[list(ww_columns)].rename(columns=ww_columns)
ww.head()

Unnamed: 0,leaky_storage_tanks,wastewater_discharge
0,0.52102,0.961966
1,0.641735,0.843141
2,0.817535,0.831253
3,0.850627,0.163562
4,0.667514,0.191097


In [32]:
# Workforce development criteria: source percentile column -> short name.
# The dict drives both the selection and the rename, so each long name is written once.
wd_columns = {
    'Linguistic isolation (percent) (percentile)': 'ling_isolation',
    'Low median household income as a percent of area median income (percentile)': 'low_income',
    'Poverty (Less than 200% of federal poverty line) (percentile)': 'poverty',
    'Unemployment (percent) (percentile)': 'unemployment',
}

# Pull the mapped columns (in the order listed) and apply the short names
wd = usa_v2[list(wd_columns)].rename(columns=wd_columns)
wd.head()

Unnamed: 0,ling_isolation,low_income,poverty,unemployment
0,0.128771,0.821814,0.63476,0.029797
1,0.586708,0.836427,0.791676,0.90674
2,0.541384,0.938839,0.96903,0.941376
3,0.128771,0.794525,0.840408,0.877831
4,0.714412,0.985994,0.961596,0.984514
