In [1]:
import pandas as pd

# Load the 2020 US SVI table. Cells equal to -999 are read as NaN
# (presumably the file's "no data" sentinel -- confirm against the data dictionary).
df = pd.read_csv(
    "data/SVI_2020_US.csv",
    na_values=-999,
)


The overview below is copied from the CDC site and is not to be used as is.

# CDC/ATSDR Social Vulnerability Index

Social vulnerability refers to the potential negative effects on communities caused by external stresses on human health. Such stresses include natural or human-caused disasters, or disease outbreaks. Reducing social vulnerability can decrease both human suffering and economic loss.

The CDC/ATSDR Social Vulnerability Index (CDC/ATSDR SVI) uses 16 U.S. census variables to help local officials identify communities that may need support before, during, or after disasters.
https://www.atsdr.cdc.gov/placeandhealth/svi/index.html

## Understanding the variables
The variables are prefixed with some characters to indicate the type of measurement:

    E: Estimate

    EP:  Percentage Estimate

    F:  *Unable to find a source; based on other experience with census data, this may indicate some degree of imputation*

    M: Margin of Error on Estimate (E)

    MP: Margin of Error on Percentage Estimate (EP)

    RPL: Percentile ranking of theme (SPL)

    EPL: Percentile ranking of percentage estimate (EP)

    SPL: Sum of Series (as in, sum(EPL scores))

These definitions were cobbled together from the Minority Health SVI data dictionary. That data dictionary covers two additional themes but otherwise appears to be largely the same.


I did find a more detailed data dictionary and uploaded it to the folder. It has descriptions for a lot of the intermediate calculations. Unfortunately it looks like it would be a lot of effort to recreate them at the state level. 

In [2]:

# Row-by-row check: do the five listed EPL columns add up to SPL_THEME4
# (absolute difference below 1e-5)?
(
    df[["EPL_MUNIT", "EPL_MOBILE", "EPL_CROWD", "EPL_NOVEH", "EPL_GROUPQ"]]
    .sum(axis=1)
    .sub(df["SPL_THEME4"])
    .abs()
    .lt(1e-5)
)

0        True
1        True
2        True
3        True
4        True
         ... 
84117    True
84118    True
84119    True
84120    True
84121    True
Length: 84122, dtype: bool

In [3]:
# Every column whose name contains "E_", kept in file order.
# NOTE(review): the theme groups below are picked purely by position within this
# filtered frame, so they silently shift if the input file's column order changes.
estimate_cols = df.filter(like="E_")

e_socio = estimate_cols.columns[3:8]
e_household = estimate_cols.iloc[:, 8:13]
e_housing_transp = estimate_cols.iloc[:, 14:19]
e_race = df[["E_MINRTY"]]

# Swap the leading "E_" for "EPL_" to get the matching percentile-rank column names.
epl_socio = [f"EPL_{name[2:]}" for name in e_socio]
epl_household = [f"EPL_{name[2:]}" for name in e_household]
epl_housing_transp = [f"EPL_{name[2:]}" for name in e_housing_transp]
epl_race = [f"EPL_{name[2:]}" for name in e_race]

In [4]:
# Preview the columns whose names match any of the unemployment / population / age patterns.
df.filter(regex="UNEMP|POP|E_AGE17|E_AGE65", axis="columns")

Unnamed: 0,E_TOTPOP,M_TOTPOP,E_UNEMP,M_UNEMP,E_AGE65,E_AGE17,EP_UNEMP,MP_UNEMP,EPL_UNEMP,F_UNEMP,E_DAYPOP
0,1941,390,18,18,295,415,2.1,2.1,0.1731,0.0,1033.0
1,1757,310,29,26,284,325,4.0,3.5,0.4210,0.0,4080.0
2,3694,570,53,45,464,929,2.7,2.3,0.2487,0.0,2056.0
3,3539,500,39,34,969,510,2.4,2.0,0.2096,0.0,1908.0
4,4306,662,23,31,541,1136,1.0,1.3,0.0641,0.0,3774.0
...,...,...,...,...,...,...,...,...,...,...,...
84117,3124,198,50,34,677,696,3.4,2.3,0.3415,0.0,1577.0
84118,2231,273,31,29,402,548,2.5,2.2,0.2223,0.0,2603.0
84119,2578,314,89,67,637,644,7.0,5.4,0.7176,0.0,3110.0
84120,3276,280,42,39,710,552,3.2,2.8,0.3147,0.0,1519.0


In [5]:
# Share of rows where the EPL components of a theme add up to the published
# SPL_THEME total, within 1e-4.
#
# The difference is taken in absolute value: without it, a component sum that is
# far *below* the theme total (a large negative difference) would also satisfy
# "< 1e-4" and be counted as a match. Rows whose difference is NaN compare as
# False and therefore count as mismatches.
for epl_cols, theme_total in (
    (epl_socio, "SPL_THEME1"),
    (epl_household, "SPL_THEME2"),
    (epl_housing_transp, "SPL_THEME4"),
):
    gap = (df[epl_cols].sum(axis=1) - df[theme_total]).abs()
    display((gap < 1e-4).mean())


0.9915836523144956

0.9920591521837332

0.9908347400204465

In [6]:
# Rename map for the household-characteristics columns: each estimate (E_) and
# margin-of-error (M_) column gets an "HH_CHARAC_" prefix. Keys are generated in
# E/M pairs per variable so the ordering is E_AGE65, M_AGE65, E_AGE17, ...
household_characteristics = {
    f"{measure}_{variable}": f"HH_CHARAC_{measure}_{variable}"
    for variable in ("AGE65", "AGE17", "DISABL", "SNGPNT", "LIMENG")
    for measure in ("E", "M")
}
household_characteristics


{'E_AGE65': 'HH_CHARAC_E_AGE65',
 'M_AGE65': 'HH_CHARAC_M_AGE65',
 'E_AGE17': 'HH_CHARAC_E_AGE17',
 'M_AGE17': 'HH_CHARAC_M_AGE17',
 'E_DISABL': 'HH_CHARAC_E_DISABL',
 'M_DISABL': 'HH_CHARAC_M_DISABL',
 'E_SNGPNT': 'HH_CHARAC_E_SNGPNT',
 'M_SNGPNT': 'HH_CHARAC_M_SNGPNT',
 'E_LIMENG': 'HH_CHARAC_E_LIMENG',
 'M_LIMENG': 'HH_CHARAC_M_LIMENG'}

In [7]:
# Population density per state: total population divided by total area,
# both summed over all rows of the state.
state_density = (
    df.groupby(["ST", "STATE"])[["AREA_SQMI", "E_TOTPOP"]]
    .sum()
    .assign(POP_PER_SQ_MILE=lambda totals: totals["E_TOTPOP"] / totals["AREA_SQMI"])
    [["POP_PER_SQ_MILE"]]
)



In [8]:
# Per-row numerators for a population-weighted average: each SPL_THEME<digit>
# score multiplied by the row's total population. ST/STATE/E_TOTPOP are carried
# along for the later group-by and normalisation.
weighted = df[["ST", "STATE", "E_TOTPOP"]].assign(
    **{
        theme: df[theme] * df["E_TOTPOP"]
        for theme in df.filter(regex=r"SPL_THEME\d").columns
    }
)


In [9]:
# Collapse to one row per state: sums of population and of the score*population products.
weighted = weighted.groupby(by=["ST", "STATE"]).agg("sum")


In [10]:
# Inspect the SPL columns for rows whose STATE contains "Arizona".
is_arizona = df["STATE"].str.contains("Arizona")
df.loc[is_arizona].filter(like="SPL")

Unnamed: 0,SPL_THEME1,SPL_THEME2,SPL_THEME3,SPL_THEME4,SPL_THEMES
1613,3.7020,2.7166,0.9959,2.7722,10.1867
1614,3.8904,3.2939,0.9862,3.2422,11.4127
1615,3.6007,2.9756,0.9713,3.0570,10.6046
1616,3.8883,3.1765,0.9604,3.7195,11.7447
1617,4.0139,3.3581,0.9575,3.6083,11.9378
...,...,...,...,...,...
3373,2.3575,3.2413,0.7209,2.9736,9.2933
3374,,,,,
3375,,,,,
3376,,,0.9734,,


In [11]:
# Turn the per-state sums of (score * population) into population-weighted means.
#
# The denominator is computed per theme and only counts the population of rows
# that actually have a score for that theme. Rows with a NaN score contribute
# nothing to the summed numerator (NaN is skipped by sum), so dividing by the
# state's *total* population would bias the mean downward whenever such rows
# have population.
#
# NOTE: this overwrites the SPL columns of `weighted` in place, so run it once per
# fresh `weighted`; re-running the cell would divide a second time.
state_keys = [df["ST"], df["STATE"]]
for col in weighted.filter(like="SPL"):
    scored_pop = df["E_TOTPOP"].where(df[col].notna()).groupby(state_keys).sum()
    weighted[col] = weighted[col] / scored_pop


In [12]:

# Final per-state table: theme columns under readable names, plus population density.
theme_labels = {
    "SPL_THEME1": "SOCIO",
    "SPL_THEME2": "HOUSEHOLD",
    "SPL_THEME3": "RACE",
    "SPL_THEME4": "HOUSING",
}

out = (
    weighted
    .drop(columns="E_TOTPOP")
    .rename(columns=theme_labels)
    # Aligned on the (ST, STATE) index, same as a column assignment.
    .assign(POP_PER_SQ_MILE=state_density["POP_PER_SQ_MILE"])
)

In [13]:
# Replace each column's values with their percentile rank across states,
# then order the states by the SOCIO percentile.
state_percentiles = out.rank(pct=True)
ranked = state_percentiles.sort_values(by="SOCIO")



In [14]:
# Persist the per-state percentile ranks.
ranked.to_csv(path_or_buf="data/ranked_svi_factors.csv")

In [15]:
# Persist the per-state population-weighted theme values and density.
out.to_csv(path_or_buf="data/weighted_average_svi_factors.csv")

In [16]:
# Plain (unweighted) per-state mean of each SPL_THEME<digit> column, for
# comparison with the population-weighted version.
# The regex keeps the two grouping keys (a name ending in "ST", or containing
# "STATE") together with the numbered theme columns.
theme_scores = df.filter(regex=r"ST$|STATE|(SPL_THEME\d)")
unweighted = theme_scores.groupby(["ST", "STATE"]).mean()

unweighted.to_csv("data/unweighted_average_svi_factors.csv")

