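#### SVI + mortality data

Join the 2018 county-level SVI variables with the 2018 narcotic overdose mortality rate (HepVu) and export the result as a shapefile.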

In [1]:
import os

import geopandas as gpd
import pandas as pd

In [2]:
# Root folder of the raw inputs. Defaults to the original author's checkout;
# set the SVI_RAW_DATA_DIR environment variable to run the notebook elsewhere.
RAW_DATA_DIR = os.environ.get(
    'SVI_RAW_DATA_DIR',
    '/home/h6x/git_projects/ornl-svi-data-processing/raw_data',
)

# County-level opioid indicators workbook (HepVu export).
mortality = pd.read_excel(
    os.path.join(RAW_DATA_DIR, 'HepVu_County_Opioid_Indicators_05DEC22.xlsx')
)
# 2018 county-level SVI file geodatabase, read together with its geometries.
svi = gpd.read_file(
    os.path.join(RAW_DATA_DIR, 'svi', '2018', 'SVI2018_US_county.gdb')
)

In [3]:
def preprocess_overdose_data(overdose_df):
    """Return a copy of the overdose table with 'GEO ID' as a 5-character string.

    The 'GEO ID' column is converted to text and left-padded with zeros to a
    width of five, so that e.g. 1001 becomes '01001'. Values that arrive as
    whole-number floats (1001.0) have the trailing '.0' removed first;
    otherwise they would be rendered as '1001.0' and never be padded.

    Args:
        overdose_df: DataFrame containing a 'GEO ID' column.

    Returns:
        A new DataFrame with the same columns and rows; only 'GEO ID' is
        changed. The input frame is left unmodified.

    Raises:
        KeyError: if 'GEO ID' is not a column of ``overdose_df``.
    """
    geo_id = overdose_df['GEO ID'].astype(str)
    # Drop the float artifact left behind when the column was read as float.
    geo_id = geo_id.str.replace(r'\.0$', '', regex=True)

    # Work on a copy so re-running a cell never sees a half-processed input.
    result = overdose_df.copy()
    result['GEO ID'] = geo_id.str.zfill(5)
    return result

In [4]:
def preprocess_svi_data(us_svi, raw_variables):
    """Drop SVI rows that hold -999 in any of the requested variables.

    A row is removed when at least one column named in ``raw_variables``
    equals -999 (the value this notebook treats as invalid). No rescaling or
    normalisation is applied; the remaining values are returned unchanged.

    Args:
        us_svi: DataFrame (or GeoDataFrame) of SVI records.
        raw_variables: iterable of column names to check.

    Returns:
        The subset of ``us_svi`` whose checked columns contain no -999.
        The original index labels are kept (the index is not reset).

    Raises:
        KeyError: if a name in ``raw_variables`` is not a column of ``us_svi``.
    """
    # list(...) so that a tuple of names is not mistaken for a single key.
    checked = us_svi[list(raw_variables)]
    # One combined mask instead of re-filtering the frame once per column.
    keep = checked.ne(-999.00).all(axis=1)
    return us_svi[keep]

In [5]:
# SVI columns that are screened for the -999 marker and carried into the
# exported file. The order here is the column order of the output.
raw_variables = [
    'EP_POV',
    'EP_UNEMP',
    'EP_PCI',
    'EP_NOHSDP',
    'EP_UNINSUR',
    'EP_AGE65',
    'EP_AGE17',
    'EP_DISABL',
    'EP_SNGPNT',
    'EP_LIMENG',
    'EP_MINRTY',
    'EP_MUNIT',
    'EP_MOBILE',
    'EP_CROWD',
    'EP_NOVEH',
    'EP_GROUPQ',
]

In [6]:
# Zero-pad the mortality GEO IDs to five characters, and drop SVI rows that
# hold -999 in any of the columns listed in raw_variables.
mortality = preprocess_overdose_data(mortality)
svi = preprocess_svi_data(svi, raw_variables)

In [7]:
# Keep only the join key and the 2018 overdose mortality rate.
mortality = mortality.loc[:, ['GEO ID', 'Narcotic Overdose Mortality Rate 2018']]

In [8]:
# Preview the SVI table after the -999 rows were removed.
svi.head(2)

Unnamed: 0,ST,STATE,ST_ABBR,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,E_HU,...,F_THEME4,F_TOTAL,E_UNINSUR,M_UNINSUR,EP_UNINSUR,MP_UNINSUR,E_DAYPOP,Shape_Length,Shape_Area,geometry
1,1,ALABAMA,AL,Autauga,1001,"Autauga County, Alabama",594.443459,55200,0,23315,...,0,0,3875,508,7.1,0.9,37301,2.05274,0.150256,"MULTIPOLYGON (((-86.92120 32.65754, -86.92035 ..."
2,1,ALABAMA,AL,Blount,1009,"Blount County, Alabama",644.83046,57645,0,24222,...,0,0,6303,732,11.0,1.3,40036,2.392326,0.164403,"MULTIPOLYGON (((-86.96336 33.85822, -86.95967 ..."


In [9]:
# Preview the trimmed mortality table (join key + rate).
mortality.head(2)

Unnamed: 0,GEO ID,Narcotic Overdose Mortality Rate 2018
0,1001,10.9
1,1003,14.9


In [10]:
# Attach the mortality rate to every SVI county. It is a left join, so
# counties without a mortality record are kept with a missing rate.
# validate= makes pandas raise if a GEO ID occurs more than once in
# `mortality`, which would otherwise silently duplicate county rows and
# their geometries.
df = svi.merge(
    mortality,
    left_on='FIPS',
    right_on='GEO ID',
    how='left',
    validate='many_to_one',
)

In [11]:
# Row and column count of the joined table.
df.shape

(3141, 128)

In [12]:
# Spot-check that the mortality columns were appended to the SVI rows.
df.head(3)

Unnamed: 0,ST,STATE,ST_ABBR,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,E_HU,...,E_UNINSUR,M_UNINSUR,EP_UNINSUR,MP_UNINSUR,E_DAYPOP,Shape_Length,Shape_Area,geometry,GEO ID,Narcotic Overdose Mortality Rate 2018
0,1,ALABAMA,AL,Autauga,1001,"Autauga County, Alabama",594.443459,55200,0,23315,...,3875,508,7.1,0.9,37301,2.05274,0.150256,"MULTIPOLYGON (((-86.92120 32.65754, -86.92035 ...",1001,10.9
1,1,ALABAMA,AL,Blount,1009,"Blount County, Alabama",644.83046,57645,0,24222,...,6303,732,11.0,1.3,40036,2.392326,0.164403,"MULTIPOLYGON (((-86.96336 33.85822, -86.95967 ...",1009,19.9
2,1,ALABAMA,AL,Butler,1013,"Butler County, Alabama",776.838201,20025,0,10026,...,2005,340,10.2,1.7,17280,1.818327,0.191747,"MULTIPOLYGON (((-86.90894 31.96167, -86.87498 ...",1013,8.8


In [13]:
# SVI variables first, then the outcome, the geometry and the identifying columns.
all_selected_columns = [
    *raw_variables,
    'Narcotic Overdose Mortality Rate 2018',
    'geometry',
    'ST',
    'FIPS',
    'ST_ABBR',
    'E_TOTPOP',
]

In [14]:
# Keep only the columns listed in all_selected_columns, in that order.
df = df[all_selected_columns]

In [15]:
# Shorten "Narcotic Overdose Mortality Rate 2018" to "NOD".
# NOTE(review): presumably needed because shapefile field names are limited
# to 10 characters — confirm.
df = df.rename(columns={"Narcotic Overdose Mortality Rate 2018": "NOD"})

In [16]:
# Preview the final column set before export.
df.head(3)

Unnamed: 0,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_SNGPNT,EP_LIMENG,...,EP_MOBILE,EP_CROWD,EP_NOVEH,EP_GROUPQ,NOD,geometry,ST,FIPS,ST_ABBR,E_TOTPOP
0,15.4,4.2,29372.0,11.3,7.1,14.6,24.2,19.3,7.5,0.8,...,18.4,1.4,5.6,1.0,10.9,"MULTIPOLYGON (((-86.92120 32.65754, -86.92035 ...",1,1001,AL,55200
1,14.4,4.1,22656.0,19.8,11.0,17.8,23.4,14.2,7.0,1.7,...,25.2,1.6,4.2,0.9,19.9,"MULTIPOLYGON (((-86.96336 33.85822, -86.95967 ...",1,1009,AL,57645
2,23.5,6.7,20430.0,15.4,10.2,19.0,22.8,17.7,10.5,0.5,...,26.2,1.8,7.8,1.6,8.8,"MULTIPOLYGON (((-86.90894 31.96167, -86.87498 ...",1,1013,AL,20025


In [17]:
# Wrap the selected columns as a GeoDataFrame, using the 'geometry' column
# as its geometry.
gdf = gpd.GeoDataFrame(df, geometry='geometry')

In [18]:
# Destination shapefile. Defaults to the original author's location; set the
# SVI_MORTALITY_OUTPUT_PATH environment variable to write somewhere else.
output_path = os.environ.get(
    'SVI_MORTALITY_OUTPUT_PATH',
    '/home/h6x/git_projects/universal-experiment-lab/experiment_2_bym_modeling/bym_svi_opioid_data/county_svi_county_mortality/svi_mortality_2018.shp',
)

In [19]:
# Create the destination folder first so the export does not fail on a
# missing directory, then write the shapefile.
# `or '.'` covers a bare file name, whose dirname is the empty string.
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
gdf.to_file(output_path, driver='ESRI Shapefile')