#### Creating the annotaion file

In [1]:
# Import libraries
import os
import geopandas as gpd
import pandas as pd

In [2]:
def get_folders(location):
    return [name for name in os.listdir(location) if os.path.isdir(os.path.join(location, name))]

states = get_folders('/Users/h6x/ORNL/git/modeling-ideas/overdose modeling for entire country/data/processed data/svi with hepvu/2018/SVI2018 census tracts with death rate HepVu-5 classes')

In [3]:
# loop through each state and load the shapefile and concatenate them into one dataframe

gdf = pd.DataFrame()
for state in states:
    # print(state)
    try:
        gdf = pd.concat([gdf, gpd.read_file(f'/Users/h6x/ORNL/git/modeling-ideas/overdose modeling for entire country/data/processed data/svi with hepvu/2018/SVI2018 census tracts with death rate HepVu-5 classes/{state}/{state}.shp')])
    except:
        print(f'Error reading {state}')
        continue


In [4]:
gdf.shape

(72655, 129)

In [5]:
gdf.head()

Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,...,E_UNINSUR,M_UNINSUR,EP_UNINSUR,MP_UNINSUR,E_DAYPOP,Shape_Leng,Shape_Area,NOD_Rate,percentile,geometry
0,50,VERMONT,VT,50007,Chittenden,50007980000,"Census Tract 9800, Chittenden County, Vermont",1.141485,0,9,...,0,9,-999.0,-999.0,794,0.107289,0.000337,12.5,0,"POLYGON ((-73.16975 44.48154, -73.16620 44.484..."
1,50,VERMONT,VT,50001,Addison,50001960100,"Census Tract 9601, Addison County, Vermont",81.549039,3928,202,...,214,67,5.5,1.7,1619,0.743236,0.023958,12.9,0,"POLYGON ((-73.19680 44.26663, -73.16182 44.271..."
2,50,VERMONT,VT,50001,Addison,50001960200,"Census Tract 9602, Addison County, Vermont",47.443526,2736,23,...,151,75,5.5,2.7,1993,0.650662,0.017827,12.9,0,"POLYGON ((-73.39963 44.15533, -73.39873 44.162..."
3,50,VERMONT,VT,50001,Addison,50001960300,"Census Tract 9603, Addison County, Vermont",2.476118,2612,19,...,59,34,2.3,1.3,3360,0.109137,0.000744,12.9,0,"POLYGON ((-73.27275 44.17683, -73.24845 44.179..."
4,50,VERMONT,VT,50001,Addison,50001960400,"Census Tract 9604, Addison County, Vermont",124.237386,5083,179,...,183,41,3.6,0.8,2554,0.980535,0.040436,12.9,0,"POLYGON ((-73.43774 44.04501, -73.43199 44.063..."


In [6]:
# get the unque STCNTY values and their relevent Percentile column values
A = gdf[['STCNTY', 'percentile','NOD_Rate']].drop_duplicates().sort_values(by=['STCNTY'])


In [7]:
# Calculate the min and max of the column
min_value = A['NOD_Rate'].min()
max_value = A['NOD_Rate'].max()

In [8]:
# get the number of negative values in NOD_Rate
negative_values = A[A['NOD_Rate'] < 0].shape[0]
negative_values

9

In [9]:
# make negative values to zero
A.loc[A['NOD_Rate'] < 0, 'NOD_Rate'] = 0

In [10]:
min_value

-9.0

In [11]:
# Apply the normalization formula
A['NOD'] = (A['NOD_Rate'] - min_value) / (max_value - min_value)

In [12]:
# drop the NOD_Rate column
# A = A.drop(columns=['NOD_Rate'])

##### Adding labels based on the whole country

In [13]:
A['percen_US'] = pd.qcut(A['NOD_Rate'], q=[0, 0.2, 0.4, 0.6, 0.8, 1], labels=['0', '1', '2', '3', '4'])

In [14]:
# reset the index
A = A.reset_index(drop=True)

In [15]:
# get the columns where percen_US is 4
A[A['percen_US'] == '4']

Unnamed: 0,STCNTY,percentile,NOD_Rate,NOD,percen_US
26,01053,4,31.5,0.328201,4
27,01055,4,24.7,0.273096,4
36,01073,4,27.7,0.297407,4
57,01115,4,28.3,0.302269,4
63,01127,4,24.9,0.274716,4
...,...,...,...,...,...
3074,55059,4,25.2,0.277147,4
3083,55077,4,28.5,0.303890,4
3084,55078,4,24.6,0.272285,4
3085,55079,4,37.3,0.375203,4


In [16]:
# arange this highest to lowest A[A['percen_US'] == '4']
A.sort_values(by=['NOD_Rate'], ascending=False).head(40)

Unnamed: 0,STCNTY,percentile,NOD_Rate,NOD,percen_US
2995,54011,4,114.4,1.0,4
1214,24510,4,108.8,0.954619,4
2992,54005,4,94.7,0.840357,4
3013,54047,4,85.1,0.762561,4
1814,35039,4,80.0,0.721232,4
1595,29510,4,70.9,0.647488,4
3044,54109,4,66.7,0.613452,4
3030,54081,4,65.6,0.604538,4
3009,54039,4,64.5,0.595624,4
3012,54045,4,64.1,0.592382,4


In [17]:
A.head()

Unnamed: 0,STCNTY,percentile,NOD_Rate,NOD,percen_US
0,1001,2,10.9,0.161264,1
1,1003,3,14.9,0.193679,2
2,1005,0,5.2,0.115073,0
3,1007,4,23.1,0.26013,3
4,1009,4,19.9,0.234198,3


In [18]:
#make STCNTY column int
A['STCNTY'] = A['STCNTY'].astype(str)

In [19]:
# save A as a csv file
A.to_csv('/Users/h6x/ORNL/git/modeling-ideas/overdose modeling for entire country/data/processed data/svi with hepvu/2018/annotation 2018/annotation_NOD.csv', index=False)