In [1]:
!pip install geopandas
!pip install pymysql



In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import math
import zipfile
import geopandas as gpd
import sqlite3
from pandas.io import sql
# import pymysql
from sqlalchemy import create_engine

# Compilation of all code for indicators

This notebook first finds these values for each indicator:
* Census Tract ID (FIPS)
* Indicator Value 
* Indicator Confidence Interval (if it exists, indicates whether it’s 90% or 95%)
* Percentile rank 

Then, we compile these values for each indicator into one final table with all the tracts, overall scores, category scores (i.e. exposure, environmental effects, etc.), indicator scores, and the number of indicators that had null values when generating the overall score. We currently have calculated this final table for Colorado only. 

#### Our Indicators

Environmental Effects Indicators:
1. [Lead exposure:](#lead) __THIS IS UNFINISHED - still figuring how exclude erroneous estimates__
2. [Oil](#oil)
3. [Proximity to Risk Management Plan Sites (PRMP)](#from_ejscreen)
4. [Proximity to Treatment Storage and Disposal Facilities (PTSDF)](#from_ejscreen)
5. [Proximity to National Priorities List Sites (PNPL)](#from_ejscreen)
6. [Wastewater Discharge Indicator (PWDIS)](#from_ejscreen)

Exposure Indicators:
7. [Traffic Proximity and Volume (PTRAF)](#from_ejscreen)
8. [Ozone](#ozone)
9. [Particulate Matter (PM 2.5)](#pm25)
10. [Diesel](#diesel)
11. [Air Toxics](#air_toxics)
12. [(Cancer Risk)* ](#cancer_risk)

Socioeconomic Factor Indicators
13. [Housing Burden](#housingburden)
14. [Educational Attainment](#edu_attain)
15. [Linguistic isolation](#lin)
16. [Unemployment](#unemploy)
17. [Race](#race)
18. [Poverty](#poverty)

Sensitive Population Indicators
19. [Asthma**](#asthma)
20. [Cardiovascular disease**](#hd)
21. [Low birth weight infants**](#lbw)

[Final score calculation](#final)

__Things to keep in mind:__ 

* Have not yet done the exclusion of unreliable data for all indicators using margin of error: lead
* PM 2.5 and Ozone - values were given in block groups. Data appears to be the same for all block groups within a tract. Converted to census tract by dropping all but one block, and using its value for the tract. Need to check that this works. 
* *Cancer risk from EJScreen - this is calculated and ready to add, but is not included in the final data export.
* **Only done for Colorado data for now
* Much of commented out code is for reference for the sources of saved csv's

#### Establishing Database connection

In [2]:
conn = sqlite3.connect("MEJ.db")

## 1. Lead Exposure <a id='lead'></a>

For this indicator, we use methodology from Washington Environmental Health Disparities Map (description of Lead Exposure methodology https://fortress.wa.gov/doh/wtn/WTNPortal#!q0=722):

"This indicator reflects the number and percent of housing units built before 1980, including single homes and multiple residence units such as apartments. The age of a home is a marker of risk for presence of lead paint because paint typically contained high levels of lead in the decades leading up to 1980. In the early 1970s the paint industry issued voluntary standards limiting lead content in paint, and in 1978 lead was banned from use in the manufacture of residential paint."

Using the most recent 5-year ACS (2018), we use the census API to get census-tract estimates of number of houses in the tract and the percent of houses built before 1940, 1940-1959, and 1960-1979. 

Different housing “vintages” have different odds of containing lead paint so we then weight these proportions with the following weights: 1940 = 0.68; 1940-1959= 0.43; 1960-1979= 0.08. Then, we sum the weights for each census tract, and rank tracts according to their score.  

In [3]:
all_states_lead = pd.DataFrame(columns=['NAME','state','county','tract','B25034_001E','B25034_001M','B25034_011E','B25034_011M','B25034_010E','B25034_010M','B25034_009E','B25034_009M','B25034_008E','B25034_008M','B25034_007E','B25034_007M'])

In [7]:
# # Use census api to retrieve data

# date = "2018"
# dataset = '/acs/acs5'
# base_url = "https://api.census.gov/data"
# variables = "NAME,B25034_001E,B25034_001M,B25034_011E,B25034_011M,B25034_010E,B25034_010M,B25034_009E,B25034_009M,B25034_008E,B25034_008M,B25034_007E,B25034_007M"

# # Get all FIPS state codes in strings
# # 01 - 56 (FIPS: https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code)
# state_codes = list(np.arange(1, 56))
# states = list(map(str, np.arange(1, 56)))
# states = list(map(lambda x: str.zfill(x, 2), states))
# # remove reserved codes that are empty :( (ex. American Samoa, Guam, etc.) >> where is this data 
# states.remove('03')
# states.remove('07')
# states.remove('14')
# states.remove('43')
# states.remove('52')

# for state in states:
#     query = base_url+"/"+date+dataset+"?get="+variables+'&for='+'tract:*&in=state:'+state

#     state_df = pd.read_json(query, dtype = True)
#     state_df.columns = state_df.iloc[0]
#     state_df = state_df.drop(state_df.index[0])
    
#     # Concat all data into one table
#     all_states_lead = pd.concat([all_states_lead, state_df[['NAME',
#                                                             'state',
#                                                             'county',
#                                                             'tract',
#                                                             'B25034_001E',
#                                                             'B25034_001M',
#                                                             'B25034_011E',
#                                                             'B25034_011M',
#                                                             'B25034_010E',
#                                                             'B25034_010M',
#                                                             'B25034_009E',
#                                                             'B25034_009M',
#                                                             'B25034_008E',
#                                                             'B25034_008M',
#                                                             'B25034_007E',
#                                                             'B25034_007M']]], sort = 'True')


# #Save dataframe to csv 
# all_states_lead.to_csv('all_states_lead.csv')

In [9]:
all_states_lead = pd.read_csv('data/all_states_lead.csv', dtype={'tract': object, 'state': object, 'county': object})
all_states_lead['FIPS'] = all_states_lead['state']+all_states_lead['county']+all_states_lead['tract']

all_states_lead = all_states_lead.rename(columns={
                                    'B25034_001E':'total_houses',
                                    'B25034_001M':'total_houses_M',
                                    "B25034_011E": "PRE_1940", # Estimated total # of houses built pre 1940
                                    "B25034_011M": "PRE_1940_M", # M for margin of error
                                    "B25034_010E": "1940-1949", 
                                    "B25034_010M": "1940-1949_M",
                                    "B25034_009E": "1950-1959",
                                    "B25034_009M": "1950-1959_M",
                                    "B25034_008E": "1960-1969",
                                    "B25034_008M": "1960-1969_M",
                                    "B25034_007E": "1970-1979",
                                    "B25034_007M": "1970-1979_M"
                                 })


# Make sure all are ints, not strings 
all_states_lead['PRE_1940'] = all_states_lead['PRE_1940'].astype(int)
all_states_lead['1940-1949'] = all_states_lead['1940-1949'].astype(int)
all_states_lead['1950-1959'] = all_states_lead['1950-1959'].astype(int)
all_states_lead['1960-1969'] = all_states_lead['1960-1969'].astype(int)
all_states_lead['1970-1979'] = all_states_lead['1970-1979'].astype(int)

# Condense into ranges pre-1940, 1940-1959, 1960-1979
all_states_lead['1940-1959'] = all_states_lead['1940-1949'] + all_states_lead['1950-1959']
all_states_lead['1960-1979'] = all_states_lead['1960-1969'] + all_states_lead['1970-1979']

# Construct FIPS Code to tract level
all_states_lead['FIPS_tract_id'] = all_states_lead['state']+all_states_lead['county']+all_states_lead['tract']

# Weight each range correspondingly 
all_states_lead['lead_score'] = ((0.68 * all_states_lead['PRE_1940']) +
                                 (0.43 * all_states_lead['1940-1959']) +
                                 (0.08 * all_states_lead['1960-1979']))/all_states_lead['total_houses']

In [10]:
all_states_lead.head()

Unnamed: 0.1,Unnamed: 0,total_houses,total_houses_M,1970-1979,1970-1979_M,1960-1969,1960-1969_M,1950-1959,1950-1959_M,1940-1949,...,PRE_1940_M,NAME,county,state,tract,FIPS,1940-1959,1960-1979,FIPS_tract_id,lead_score
0,1,1327,63,216,92,301,121,316,115,277,...,86,"Census Tract 57.01, Jefferson County, Alabama",73,1,5701,1073005701,593,517,1073005701,0.314024
1,2,1498,83,380,100,203,72,402,93,92,...,47,"Census Tract 107.04, Jefferson County, Alabama",73,1,10704,1073010704,494,583,1073010704,0.201535
2,3,2182,161,564,159,745,214,46,43,36,...,17,"Census Tract 129.08, Jefferson County, Alabama",73,1,12908,1073012908,82,1309,1073012908,0.064152
3,4,1656,81,331,127,388,123,657,141,74,...,46,"Census Tract 53.02, Jefferson County, Alabama",73,1,5302,1073005302,731,719,1073005302,0.250417
4,5,4415,205,606,155,62,58,135,96,26,...,31,"Census Tract 111.07, Jefferson County, Alabama",73,1,11107,1073011107,161,668,1073011107,0.032097


In [11]:
# Eliminate unreliable estimates - this is incomplete, unsure of how to treat MOE's
# after multiplying thm by weights - if 68% of pre 1940 homes have lead, is the MOE pre_1940M * .68?

def standard_error_prop_sq(x, y, error_x, error_y): 
    #this was missing the square root, 
    #but creating a function to do this was smart
    test = error_x**2 - ((x**2)/(y**2)*((error_y)**2))
    se = np.where(test < 0,
                   (1/y) * np.sqrt(error_x**2 + ((x**2)/(y**2)*((error_y)**2))), 
                   (1/y) * np.sqrt(error_x**2 - ((x**2)/(y**2)*((error_y)**2))))
    return se

#calculating overall MOE of estimate number of houses with lead
all_states_lead['numerator_moe'] = np.sqrt(all_states_lead['PRE_1940_M']**2 +
                               all_states_lead['1940-1949_M']**2 + 
                               all_states_lead['1950-1959_M']**2 +
                               all_states_lead['1960-1969_M']**2 +
                               all_states_lead['1970-1979_M']**2)

#calculating standard error of a proportion
all_states_lead['se'] = standard_error_prop_sq(all_states_lead['lead_score'],
                                                           all_states_lead['total_houses'],
                                                          all_states_lead['numerator_moe'],
                                                          all_states_lead['total_houses_M'])


all_states_lead['rse'] = all_states_lead['se']/all_states_lead['lead_score']*100

#have to calculate SE for each state

all_states_lead['stateSE'] = np.zeros_like(len(all_states_lead['state']))
for i in all_states_lead['state'].unique():
    subset = all_states_lead[data.state == i]
    meanSE = np.mean(subset['se'])
    all_states_lead['stateSE'].loc[all_states_lead['state'] == i] = meanSE
    

all_states_lead.loc[(all_states_lead['rse'] >= 50) & (all_states_lead['rse'] >= all_states_lead['stateSE']),'prop'] = np.nan

data['pct_rank'] = np.zeros_like(len(data['st']))

data['Hous_Rank'] = data[['prop','st']].groupby('st')\
                                            .rank(method = 'average', na_option='keep', pct=True)*100

#Calculating the overall standard error to eliminate unreliable estimates

NameError: name 'data' is not defined

In [9]:
# Calculate percentiles for each state, and create final table with only FIPS tract id, score, and percentile ranking 

all_states_lead['lead_rank'] = all_states_lead[['lead_score','state']].groupby('state')\
                                            .rank(method='average', na_option='keep', pct=True)*100

# Keep columns for NAME and state for use in the final table
all_states_lead = all_states_lead[['NAME', 'state', 'FIPS_tract_id', 'lead_score', 'lead_rank']]

KeyError: "['lead_score'] not in index"

## 2. Oil <a id='oil'></a>

Using Colorado oil and gas data at https://cogcc.state.co.us/data2.html#/downloads to calculate scores based on 

* the number of [wells](https://cogcc.state.co.us/documents/data/downloads/gis/metadata/Oil_and_Gas_Locations_Metadata.html) (under Oil & Gas Locations (3.7 Mb)) , [pits](https://cogcc.state.co.us/documents/data/downloads/gis/metadata/Pits_Metadata.html) (under Pits (1 Mb)), and [tank batteries](https://cogcc.state.co.us/documents/data/downloads/gis/metadata/Tank_Batteries_Metadata.html) (under Tank Batteries (87 Kb)) within a buffer distance of a populated block
* the [state](https://cogcc.state.co.us/documents/about/COGIS_Help/Status_Codes.pdf) of the well (ex. not active anymore, just being drilled, etc. where the older the well is the less harmful it is currently) 

for all populated census tracts in Colorado.

In [None]:
# oil and gas facilities data: https://cogcc.state.co.us/documents/data/downloads/gis/metadata/Oil_and_Gas_Locations_Metadata.html
col_oil_gas = gpd.read_file("./OIL_AND_GAS_LOCATIONS_SHP")

# metadata: https://cogcc.state.co.us/documents/data/downloads/gis/metadata/Tank_Batteries_Metadata.html
# Tank Battery is a device used to store crude oil which is produced from a well.
col_tank_batteries = gpd.read_file("./TANK_BATTERIES_SHP")

# oil pits: https://cogcc.state.co.us/documents/data/downloads/gis/metadata/Pits_Metadata.html
col_pits = gpd.read_file("./PITS_SHP")

In [None]:
#Filter out non-populated blocks

#non populated block codes in Colorado BY GEOID to get unique codes!!
non_pop_blocks_geoid = []

#get all county codes in Colorado: https://simple.wikipedia.org/wiki/List_of_counties_in_Colorado
# 001 - 125 by odd nums

counties = list(map(str, np.arange(1, 127, 2)))
counties = list(map(lambda x: str.zfill(x, 3), counties))

#Get population data
#Using Decennial SF1 (Summary File 1) 2010 because it goes down to block level
# https://api.census.gov/data.html << find Decennial SF1 in 2010
#ACS5 has more recent population data until 2018, but only goes down to block group level
#Colorado is state code 08
#Have to use for loop over all counties because api doesn't let us iterate over all blocks at once 

for county_code in counties: 
    url = "https://api.census.gov/data/2010/dec/sf1?get=NAME,group(P1)&for=block:*&in=state:08%county:" + county_code
    r = requests.get(url)

    r.raise_for_status()
    
    data = r.json()

    block_df = pd.DataFrame(data)
    block_df.columns = block_df.iloc[0]
    block_df = block_df.iloc[1:]

    # P001001	Total	TOTAL POPULATION
    block_df['P001001'] = block_df['P001001'].apply(int)
    
    #what to do with margin of error in population counts? P001001ERR
    non_pop_blocks_geoid.extend(block_df[block_df['P001001'] == 0]['GEO_ID'].values.tolist())

#get rid of first '1000000US' of strings 
non_pop_blocks_geoid = pd.Series(non_pop_blocks_geoid).apply(lambda x: x[9:])

# Colorado census block shapes data: https://catalog.data.gov/dataset/tiger-line-shapefile-2019-2010-state-colorado-2010-census-block-state-based
col_blocks = gpd.read_file("./tl_2019_08_tabblock10/tl_2019_08_tabblock10.shp")

col_blocks_copy = col_blocks.copy()

# Filter out blocks with no population
filter_col_blocks_pop = col_blocks_copy[~col_blocks_copy['GEOID10'].isin(non_pop_blocks_geoid)]

In [None]:
#Calculate distance to closest populated block for each well, tank, pit using QGIS

wells_distances = gpd.read_file("./wells_with_distance_meters")
tanks_distances = gpd.read_file("./tanks_with_distance_meters")
pits_distances = gpd.read_file("./pits_with_distance_meters")

In [None]:
#Filter for only sites within 1km of a populated block

wells_within_onekm = wells_distances[wells_distances['distance'] <= 1000]
tanks_within_onekm = tanks_distances[tanks_distances['distance'] <= 1000]
pits_within_onekm = pits_distances[pits_distances['distance'] <= 1000]

Create 1 km buffers around the census tracts to do a weighted aggregate of the oil sites in each buffered census tract

In [None]:
# Colorado census tract shapefile from CDPHE here: https://data- cdphe.opendata.arcgis.com/datasets/a9f5b1a67bd74b2fa22279d141625335_3/data

col_tracts = gpd.read_file("./Colorado_Census_Tract_Boundaries-shp")

col_tracts_copy = col_tracts.copy().drop(columns=["OBJECTID"])

In [None]:
#calculate buffer distance in degrees from km

lat_radians = 39.7392 * np.pi/180
buff_dist = 1/(111.32*np.cos(lat_radians))

In [None]:
#add buffers to census tracts

col_tracts_buffered = col_tracts_copy.apply(lambda x: x.iloc[-1].buffer(buff_dist), axis=1) #axis = 1 to apply to each row
col_tracts_buffered_df = col_tracts_copy
col_tracts_buffered_df['geometry'] = col_tracts_buffered

In [None]:
# lists all instances of wells within 1km of a populated block with the corresponding buffered census tract(s) they intersect with

#need crs to be same for spatial join
col_tracts_buffered_df = col_tracts_buffered_df.to_crs(wells_within_onekm.crs)

#spacial join
wells_w_buf_tract = gpd.sjoin(col_tracts_buffered_df, wells_within_onekm, op='intersects') #Spatial join Points to polygons

In [None]:
#weight each site based on CalEnviroscreen weights/distances method
#1 : <=250m
#0.5: <=500m
#0.25: <=750m
#0.1: <=1000m

In [None]:
# first add weights for the well sites

# make a column to turn into weights
wells_w_buf_tract['weights'] = wells_w_buf_tract['distance']

#assign weights
wells_w_buf_tract['weights'] = np.where(wells_w_buf_tract['distance']<=250, 1, 
                                     np.where(wells_w_buf_tract['distance']<=500, 0.5,
                                     np.where(wells_w_buf_tract['distance']<=750, 0.25,
                                     np.where(wells_w_buf_tract['distance']<=1000, 0.1,0))))

#sum weights for each census tract
wells_agg_df = wells_w_buf_tract.groupby('FIPS').sum()[['weights']].reset_index().rename(columns={'FIPS':'FIPS_tract_id', 'weights':'wells_agg'})

In [None]:
#repeat for tanks

# lists all instances of tanks within 1km of a populated block with the corresponding buffered census tract(s) they intersect with
tanks_w_buf_tract = gpd.sjoin(col_tracts_buffered_df, tanks_within_onekm, op='intersects') #Spatial join Points to polygons


# make a column to turn into weights
tanks_w_buf_tract['weights'] = tanks_w_buf_tract['distance']

#assign weights
tanks_w_buf_tract['weights'] = np.where(tanks_w_buf_tract['distance']<=250, 1, 
                                     np.where(tanks_w_buf_tract['distance']<=500, 0.5,
                                     np.where(tanks_w_buf_tract['distance']<=750, 0.25,
                                     np.where(tanks_w_buf_tract['distance']<=1000, 0.1,0))))

#sum weights for each census tract
tanks_agg_df = tanks_w_buf_tract.groupby('FIPS').sum()[['weights']].reset_index().rename(columns={'FIPS':'FIPS_tract_id', 'weights':'tanks_agg'})

In [None]:
#repeat for pits

# lists all instances of pits within 1km of a populated block with the corresponding buffered census tract(s) they intersect with
pits_w_buf_tract = gpd.sjoin(col_tracts_buffered_df, pits_within_onekm, op='intersects') #Spatial join Points to polygons


# make a column to turn into weights
pits_w_buf_tract['weights'] = pits_w_buf_tract['distance']

#assign weights
pits_w_buf_tract['weights'] = np.where(pits_w_buf_tract['distance']<=250, 1, 
                                     np.where(pits_w_buf_tract['distance']<=500, 0.5,
                                     np.where(pits_w_buf_tract['distance']<=750, 0.25,
                                     np.where(pits_w_buf_tract['distance']<=1000, 0.1,0))))

#sum weights for each census tract
pits_agg_df = pits_w_buf_tract.groupby('FIPS').sum()[['weights']].reset_index().rename(columns={'FIPS':'FIPS_tract_id', 'weights':'pits_agg'})

In [None]:
#merge all wells, pits, and tanks weighted sums for each census tract

merged = tanks_agg_df.merge(wells_agg_df, on="FIPS_tract_id", how='outer').merge(pits_agg_df, on='FIPS_tract_id', how='outer')


oil_df = merged.set_index("FIPS_tract_id")[['tanks_agg', 'wells_agg', 'pits_agg']].sum(axis=1)
oil_df = oil_df.to_frame().rename(columns={0: 'oil_score'})

#percentile rank the weighted sums 
oil_df['oil_rank'] = oil_df['oil_score'].rank(method='average', na_option='keep', pct=True)*100

## 3-7. PRMP, PTSDF, PNPL, PWDIS, PTRAF <a id='from_ejscreen'></a>

This section utilizes 2019 EJScreen data (ftp://newftp.epa.gov/EJSCREEN/2019/) at the census block-group level and uses an population-weighted average scheme to create a new dataframe at a coarser spatial granularity—in this case, census tract level.

The data we read in contains features for
* Proximity to Risk Management Plan Sites (PRMP)
* Proximity to Treatment Storage and Disposal Facilities (PTSDF)
* Proximity to National Priorities List Sites (PNPL)
* Wastewater Discharge Indicator (PWDIS)
* Traffic Proximity and Volume (PTRAF)

Data documentation can be found at 
https://ejscreen.epa.gov/arcgis/rest/services/ejscreen and https://catalog.data.gov/harvest/object/a486e515-4a96-4737-a10d-4aa8d22c4133/original

In [15]:
# # Read in the EJScreen dataframe and filter for the state of interest. This can be memory-intensive,
# # so ejscreen data is read in, filtered for a single state, then saved as a separate .csv file
# # which can then be re-opened and manipulated
# ejscreen_2019_data = pd.read_csv('EJSCREEN_2019_USPR.csv', dtype = {'ID': np.object_})
# # ejscreen_2019_df = ejscreen_2019_data.copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
# # # Create a new column to uniquely identify tracts (gets rid of 12th digit representing block group)
# ejscreen_2019_data['FIPS_tract_id'] = ejscreen_2019_data['ID'].str[:-1]

# # # Create a new column to uniquely identify states (first 2 digits)
# ejscreen_2019_data['State_ID'] = ejscreen_2019_data['ID'].str[0:2]

# #Filter for state of choice
# state = '08'
# ejscreen_2019_onestate = ejscreen_2019_data[ejscreen_2019_data['State_ID']==state]
# ejscreen_2019_onestate.to_csv('EJscreendata_One_State.csv')

In [22]:
# # Read in filtered csv created in last step so that you aren't manipulating the entire dataset

# ejscreen_2019_df = pd.read_csv('EJscreendata_One_State.csv', dtype = {'FIPS_tract_id': np.object_,
#                                                                       'State_ID': np.object_,
#                                                                      'ID': np.object_})

# # Find total population of each tract by summing corresponding block groups (which now all have the same FIPS tract id)
# df_tracts = ejscreen_2019_df[['FIPS_tract_id','ACSTOTPOP']].groupby('FIPS_tract_id',as_index=False).sum().rename(columns={'ACSTOTPOP':'Tract_Pop'})
# df_tracts

Unnamed: 0,FIPS_tract_id,Tract_Pop
0,08001007801,4412
1,08001007802,4594
2,08001007900,5589
3,08001008000,6412
4,08001008100,1563
5,08001008200,5766
6,08001008308,5527
7,08001008309,3847
8,08001008353,7088
9,08001008401,4747


In [24]:
# Create new dataframe that for each block group row, specifies total tract population (for the tract in which the block group is located)
# We are only selecting the 5 indicators and shape information when merging.
# df_with_totpop = pd.merge(ejscreen_2019_df[['ID','State_ID','FIPS_tract_id','PTRAF','PTSDF','PRMP','PWDIS','PNPL','ACSTOTPOP']], 
#                           df_tracts[['FIPS_tract_id','Tract_Pop']], on='FIPS_tract_id')
# df_with_totpop

Unnamed: 0,ID,State_ID,FIPS_tract_id,PTRAF,PTSDF,PRMP,PWDIS,PNPL,ACSTOTPOP,Tract_Pop
0,080010078011,08,08001007801,1137.344519,1.016899,0.761345,3.578818e-04,0.115279,1933,4412
1,080010078012,08,08001007801,850.196184,0.986899,0.697082,2.570012e-03,0.126361,2479,4412
2,080010078021,08,08001007802,1009.033328,1.106776,0.752442,3.366881e-04,0.105660,1476,4594
3,080010078022,08,08001007802,2262.173008,1.461509,0.705932,1.027715e-03,0.098338,1279,4594
4,080010078023,08,08001007802,1082.991331,1.220760,0.721071,9.473644e-04,0.099952,1839,4594
5,080010079001,08,08001007900,90.122870,1.439974,1.966009,4.091886e-04,0.119663,1816,5589
6,080010079002,08,08001007900,56.733881,1.690448,1.952102,8.433198e-04,0.126050,613,5589
7,080010079003,08,08001007900,115.407257,1.151880,1.433039,4.027204e-04,0.117370,863,5589
8,080010079004,08,08001007900,182.310009,1.189574,1.376150,1.613578e-03,0.127368,2297,5589
9,080010080001,08,08001008000,648.254626,1.861332,1.823165,2.620262e-03,0.109449,597,6412


In [25]:
# # Create a new column that gives proportion of population that each block group contributes
# df_with_totpop['Tract_Proportion'] = df_with_totpop['ACSTOTPOP'] / df_with_totpop['Tract_Pop']
# df_with_totpop.head()

Unnamed: 0,ID,State_ID,FIPS_tract_id,PTRAF,PTSDF,PRMP,PWDIS,PNPL,ACSTOTPOP,Tract_Pop,Tract_Proportion
0,80010078011,8,8001007801,1137.344519,1.016899,0.761345,0.000358,0.115279,1933,4412,0.438123
1,80010078012,8,8001007801,850.196184,0.986899,0.697082,0.00257,0.126361,2479,4412,0.561877
2,80010078021,8,8001007802,1009.033328,1.106776,0.752442,0.000337,0.10566,1476,4594,0.321289
3,80010078022,8,8001007802,2262.173008,1.461509,0.705932,0.001028,0.098338,1279,4594,0.278407
4,80010078023,8,8001007802,1082.991331,1.22076,0.721071,0.000947,0.099952,1839,4594,0.400305


In [27]:
# # Create new column with how much each block group contributes to the indicator
# df_with_totpop['PTRAF_prop'] = df_with_totpop['PTRAF'] * df_with_totpop['Tract_Proportion']
# df_with_totpop['PTSDF_prop'] = df_with_totpop['PTSDF'] * df_with_totpop['Tract_Proportion']
# df_with_totpop['PRMP_prop'] = df_with_totpop['PRMP'] * df_with_totpop['Tract_Proportion']
# df_with_totpop['PWDIS_prop'] = df_with_totpop['PWDIS'] * df_with_totpop['Tract_Proportion']
# df_with_totpop['PNPL_prop'] = df_with_totpop['PNPL'] * df_with_totpop['Tract_Proportion']
# df_with_totpop.drop(columns={'PTRAF','PTSDF','PRMP','PWDIS','PNPL'}, inplace=True)

# df_with_totpop.head()

Unnamed: 0,ID,State_ID,FIPS_tract_id,ACSTOTPOP,Tract_Pop,Tract_Proportion,PTRAF_prop,PTSDF_prop,PRMP_prop,PWDIS_prop,PNPL_prop
0,80010078011,8,8001007801,1933,4412,0.438123,498.297134,0.445527,0.333563,0.000157,0.050506
1,80010078012,8,8001007801,2479,4412,0.561877,477.705426,0.554516,0.391674,0.001444,0.070999
2,80010078021,8,8001007802,1476,4594,0.321289,324.190943,0.355594,0.241751,0.000108,0.033947
3,80010078022,8,8001007802,1279,4594,0.278407,629.803935,0.406894,0.196536,0.000286,0.027378
4,80010078023,8,8001007802,1839,4594,0.400305,433.526569,0.488676,0.288648,0.000379,0.040011


In [28]:
# Create table with tract-average values for every tract
# Sums the proportions of all the block groups of a tract to create a total weighted average for each tract
# indicators_tracts = df_with_totpop[['State_ID',
#                                     'FIPS_tract_id',
#                                     'PTRAF_prop', 
#                                     'PTSDF_prop', 
#                                     'PRMP_prop', 
#                                     'PWDIS_prop', 
#                                     'PNPL_prop']].groupby('FIPS_tract_id').sum().rename(columns={'PTRAF_prop':'PTRAF_score',
#                                                                                                  'PTSDF_prop':'PTSDF_score', 
#                                                                                                  'PRMP_prop':'PRMP_score', 
#                                                                                                  'PWDIS_prop':'PWDIS_score', 
#                                                                                                  'PNPL_prop':'PNPL_score'})
# indicators_tracts

Unnamed: 0_level_0,PTRAF_score,PTSDF_score,PRMP_score,PWDIS_score,PNPL_score
FIPS_tract_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
08001007801,976.002560,1.000043,0.725237,1.600826e-03,0.121506
08001007802,1387.521446,1.251164,0.726935,7.735313e-04,0.101336
08001007900,128.252559,1.320050,1.639764,9.507921e-04,0.123176
08001008000,314.624480,1.651543,1.552685,1.365529e-03,0.107908
08001008100,1853.973994,2.510113,1.208683,1.665516e-03,0.105026
08001008200,2709.118899,1.961991,2.022874,2.440053e-03,0.118947
08001008308,2184.860481,1.527427,1.244636,5.456474e-04,0.102495
08001008309,608.517441,1.804490,3.011223,4.251968e-04,0.092019
08001008353,105.585510,1.163908,1.832401,8.736228e-03,0.099901
08001008401,6.892790,0.056466,1.009537,3.415246e-07,0.038035


In [33]:
## Calculate percentiles for each indicator for each state
# indicators_tracts['PTRAF_rank'] = indicators_tracts['PTRAF_score']\
#                                             .rank(method='average', na_option='keep', pct=True)
# indicators_tracts['PTSDF_rank'] = indicators_tracts['PTSDF_score']\
#                                             .rank(method='average', na_option='keep', pct=True)
# indicators_tracts['PRMP_rank'] = indicators_tracts['PRMP_score']\
#                                             .rank(method='average', na_option='keep', pct=True)
# indicators_tracts['PWDIS_rank'] = indicators_tracts['PWDIS_score']\
#                                             .rank(method='average', na_option='keep', pct=True)
# indicators_tracts['PNPL_rank'] = indicators_tracts['PNPL_score']\
#                                             .rank(method='average', na_option='keep', pct=True)

# indicators_tracts

Unnamed: 0_level_0,PTRAF_score,PTSDF_score,PRMP_score,PWDIS_score,PNPL_score,PTRAF_rank,PTSDF_rank,PRMP_rank,PWDIS_rank,PNPL_rank
FIPS_tract_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
08001007801,976.002560,1.000043,0.725237,1.600826e-03,0.121506,0.844676,0.779023,0.734187,0.413131,0.732586
08001007802,1387.521446,1.251164,0.726935,7.735313e-04,0.101336,0.902322,0.816653,0.735789,0.369095,0.659728
08001007900,128.252559,1.320050,1.639764,9.507921e-04,0.123176,0.277822,0.827862,0.907126,0.385909,0.737390
08001008000,314.624480,1.651543,1.552685,1.365529e-03,0.107908,0.481185,0.872698,0.895917,0.405124,0.687750
08001008100,1853.973994,2.510113,1.208683,1.665516e-03,0.105026,0.938351,0.933547,0.843074,0.416333,0.678943
08001008200,2709.118899,1.961991,2.022874,2.440053e-03,0.118947,0.974380,0.895917,0.943155,0.446757,0.722978
08001008308,2184.860481,1.527427,1.244636,5.456474e-04,0.102495,0.954363,0.854283,0.849480,0.341073,0.666934
08001008309,608.517441,1.804490,3.011223,4.251968e-04,0.092019,0.693355,0.883106,0.978383,0.329864,0.614892
08001008353,105.585510,1.163908,1.832401,8.736228e-03,0.099901,0.245797,0.804644,0.927942,0.551641,0.653323
08001008401,6.892790,0.056466,1.009537,3.415246e-07,0.038035,0.060048,0.200160,0.810248,0.118495,0.401922


In [37]:
# indicators_tracts = indicators_tracts.rename(columns = {
#     "Avg_PTRAF": "PTRAF_score",
#     "Avg_PTSDF": "PTSDF_score",
#     "Avg_PRMP": "PRMP_score",
#     "Avg_PWDIS": "PWDIS_score",
#     "Avg_PNPL": "PNPL_score"
# })

In [42]:
# indicators_tracts.to_csv('state_EJScreen_indicators.csv')

In [40]:
ejscreen_indicators = pd.read_csv("state_EJScreen_indicators.csv", dtype={'Tract_ID': object})

In [41]:
ejscreen_indicators['PTRAF_rank'] = ejscreen_indicators['PTRAF_rank']*100
ejscreen_indicators['PTSDF_rank'] = ejscreen_indicators['PTSDF_rank']*100
ejscreen_indicators['PRMP_rank'] = ejscreen_indicators['PRMP_rank']*100
ejscreen_indicators['PWDIS_rank'] = ejscreen_indicators['PWDIS_rank']*100
ejscreen_indicators['PNPL_rank'] = ejscreen_indicators['PNPL_rank']*100

## 8. Ozone <a id='ozone'></a>

Ground-level ozone (O3) can trigger a variety of health problems, and its precursors are emitted by motor vehicles, industrial facilities, and power plants as well as natural sources. It is also a primary constituent of smog. (https://www.epa.gov/ground-level-ozone-pollution)

This indicator utilizes 2019 EJScreen data (ftp://newftp.epa.gov/EJSCREEN/2019/), where the "OZONE" variable represents the May–September (summer/ ozone season) average of daily-maximum 8-hour-average ozone concentrations, in parts per billion (ppb). This was estimated by EPA from a combination of monitoring data and CMAQ air quality modeling from 2016 (documentation can be found on page 42 https://www.epa.gov/sites/production/files/2017-09/documents/2017_ejscreen_technical_document.pdf).

Note: 
* The census block results presented by EJSCREEN site are actually census tract values distributed homogeneously across all census blocks within a census tract.
* PM2.5 and ozone estimates were not available for Alaska or Hawaii for use in the 2019 version of EJSCREEN, due to a lack of CMAQ modeling. 

In [15]:
ejscreen_2019_data = pd.read_csv("data/EJSCREEN_2019_StatePctiles.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
ejscreen_2019_df = ejscreen_2019_data.copy()
ejscreen_2019_df = ejscreen_2019_df[['ID', 'STATE_NAME', 'OZONE', 'PM25']]

In [17]:
# ID column : Census block group fips code (12 digits, 12th digit as block group unique id)
# The ID column is missing that initial 0 in front of single digit state codes
# Turn block level FIPS to tract level FIPS
ejscreen_2019_df['ID'] = ejscreen_2019_df['ID'].apply(str) 
ejscreen_2019_df['FIPS_block_group_id'] = list(map(lambda x: str.zfill(x, 12), ejscreen_2019_df['ID'])) # Fill that initial 0 to get consistent 12 digits
ejscreen_2019_df['FIPS_tract_id'] = ejscreen_2019_df['FIPS_block_group_id'].str.slice(0, 11) # Now 11 digit FIPS to tract level

# Group by tract level FIPS to get distinct values 
ejscreen_2019_df = ejscreen_2019_df.groupby('FIPS_tract_id').first()

In [18]:
# Filter out nulls in the "OZONE" column (represented as str "None")
ozone_df = ejscreen_2019_df[ejscreen_2019_df['OZONE'] != 'None']

# Rename 'OZONE' column to 'ozone_score'
ozone_df = ozone_df.rename(columns = {'OZONE' : 'ozone_score'})

# Convert "ozone_score" to floats
ozone_df['ozone_score'] = ozone_df['ozone_score'].astype(float)

ozone_df = ozone_df.drop(columns = {'ID', 'FIPS_block_group_id', 'PM25'})

# Calculate percentile scores for each state
ozone_df['ozone_rank'] = ozone_df[['ozone_score','STATE_NAME']].groupby('STATE_NAME')\
                                            .rank(method='average', na_option='keep', pct=True)*100

ozone_df = ozone_df.drop(columns = {'STATE_NAME'})

## 9. Particulate Matter: PM 2.5 (cont. from Ozone code) <a id='pm25'></a>

PM2.5 is particulate matter that is 2.5 microns or less in diameter. Common sources of PM2.5 emissions
include power plants and industrial facilities. (https://www.epa.gov/pm-pollution)

This indicator utilizes 2019 EJScreen data (ftp://newftp.epa.gov/EJSCREEN/2019/), where the "PM25" variable represents the annual average PM2.5 concentration in micrograms per cubic meter (µg/m3). This was estimated by EPA from a combination of monitoring data and CMAQ air quality modeling from 2016 (documentation can be found on page 38 https://www.epa.gov/sites/production/files/2017-09/documents/2017_ejscreen_technical_document.pdf).

Note:

* The census block results presented by EJSCREEN site are actually census tract values distributed homogeneously across all census blocks within a census tract.
* PM2.5 and ozone estimates were not available for Alaska or Hawaii for use in the 2019 version of EJSCREEN, due to a lack of CMAQ modeling.

In [19]:
# Filter out nulls in the "PM25" column (represented as str "None")
pm25_df = ejscreen_2019_df[ejscreen_2019_df['PM25'] != 'None']

# Rename 'PM25' column to 'PM25_score'
pm25_df = pm25_df.rename(columns = {'PM25' : 'PM25_score'})

# Convert "PM25_score" to floats
pm25_df['PM25_score'] = pm25_df['PM25_score'].astype(float)

pm25_df = pm25_df.drop(columns = {'ID', 'FIPS_block_group_id', 'OZONE'})

In [20]:
# Calculate percentile scores for each state
pm25_df['PM25_rank'] = pm25_df[['PM25_score','STATE_NAME']].groupby('STATE_NAME')\
                                            .rank(method='average', na_option='keep', pct=True)*100

pm25_df = pm25_df[['PM25_score', 'PM25_rank']]

## 10. Diesel <a id='diesel'></a>

Diesel PM
From 2014 NATA, exposure numbers << can't find this data source on the NATA website...unless it's the mdb file I can't openD:

In [21]:
diesel = pd.read_csv("data/dieselpmexposure.csv", dtype={'Tract': object})

In [22]:
diesel.drop(diesel[diesel['County']=='Entire State'].index, inplace = True)
diesel.drop(diesel[diesel['County']=='Entire US'].index, inplace = True)
diesel["diesel_rank"] = diesel.groupby("State")["Total Exposure Conc"].rank(pct = True, na_option = 'keep')*100

diesel = diesel.rename(columns = {
    "Total Exposure Conc": "diesel_score"
})
diesel_for_combine = diesel[['Tract','diesel_score', 'diesel_rank']]

## 11. Air Toxics  <a id='air_toxics'></a>

This data is from 2014 NATA (released 2018), at https://www.epa.gov/national-air-toxics-assessment/2014-nata-assessment-results under "National noncancer hazard index summaries" with the download link listed as "2014 NATA all hazard indexes (XLS)". 

In [23]:
# For convenience, the data is saved as "natapollutanthazards.csv"
toxics = pd.read_csv("data/natapollutanthazards.csv", dtype={'Tract': object})

In [24]:
col_list = ['Respiratory HI',
 'Neurological HI',
 'Liver HI',
 'Developmental HI',
 'Reproductive HI',
 'Kidney HI',
 'Ocular HI',
 'Endocrine HI',
 'Hematological HI',
 'Immunological HI',
 'Skeletal HI',
 'Spleen HI',
 'Thyroid HI',
 'Whole Body HI']

# Adding a State identifier to the toxics table for grouping
toxics['State'] = toxics.Tract.astype(str).str[:2].astype(int)
toxics['AirToxicsHI'] = toxics[col_list].mean(axis=1)
toxics["toxics_rank"] = toxics.groupby("State")["AirToxicsHI"].rank(pct = True, method = 'average', na_option = 'keep')*100
toxics = toxics.rename(columns = {
    "AirToxicsHI": "toxics_score"
})
toxics_for_combine = toxics[['Tract', 'toxics_score', 'toxics_rank']]

## 12. Cancer Risk  <a id='cancer_risk'></a>

This data is from 2014 NATA (released 2018), at https://www.epa.gov/national-air-toxics-assessment/2014-nata-assessment-results with the download link listed as "2014 NATA natl cancer risk by pollutant (XLS)".

In [25]:
# For convenience, the columns State, EPA Region, County, FIPS, Tract, Population, and Total Cancer Risk (per million) 
# from the data are saved here as "natacancerrisk.csv" 
cancer = pd.read_csv("data/natacancerrisk.csv",dtype={'Tract': object})

In [26]:
cancer['Total Cancer Risk (per million)'] = pd.to_numeric(cancer['Total Cancer Risk (per million)'], errors = 'coerce')
cancer["cancer_rank"] = cancer.groupby("State")['Total Cancer Risk (per million)'].rank(pct = True,method = 'average',
                                                                                        na_option = 'keep')*100
cancer = cancer.rename(columns = {
    "Total Cancer Risk (per million)": "cancer_score"
})
cancer_for_combine = cancer[['Tract','cancer_score', 'cancer_rank']]

## 13. Housing Burden  <a id='housingburden'></a>

We use the 2012-2016 Comprehensive Housing Affordability Strategy (CHAS) dataset from the Department of Housing and Urban Development (HUD). This dataset contains cost burdens for households by percent HUD-adjusted median family income (HAMFI) category and is found at https://www.huduser.gov/portal/datasets/cp.html. The data is summarized for eight levels of Census (FIPS) geography, so we use "Table8.csv", the dataset for the summary level 080, or census tract level. 

For each census tract, the data was analyzed to estimate the "housing burden," or, the number of people making less than 80% of the HUD Area Median Family Income (HAMFI) and spending more than 50% of their income on rent or housing costs (cost burden). We then estimate the percentage of households that meet the above criteria within each census tract. Using T8_est1 (the total number of homes in a census tract) as the denominator, we follow the CalEnviroScreen methodology as described in the CES handbook:

    "The SE was calculated for each census tract using the formula for approximating the SE of proportions provided by the ACS (American Community Survey Office, 2013, pg. 13, equation 4 https://www2.census.gov/programs-surveys/acs/methodology/design_and_methodology/acs_design_methodology_report_2014.pdf). 

    * The RSE is calculated by dividing a tract’s SE by its estimate of the percentage of housing burdened low income households, and taking the absolute value of the result.

    * Census tract estimates that met either of the following criteria were considered reliable and included in the analysis: RSE less than 50 (meaning the SE was less than half of the estimate) or SE was less than the mean SE of all California census tract estimates for housing burdened low income households. 

    * Census tracts with unreliable estimates receive no score for the indicator (null). The indicator is not factored into that tract’s overall CalEnviroScreen score.

    * Census tracts that met the inclusion criteria were ordered by percent housing burdened low income households. The census tracts were assigned percentiles based on the distribution across all tracts."

The relevant variables in table 8 of the CHAS dataset are the following:
* T1_est1:	Total Occupied housing units
* T8_est10:	Owner occupied	less than or equal to 30% of HAMFI	cost burden greater than 50%
* T8_est23:	Owner occupied	greater than 30% but less than or equal to 50% of HAMFI	cost burden greater than 50%
* T8_est36:	Owner occupied	greater than 50% but less than or equal to 80% of HAMFI	cost burden greater than 50%
* T8_est76:	Renter occupied	less than or equal to 30% of HAMFI	cost burden greater than 50% 
* T8_est89:	Renter occupied	greater than 30% but less than or equal to 50% of HAMFI	cost burden greater than 50%
* T8_est102:  Renter occupied	greater than 50% but less than or equal to 80% of HAMFI	cost burden greater than 50%

In [27]:
housing = pd.read_csv("data/Table8.csv", encoding = "ISO-8859-1",  dtype={'Tract_ID': object, 'st': object, 'geoid': object})

In [28]:
housing = housing[['geoid', 'name', 'st', 
             'T8_est10', 'T8_moe10',
             'T8_est23', 'T8_moe23', 
             'T8_est36','T8_moe36', 
             'T8_est76', 'T8_moe76',
             'T8_est89', 'T8_moe89',
             'T8_est102', 'T8_moe102', 
             'T8_est1', 'T8_moe1']]

housing['summed'] = (housing['T8_est10'] + 
                  housing['T8_est23'] + 
                  housing['T8_est36'] + 
                  housing['T8_est76'] + 
                  housing['T8_est89'] + 
                  housing['T8_est102'])

housing['summed_MOE'] = np.sqrt((housing['T8_moe10']/1.645)**2 + 
                             (housing['T8_moe23']/1.645)**2 + 
                             (housing['T8_moe36']/1.645)**2 + 
                             (housing['T8_moe76']/1.645)**2 + 
                             (housing['T8_moe89']/1.645)**2 + 
                             (housing['T8_moe102']/1.645)**2)

# Truncate 14000US01001020100 to FIPS Census Tract ID 01001020100
housing['geoid'] = housing['geoid'].str[-11:]

# Filtering out census tracts with 0 population
housing = housing[housing.T8_est1 > 0]

# Finding estimate of proportion heavily rent burdened
housing['prop'] = housing['summed']/housing['T8_est1']

In [29]:
def standard_error_prop_sq(x, y, error_x, error_y): 
    test = error_x**2 - ((x**2)/(y**2)*((error_y)**2))
    se = np.where(test < 0,
                   (1/y) * np.sqrt(error_x**2 + ((x**2)/(y**2)*((error_y)**2))), 
                   (1/y) * np.sqrt(error_x**2 - ((x**2)/(y**2)*((error_y)**2))))
    return se

# data = data[data.geoid == '14000US06079010901'] This is one of the few tracts with a negative value

housing['se'] = standard_error_prop_sq(housing['summed'], housing['T8_est1'], housing['summed_MOE'], housing['T8_moe1'])

# Calculate Relative Standard Error
housing['rse'] = housing['se']/housing['prop']*100

# Calculate SE for each state
housing['stateSE'] = np.zeros_like(len(housing['st']))
for i in housing['st'].unique():
    subset = housing[housing.st == i]
    meanSE = np.mean(subset['se'])
    housing['stateSE'].loc[housing['st']== i] = meanSE
    
housing.loc[(housing['rse'] >= 50) & (housing['rse'] >= housing['stateSE']),'prop'] = np.nan

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [30]:
housing['HouseBurden_rank'] = housing[['prop','st']].groupby('st')\
                                            .rank(method = 'average', na_option='keep', pct=True)*100

housing = housing.rename(columns = {'prop':'HouseBurden_score'})

houseburden = housing[['geoid','st','name','HouseBurden_score','HouseBurden_rank']]

## 14-18. Educational Attainment, Linguistic Isolation, Unemployment, Race, Poverty 

Using 2018 ACS 5-year data for finding Poverty, Education, Race, Linguistic Isolation, Unemployment indicators.

URL for geography codes: https://api.census.gov/data/2018/acs/acs5/profile/geography.html

In [31]:
# date = "2018"
# dataset = '/acs/acs5/profile'
# base_url = "https://api.census.gov/data"
# variables = "NAME,DP02_0086E,DP02_0086M,DP02_0066PE,DP02_0066PM,DP02_0113PE,DP02_0113PM,DP03_0005E,DP03_0005M,DP05_0077PE,DP05_0077PM"

# state = '08'
# query = base_url+"/"+date+dataset+"?get="+variables+'&for='+'tract:*&in=state:'+state+'+county:*'


# r = pd.read_json(query, dtype = True)
# r.columns = r.iloc[0]
# r = r.drop(r.index[0])
# r = r.rename(columns={"DP02_0086E": "total_pop",
#                       "DP02_0086M": "total_popMOE",
#                       "DP02_0066PE": "hs_degree", 
#                       "DP02_0066PM": "hs_degreeMOE",
#                       "DP02_0113PE": "linguistic_isolation",
#                      "DP02_0113PM": "linguistic_isolationMOE",
#                      "DP03_0005E": "unemployment",
#                      "DP03_0005M": "unemploymentMOE",
#                       "DP05_0077PE": "whitePerc",
#                       "DP05_0077PM": "nonwhiteMOE"
#                      })
# #Constructing FIPS Code
# r['FIPS'] = r['state']+r['county']+r['tract']

In [32]:
# #Cleaning data, replacing negatives with nan
# colnames = ["total_pop",
#             "total_popMOE",
#             "hs_degree", 
#             "hs_degreeMOE", 
#             "linguistic_isolation",
#             "linguistic_isolationMOE",
#             "unemployment",
#             "unemploymentMOE",
#            'whitePerc',
#            'nonwhiteMOE']
# for i in colnames:
#     r[i] = pd.to_numeric(r[i], errors = 'coerce')
# r = r.replace(-222222222.0, np.nan)
# r = r.replace(-666666666.0, np.nan)
# r['total_pop'] = r['total_pop'].replace(0,np.nan)

### Educational Attainment  <a id='edu_attain'></a>

This indicator describes the percent of the population over age 25 with less than a high school
education.

Following CES methods for Educational Attainment on page 122 (https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf):

    "* This percentage was subtracted from 100 to obtain the proportion of the population with less than a high school education.

    * Unlike the US Census, ACS estimates come from a sample of the population and may be unreliable if they are based on a small sample or population size. The standard error (SE) and relative standard error (RSE) were used to evaluate the reliability of each estimate.

    * The SE was calculated for each census tract by dividing the margin of error (MOE) reported in the ACS by 1.645, a statistical value associated with a 90 percent confidence interval. The MOE is the difference between an estimate and the upper or lower bounds of its confidence interval. All ACS-published MOEs are based on a 90 percent confidence interval.

    * The RSE is calculated by dividing a tract’s SE by its estimate of educational attainment, and taking the absolute value of the result.

    * Census tract estimates that met either of the following criteria were considered reliable and included in the analysis: RSE less than 50 (meaning the SE was less than half of the estimate) or SE was less than the mean SE of all California census tract estimates for education.

    * Census tracts with unreliable estimates received no score for the indicator (null). The indicator was not factored into that tract’s overall CalEnviroScreen score.

    * Census tracts that met the inclusion criteria were ordered by the percentage of the population over age 25 with less than a high school education and percentiles were assigned to each based on the distribution across all census tracts."

In [33]:
# r['no_hs'] = 100-r['hs_degree']
# r['hs_degreeSE'] = r['hs_degreeMOE']/1.645

# #The RSE is calculated by dividing a tract’s SE by its 
# #estimate of educational attainment, and taking the 
# #absolute value of the result.
# r['hs_degreeRSE'] = (r['hs_degreeSE']/r['no_hs'])*100

# meanSE = r['hs_degreeSE'].mean()

# #Removing values that are statistically suspect, before assigning ranks
# r.loc[(r['hs_degreeRSE'] >= 50) & (r['hs_degreeSE'] >= meanSE), 'no_hs']=np.nan

# r['edu_rank'] = r['no_hs'].rank(pct = True, na_option = 'keep')*100
# r = r.rename(columns = {'no_hs_pct':'edu_rank'})

### Linguistic Isolation  <a id='lin'></a>

This variable is referred to as “linguistic isolation” and measures households where no one speaks English well.

Following CES methods on page 133 at https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf: 

    "* The SE was calculated for each census tract by dividing the margin of error (MOE) reported in the ACS by 1.645, a statistical value associated with a 90 percent confidence interval. The MOE is the difference between an estimate and the upper or lower bounds of its confidence interval. All ACS-published MOEs are based on a 90 percent confidence interval.

    * The RSE is calculated by dividing a tract’s SE by its estimate of the percent of linguistically isolated households, and taking the absolute value of the result.

    * Census tract estimates that met either of the following criteria were considered reliable and included in the analysis: RSE less than 50 (meaning the SE was less than half of the estimate) or SE was less than the mean SE of all California census tract estimates for linguistic isolation.

    * Census tracts with unreliable estimates received no score for the indicator (null). The indicator was not factored into that tract’s overall CalEnviroScreen score.

    * Census tracts that met the inclusion criteria were ordered by the percent linguistically isolated and percentiles were assigned to each based on the distribution across all tracts."

In [34]:
# r['linguistic_isolationSE'] = r['linguistic_isolationMOE']/1.645

# #The RSE is calculated by dividing a tract’s SE by its 
# #estimate of educational attainment, and taking the 
# #absolute value of the result.
# r['linguistic_isolationRSE'] = (r['linguistic_isolationSE']/r['linguistic_isolation'])*100

# meanSE = r['linguistic_isolationSE'].mean()

# #Removing values that are statistically suspect, before assigning ranks
# r.loc[(r['linguistic_isolationRSE'] >= 50) & (r['linguistic_isolationSE'] >= meanSE),'linguistic_isolation']=np.nan

# r['lin_rank'] = r['linguistic_isolation'].rank(pct = True, na_option = 'keep')*100
# r = r.rename(columns = {'linguistic_isolation':'lin_score'})

### Unemployment  <a id='unemploy'></a>

This indicator describes the percent of the population over the age of 16 that is unemployed and eligible for the labor force. Excludes retirees, students, homemakers, institutionalized persons except prisoners, those not looking for work, and military personnel on active duty.

Following CES methods on page 143 at https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf: 

    "* The Census Bureau calculates an unemployment rate by dividing the 'Population Unemployed in the Civilian Labor Force' by 'Population in the Civilian Labor Force' and then converting to a percentage.

    * Unlike the US Census, ACS estimates come from a sample of the population and may be unreliable if they are based on a small sample or population size. The standard error (SE) and relative standard error (RSE) were used to evaluate the reliability of each estimate.

    * The SE was calculated for each census tract using the formula for approximating the SE of proportions provided by the ACS (American Community Survey Office, 2013, pg. 13, equation 4 https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf). When this approximation could not be used, the formula for approximating the SE of ratios (equation 3) was used instead (as in calculating the poverty and unemployment indicators).

    * The RSE is calculated by dividing a tract’s SE by its estimate of unemployment rate, and taking the absolute value of the result.

    * Census tract estimates that met either of the following criteria were considered reliable and included in the analysis: RSE less than 50 (meaning the SE was less than half of the estimate) or SE was less than the mean SE of all California census tract estimates for unemployment rate.

    * Census tracts with unreliable estimates received no score for the indicator (null). The indicator was not factored into that tract’s overall CalEnviroScreen score.

    * Census tracts that met the inclusion criteria were ordered by unemployment rate. A percentile score for a census tract was determined by its place in the distribution of all census tracts."

In [35]:
# r['unemployment_rate'] = (r['unemployment']/r['total_pop'])*100

# #calculating value under square root sign to check if negative
# r['proportionvalues'] = (r['unemploymentMOE']/1.645)**2 - (
#                     r['unemployment']**2/r['total_pop']**2)*r['total_popMOE']**2

# #turning negative sqrt values into na's
# r.loc[(r['proportionvalues'] < 0), 'proportionvalues']=np.nan

# #calculating SE for tracts with positive sqrt values
# r['unemploymentSE'] = ((1/r['total_pop'])*np.sqrt(r['proportionvalues']))*100

# #calculating SE using ratio method for tracts with negative values under radical in previous equation
# r.loc[(r['unemploymentSE'] == np.nan), 'unemploymentSE'] = ((1/r['total_pop']) * np.sqrt(
#             (r['unemploymentMOE']/1.645)**2 + (
#                 r['unemployment']**2/r['total_pop']**2)*r['total_popMOE']**2))*100

# #calculating RSE
# r['unemploymentRSE'] = (r['unemploymentSE']/r['unemployment_rate'])*100

# meanSE = r['unemploymentSE'].mean()

# #Removing tracts with non-dependable values
# r.loc[(r['unemploymentRSE'] >= 50) &
#       (r['unemploymentSE'] >= meanSE),
#       'unemployment_rate']=np.nan

# r['unemploy_rank'] = r['unemployment_rate'].rank(pct = True, na_option = 'keep')*100
# r = r.rename(columns = {'unemployment_rate':'unemploy_score'})

### Race  <a id='race'></a>

This indicator is a sum of all race/ethnicity categories except White/Non-Hispanic. It includes Black, American Indian/Alaskan Native, Asian, Native Hawaiian-Other Pacific Islander and two or more races. 

    * Unlike the US Census, ACS estimates come from a sample of the population and may be unreliable if they are based on a small sample or population size. The standard error (SE) and relative standard error (RSE) were used to evaluate the reliability of each estimate.

    * The SE was calculated for each census tract by dividing the margin of error (MOE) reported in the ACS by 1.645, a statistical value associated with a 90 percent confidence interval. The MOE is the difference between an estimate and the upper or lower bounds of its confidence interval. All ACS-published MOEs are based on a 90 percent confidence interval.

    * The RSE is calculated by dividing a tract’s SE by its estimate of educational attainment, and taking the absolute value of the result.

    * Census tract estimates that met either of the following criteria were considered reliable and included in the analysis: RSE less than 50 (meaning the SE was less than half of the estimate) or SE was less than the mean SE of all California census tract estimates for education.

    * Census tracts with unreliable estimates received no score for the indicator (null). The indicator was not factored into that tract’s overall CalEnviroScreen score.

    * Census tracts that met the inclusion criteria were ordered by the percentage of non-white population and percentiles were assigned to each based on the distribution across all census tracts.

In [36]:
# #correcting race from white to non-white
# r['nonwhitePerc'] = 100 - r['whitePerc'].astype(float)

# r['nonwhiteSE'] = r['nonwhiteMOE']/1.645
# #The RSE is calculated by dividing a tract’s SE by its 
# #estimate of educational attainment, and taking the 
# #absolute value of the result.
# r['nonwhiteRSE'] = (r['nonwhiteSE']/r['nonwhitePerc'])*100

# meanSE = r['nonwhiteSE'].mean()

# #Removing values that are statistically suspect, before assigning ranks
# #Not sure that this works
# r.loc[(r['nonwhiteRSE'] >= 50) & (r['nonwhiteSE'] >= meanSE), 'nonwhitePerc']=np.nan

# r['nonwhite_rank'] = r['nonwhitePerc'].rank(pct = True, method = 'average', na_option = 'keep')*100

### Poverty  <a id='poverty'></a>

This indicator describes the percent of the population living below two times the federal poverty
level. 

Following CES methods on page 138 at https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf: 

     "* From the 2011-2015 American Community Survey, a dataset containing the number of individuals below 200 percent of the federal poverty level was downloaded by census tracts for the state of California.

     * The number of individuals below the poverty level was divided by the total population for whom poverty status was determined to obtain a percent.

     * Unlike the US Census, ACS estimates come from a sample of the population and may be unreliable if they are based on a small sample or population size. The standard error (SE) and relative standard error (RSE) were used to evaluate the reliability of each estimate.

     * The SE was calculated for each census tract using the formula for approximating the SE of proportions provided by the ACS (American Community Survey Office, 2013, pg. 13, equation 4 https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/MultiyearACSAccuracyofData2013.pdf). When this approximation could not be used, the formula for approximating the SE of ratios (equation 3) was used instead.

     * The RSE is calculated by dividing a tract’s SE by its estimate of the percentage of the population living below twice the federal poverty level, and taking the absolute value of the result.

     * Census tract estimates that met either of the following criteria were considered reliable and included in the analysis: RSE less than 50 (meaning the SE was less than half of the estimate) or SE was less than the mean SE of all California census tract estimates for poverty.

     * Census tracts with unreliable estimates received no score for the indicator (null). The indicator was not factored into that tract’s overall CalEnviroScreen score.

     * Census tracts that met the inclusion criteria were ordered by the percentage of the population below twice the federal poverty level. A percentile score for a census tract was determined by its place in the distribution of all census tracts."

In [37]:
# date = "2018"
# dataset = '/acs/acs5/subject' # This data was taken from the SUBJECT tables, not PROFILE tables 
# base_url = "https://api.census.gov/data"
# variables = "NAME,S0101_C01_001E,S0101_C01_001M,S1701_C01_042E,S1701_C01_042M"
# geography = 'tract:*&in=state:36+county:*'
# query = base_url+"/"+date+dataset+"?get="+variables+'&for='+'tract:*&in=state:'+state+'+county:*'

# df = pd.read_json(query, dtype = True)
# df.columns = df.iloc[0]
# df = df.drop(df.index[0])

# df = df.rename(columns={"S0101_C01_001E": 'total_pop',
#                       'S0101_C01_001M': 'total_popMOE',
#                       'S1701_C01_042E': 'below200fpl', #Percent of all people living below 200% of federal poverty line
#                       'S1701_C01_042M': 'below200fplMOE'
#                      })

# #Cleaning data, replacing negatives with nan
# colnames = ["total_pop",
#             "total_popMOE",
#             "below200fpl", 
#             "below200fplMOE"]
# for i in colnames:
#     df[i] = pd.to_numeric(df[i], errors = 'coerce')
# df = df.replace(-222222222.0, np.nan)
# df = df.replace(-666666666.0, np.nan)
# df['total_pop'] = df['total_pop'].replace(0,np.nan)

# # creating FIPS codes
# df['FIPS'] = df['state']+df['county']+df['tract']

In [38]:
# df['poverty_rate'] = (df['below200fpl']/df['total_pop'])*100

# # calculating value under square root sign to check if negative
# df['proportionvalues'] = (df['below200fplMOE']/1.645)**2 - (
#                     df['below200fpl']**2/df['total_pop']**2)*df['total_popMOE']**2

# # turning negative sqrt values into na's
# df.loc[(df['proportionvalues'] < 0), 'proportionvalues']=np.nan

# # calculating SE for tracts with positive sqrt values
# df['povertySE'] = ((1/df['total_pop'])*np.sqrt(df['proportionvalues']))*100

# # calculating SE using ratio method for tracts with negative values under radical in previous equation
# df.loc[(df['povertySE'] == np.nan), 'povertySE'] = ((1/df['total_pop']) * np.sqrt(
#             (df['below200fplMOE']/1.645)**2 + (
#                 df['below200fpl']**2/df['total_pop']**2)*df['total_popMOE']**2))*100

# # calculating RSE
# df['povertyRSE'] = (df['povertySE']/df['poverty_rate'])*100

# meanSE = df['povertySE'].mean()

# # Removing tracts with non-dependable values
# df.loc[(df['povertyRSE'] >= 50) & (df['povertySE'] >= meanSE), 'poverty_rate']=np.nan

# df['poverty_rank'] = df['poverty_rate'].rank(pct = True, na_option = 'keep')*100
# df = df.rename(columns = {'poverty_rate':'poverty_score'})

In [39]:
#Joining and calculating demographic indicators
combined = df.join(r.set_index('FIPS'), how = 'left', on = 'FIPS', rsuffix = 'r')

demographicdata = combined[['NAME','FIPS','total_pop','poverty_score','poverty_rank', 
                            'no_hs', 'edu_rank',
                           'lin_score', 'lin_rank',
                            'unemploy_score','unemploy_rank', 
                            'nonwhitePerc', 'nonwhite_rank'
                           ]]

demographicdata.to_csv('data/COdemographicdata.csv')

NameError: name 'df' is not defined

In [40]:
# only for colorado?
demographic = pd.read_csv("data/COdemographicdata.csv", dtype={'FIPS': object})

## 19. Asthma  <a id='asthma'></a>

For Colorado only: data from https://data-cdphe.opendata.arcgis.com/datasets/asthma-hospitalization-rate-census-tracts

In [41]:
a_df = pd.read_csv("data/Asthma_Hospitalization_Rate_Census_Tracts.csv",dtype={'TRACT_FIPS': object})

In [42]:
# moe(a_df, "ASTHMA_ADJRATE", "ASTHMA_U95CI")
# a_df["se"] = a_df["moe"]/1.96
# a_df["rse"] = a_df["se"]/a_df["ASTHMA_ADJRATE"]
# mean_se = np.mean(a_df["se"])
# for i in a_df.index:
#     if a_df["rse"][i] > 0.5 or a_df["se"][i] < mean_se:
#         a_df.drop(i, inplace = True)

In [43]:
a_df["As_rank"] = a_df["ASTHMA_ADJRATE"].rank(pct = True, na_option = 'keep')*100

asthma = a_df[['TRACT_FIPS', 'ASTHMA_ADJRATE', 'As_rank']]
asthma = asthma.rename( columns = {
    "TRACT_FIPS": "FIPS_tract_id",
    "ASTHMA_ADJRATE" : "asthma_score",
    "As_rank": "asthma_rank"
})

## 20. Cardiovascular Disease  <a id='hd'></a>

For Colorado only: data from https://data-cdphe.opendata.arcgis.com/datasets/heart-disease-in-adults-cdphe-community-level-estimates-census-tracts

In [44]:
hd_df = pd.read_csv("data/Heart_Disease_in_Adults__CDPHE_Community_Level_Estimates_Census_Tracts.csv",dtype={'Census_Tract_FIPS': object})

In [45]:
# hd_df["upper_CI"] = hd_df["HeartDisease_Estimate_Confidence_Interval"].str.extract(r'(\d\.?\d?$)').astype(float)
# moe(hd_df, "HeartDisease_Census_Tract_Estimate", "upper_CI")
# hd_df["se"] = hd_df["moe"]/1.96
# hd_df["rse"] = hd_df["se"]/hd_df["HeartDisease_Census_Tract_Estimate"]
# mean_se = np.mean(hd_df["se"])
# for i in hd_df.index:
#     if hd_df["rse"][i] > 0.5 or hd_df["se"][i] < mean_se:
#         hd_df.drop(i, inplace = True)

In [46]:
hd_df["HD_rank"] = hd_df["HeartDisease_Census_Tract_Estimate"].rank(pct = True, na_option = 'keep')*100

heart_disease = hd_df[['Census_Tract_FIPS', 'HeartDisease_Census_Tract_Estimate', 'HD_rank']]
heart_disease = heart_disease.rename(columns = {
    "Census_Tract_FIPS": "FIPS_tract_id",
    "HeartDisease_Census_Tract_Estimate": "HD_score",
})

## 21. Low Birth Weight Infants <a id='lbw'></a>

For Colorado only: data from https://data-cdphe.opendata.arcgis.com/datasets/7673fa687a7a43b29c2f602db4d33cd9_9

In [47]:
lb_df = pd.read_csv("data/Low_Weight_Birth_Rate_Census_Tracts.csv",dtype={'TRACT_FIPS': object})

In [48]:
# moe(lb_df, "LWB_ADJRATE", "LWB_U95CI")
# lb_df["se"] = lb_df["moe"]/1.96
# lb_df["rse"] = lb_df["se"]/lb_df["LWB_ADJRATE"]
# mean_se = np.mean(lb_df["se"])
# for i in lb_df.index:
#     if lb_df["rse"][i] > 0.5 or lb_df["se"][i] < mean_se:
#         lb_df.drop(i, inplace = True)

In [49]:
lb_df["LB_rank"] = lb_df["LWB_ADJRATE"].rank(pct = True, na_option = 'keep')*100

low_birthweight = lb_df[['TRACT_FIPS', 'LWB_ADJRATE', 'LB_rank']]
low_bw = low_birthweight.rename(columns = {
    "TRACT_FIPS" : "FIPS_tract_id",
    "LWB_ADJRATE": "LB_score"
})

In [50]:
# Merge the 3 Sensitive Population Indicators into one dataframe
healthdata = pd.merge(asthma, low_bw,
                      how = 'left', 
                      left_on = 'FIPS_tract_id',
                      right_on = 'FIPS_tract_id', validate="1:1")
healthdata = pd.merge(healthdata, heart_disease, 
                      how = 'left',
                      left_on = "FIPS_tract_id",
                      right_on = 'FIPS_tract_id',
                     validate = "1:1")

## Final Table for Colorado <a id='final'></a>

In [51]:
#using lead dataframe as base table with reference columns 'NAME', 'state', 'FIPS_tract_id' that we need for final df
all_states = all_states_lead

In [52]:
pd.set_option('display.max_columns', None)

# change data type of FIPS cols
all_states = all_states.astype({'FIPS_tract_id' : str})
demographic = demographic.astype({'FIPS' : str})

In [53]:
# create colorado only dataset from all_state
co = all_states[all_states['state']=='08']

In [54]:
co_final = pd.merge(co, demographic, how = 'left',left_on = 'FIPS_tract_id',right_on = 'FIPS', validate="1:1")

In [None]:
#Oil
oil_df.astype({'FIPS_tract_id' : str})
co_final = pd.merge(co_final, oil_df, how = 'left', left_on = 'FIPS_tract_id',right_on = 'FIPS_tract_id', validate="1:1")

In [55]:
# Housing Burden
houseburden.astype({'geoid' : str})
houseco = houseburden[houseburden['st']=='08']
co_final = pd.merge(co_final, houseco, how = 'left', left_on = 'FIPS_tract_id',right_on = 'geoid', validate="1:1")

In [56]:
#Ozone
ozone_df.reset_index(inplace = True)
ozone_df.astype({'FIPS_tract_id' : str})
co_final = pd.merge(co_final, ozone_df, how = 'left', left_on = 'FIPS_tract_id',right_on = 'FIPS_tract_id', validate="1:1")

In [57]:
#Diesel
co_final = pd.merge(co_final, diesel_for_combine.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract', validate="1:1")

#Air Toxics
co_final = pd.merge(co_final, toxics_for_combine.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract', validate="1:1")

#Cancer
# all_states = pd.merge(all_states, cancer_for_combine, how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract', validate="1:1")

#Pm 2.5
co_final = pd.merge(co_final, pm25_df.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'FIPS_tract_id', validate="1:1")

# Variety of environmental exposures
co_final = pd.merge(co_final, ejscreen_indicators.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract_ID', validate="1:1")

#health data
co_final = pd.merge(co_final, healthdata.astype(str), how = 'left', left_on = 'FIPS_tract_id', right_on = 'FIPS_tract_id', validate="1:1")

In [58]:
# Socioeconomic Factor Indicators
co_final['demographic_score'] = co_final[['poverty_rank',
                                        'edu_rank',
                                        'lin_rank',
                                        'unemploy_rank',
                                        'nonwhite_rank', # to be added 
                                        'HouseBurden_rank']].mean(axis = 1)

# Exposure Indicators
cols = ['ozone_rank',
        'diesel_rank',
        'toxics_rank', 
        'PM25_rank', 
        'PTRAF_rank']
co_final['exposure_score'] = co_final[cols].apply(pd.to_numeric, errors='coerce', axis=1).mean(axis = 1)

# Environmental Effects Indicators
cols = ['lead_rank',
        'oil_rank',
        'PTSDF_rank',
       'PRMP_rank',
       'PWDIS_rank',
       'PNPL_rank']
co_final['effects_score'] = co_final[cols].apply(pd.to_numeric, errors='coerce', axis=1).mean(axis = 1)

#Calculating Sensitive Population Score
cols = ['asthma_rank', 'LB_rank', 'HD_rank']
co_final['sensitive_score'] = co_final[cols].apply(pd.to_numeric, errors='coerce', axis=1).mean(axis = 1)

KeyError: "['lead_rank'] not in index"

In [59]:
#Calculating Pollution Burden 
co_final['Pollution_Burden'] = co_final['exposure_score']+(.5*co_final['effects_score'])/1.5

#Calculating Pop Characteristics
co_final['Pop_Char'] = (co_final['demographic_score']+co_final['sensitive_score'])/2

KeyError: 'effects_score'

In [60]:
#Calculating Final Score out of 10, then Rank
co_final['FinalScore'] = ((co_final['Pollution_Burden']/co_final['Pollution_Burden'].max() * 10) *
                     (co_final['Pop_Char']/co_final['Pop_Char'].max() *10))

# The number of indicators that had null values when generating this score to help account for nulls
co_final['missing_values'] = co_final.isnull().sum(axis=1)

# Eliminating Tracts with No Population
co_final['FinalScore'].loc[co_final['total_pop']==0] = np.nan
co_final['FinalScore'].loc[co_final['missing_values'] >= 4] = np.nan

co_final['Final_Rank'] = co_final['FinalScore'].rank(method = 'average', na_option='keep', pct=True)*100

KeyError: 'Pollution_Burden'

In [61]:
co_final.head()

Unnamed: 0.1,Unnamed: 0_x,total_houses,total_houses_M,1970-1979,1970-1979_M,1960-1969,1960-1969_M,1950-1959,1950-1959_M,1940-1949,1940-1949_M,PRE_1940,PRE_1940_M,NAME_x,county,state,tract,FIPS_x,1940-1959,1960-1979,FIPS_tract_id,lead_score,numerator_moe,se,rse,stateSE,Unnamed: 0_y,NAME_y,FIPS_y,total_pop,poverty_score,poverty_rank,no_hs,edu_rank,lin_score,lin_rank,unemploy_score,unemploy_rank,nonwhitePerc,nonwhite_rank,geoid,st,name,HouseBurden_score,HouseBurden_rank,ozone_score,ozone_rank,Tract_x,diesel_score,diesel_rank,Tract_y,toxics_score,toxics_rank,PM25_score,PM25_rank,Unnamed: 0,Tract_ID,Avg_PTRAF,Avg_PTSDF,Avg_PRMP,Avg_PWDIS,Avg_PNPL,State_ID,PTRAF_rank,PTSDF_rank,PRMP_rank,PWDIS_rank,PNPL_rank,asthma_score,asthma_rank,LB_score,LB_rank,HD_score,HD_rank,demographic_score,exposure_score
0,1,2037,129,409,110,47,27,108,53,88,32,524,104,"Census Tract 9646, Delta County, Colorado",29,8,964600,8029964600,196,456,8029964600,0.234207,165.764894,0.081377,34.745723,0,1,"Census Tract 9646, Delta County, Colorado",8029964600,3527.0,34.902183,72.639225,4.4,39.78671,1.4,25.945017,3.572441,80.17094,10.7,10.46042,8029964600,8,"Census Tract 9646, Delta County, Colorado",0.122924,48.939779,48.664195,6.32506,8029964600,0.026819495,10.413476263399694,8029964600,0.0141419315518567,10.09946442234124,4.48901560109,8.486789431545235,11973,8029964600,0.0,0.0120939352687225,0.0298778494341673,2.2912835811505644e-06,0.0210118982466883,8,1.120896717373899,6.965572457966374,4.803843074459568,14.171337069655726,32.826261008807045,181.58,4.26731078904992,11.03,94.30962343096236,4.5,92.2983870967742,46.323682,7.289137
1,2,1901,164,344,111,153,80,51,35,77,54,195,78,"Census Tract 9647, Delta County, Colorado",29,8,964700,8029964700,128,497,8029964700,0.119621,170.13524,0.089498,74.817608,0,2,"Census Tract 9647, Delta County, Colorado",8029964700,3815.0,35.989515,73.688458,10.2,70.016407,2.2,38.487973,1.625164,29.059829,16.6,27.66559,8029964700,8,"Census Tract 9647, Delta County, Colorado",0.167164,73.536896,49.591225,11.208967,8029964700,0.021451869,7.8866768759571215,8029964700,0.0152384956915561,12.700841622035194,4.9474284153,14.491593274619696,11974,8029964700,0.0448395088764211,0.018226080600503,0.0632143960516814,3.9804797203147104e-05,0.0127335192756538,8,2.802241793434748,10.968775020016013,11.208967173738992,21.77742193755004,8.246597277822257,371.75,16.344605475040257,8.16,69.163179916318,4.4,91.41129032258064,52.075859,9.818064
2,3,2338,114,409,170,160,79,66,63,39,45,145,105,"Census Tract 9649, Delta County, Colorado",29,8,964900,8029964900,105,569,8029964900,0.080954,228.385639,0.097684,120.666581,0,3,"Census Tract 9649, Delta County, Colorado",8029964900,5683.0,48.161182,88.539144,6.9,54.675964,2.9,48.496564,3.061763,70.854701,25.0,51.009693,8029964900,8,"Census Tract 9649, Delta County, Colorado",0.190698,83.79983,49.091474,8.88711,8029964900,0.061586285,21.822358346094948,8029964900,0.0196131130799592,25.325172149961745,5.0344636612,16.33306645316253,11976,8029964900,6.3800145464538405,0.0154548827417925,0.178903867958642,0.0001536542190577,0.0130258046773617,8,6.885508406725379,9.527622097678144,32.42594075260208,27.702161729383512,8.646917534027223,448.83,24.55716586151369,6.84,48.32635983263599,4.7,93.50806451612902,66.229316,15.850643
3,4,1394,99,310,109,85,47,42,34,109,65,161,75,"Census Tract 9648, Delta County, Colorado",29,8,964800,8029964800,151,395,8029964800,0.147783,158.417171,0.113642,76.897806,0,4,"Census Tract 9648, Delta County, Colorado",8029964800,4000.0,35.55,73.284907,17.9,86.792453,11.1,83.762887,2.1,43.846154,31.0,62.520194,8029964800,8,"Census Tract 9648, Delta County, Colorado",0.149606,66.412214,49.495988,10.728583,8029964800,0.050748453,19.98468606431853,8029964800,0.0199367086797322,25.860749808722268,5.21404617486,19.45556445156125,11975,8029964800,7.232443159742508,0.0179592186703014,0.3592918090394496,0.0159560351802621,0.0152083731168435,8,7.44595676541233,10.808646917534029,55.72457966373099,61.72938350680545,16.253002401921535,203.01,5.072463768115942,7.83,65.06276150627615,4.2,88.99193548387096,69.436468,16.695108
4,5,2477,152,434,142,96,74,177,84,4,6,420,161,"Census Tract 9650, Delta County, Colorado",29,8,965000,8029965000,181,530,8029965000,0.163839,242.183814,0.097773,59.676173,0,5,"Census Tract 9650, Delta County, Colorado",8029965000,4707.0,27.979605,58.11138,5.3,46.30845,1.8,32.216495,1.848311,36.837607,13.0,16.235864,8029965000,8,"Census Tract 9650, Delta County, Colorado",0.13822,58.778626,48.775071,7.125701,8029965000,0.017949119,5.819295558958652,8029965000,0.0140027362748026,9.793420045906656,4.68310054645,11.128903122498,11977,8029965000,0.0,0.0125856397512276,0.0357313840684724,5.639489452570328e-06,0.0178009793769434,8,1.120896717373899,7.686148919135308,5.844675740592474,15.77261809447558,28.74299439551641,322.14,12.640901771336557,5.7,27.280334728033477,3.5,76.04838709677419,41.414737,6.997643


In [77]:
cols = co_final.columns
cols

Index(['NAME_x', 'state', 'FIPS_tract_id', 'lead_score', 'lead_rank',
       'Unnamed: 0_x', 'NAME_y', 'FIPS', 'total_pop', 'poverty_score',
       'poverty_rank', 'no_hs', 'edu_rank', 'lin_score', 'lin_rank',
       'unemploy_score', 'unemploy_rank', 'nonwhitePerc', 'nonwhite_rank',
       'geoid', 'st', 'name', 'HouseBurden_score', 'HouseBurden_rank',
       'ozone_score', 'ozone_rank', 'Tract_x', 'diesel_score', 'diesel_rank',
       'Tract_y', 'toxics_score', 'toxics_rank', 'PM25_score', 'PM25_rank',
       'Unnamed: 0_y', 'Tract_ID', 'Avg_PTRAF', 'Avg_PTSDF', 'Avg_PRMP',
       'Avg_PWDIS', 'Avg_PNPL', 'State_ID', 'PTRAF_rank', 'PTSDF_rank',
       'PRMP_rank', 'PWDIS_rank', 'PNPL_rank', 'asthma_score', 'asthma_rank',
       'LB_score', 'LB_rank', 'HD_score', 'HD_rank', 'demographic_score',
       'exposure_score', 'effects_score', 'sensitive_score',
       'Pollution_Burden', 'Pop_Char', 'FinalScore', 'missing_values',
       'Final_Rank'],
      dtype='object')

In [109]:
co_final.drop(columns=['Unnamed: 0_x',
                       'NAME_y',
                      'FIPS',
                      'geoid',
                      'st',
                      'name',
                      'Tract_x',
                      'Tract_y',
                      'Unnamed: 0_y',
                      'Tract_ID',
                      'State_ID'], inplace=True)
#co_final.drop(columns = ['Unnamed: 0_x', 'NAME_y',"FIPS", "geoid"], inplace = True)

KeyError: "['Unnamed: 0_x' 'NAME_y' 'FIPS' 'geoid' 'st' 'name' 'Tract_x' 'Tract_y'\n 'Unnamed: 0_y' 'Tract_ID' 'State_ID'] not found in axis"

In [79]:


co_final = co_final.rename(columns = {
    "NAME_x": "NAME",
    "no_hs" : "edu_score"
})

In [None]:
co_final['county_name'] = co_final['NAME'].apply(lambda x: x.split(",")[1])

In [80]:
#TODO: round datapoints to 2nd decimal point

In [81]:
co_final

Unnamed: 0,NAME,state,FIPS_tract_id,lead_score,lead_rank,total_pop,poverty_score,poverty_rank,edu_score,edu_rank,lin_score,lin_rank,unemploy_score,unemploy_rank,nonwhitePerc,nonwhite_rank,HouseBurden_score,HouseBurden_rank,ozone_score,ozone_rank,diesel_score,diesel_rank,toxics_score,toxics_rank,PM25_score,PM25_rank,Avg_PTRAF,Avg_PTSDF,Avg_PRMP,Avg_PWDIS,Avg_PNPL,PTRAF_rank,PTSDF_rank,PRMP_rank,PWDIS_rank,PNPL_rank,asthma_score,asthma_rank,LB_score,LB_rank,HD_score,HD_rank,demographic_score,exposure_score,effects_score,sensitive_score,Pollution_Burden,Pop_Char,FinalScore,missing_values,Final_Rank
0,"Census Tract 9646, Delta County, Colorado",08,08029964600,0.234207,78.866397,3527.0,34.902183,72.639225,4.4,39.786710,1.4,25.945017,3.572441,80.170940,10.7,10.460420,0.122924,48.939779,48.664195,6.325060,0.026819495,10.413476263399694,0.014141931551856713,10.099464422341239,4.48901560109,8.486789431545235,0.0,0.012093935268722524,0.029877849434167342,2.2912835811505644e-06,0.021011898246688303,1.120896717373899,6.965572457966374,4.803843074459568,14.171337069655726,32.826261008807045,181.58,4.26731078904992,11.03,94.30962343096235,4.5,92.29838709677419,46.323682,7.289137,27.526682,63.625107,16.464698,54.974395,8.086725,0,6.416667
1,"Census Tract 9647, Delta County, Colorado",08,08029964700,0.119621,61.700405,3815.0,35.989515,73.688458,10.2,70.016407,2.2,38.487973,1.625164,29.059829,16.6,27.665590,0.167164,73.536896,49.591225,11.208967,0.021451869,7.8866768759571215,0.015238495691556145,12.700841622035194,4.9474284153,14.491593274619696,0.044839508876421116,0.01822608060050309,0.06321439605168144,3.9804797203147104e-05,0.012733519275653818,2.802241793434748,10.968775020016013,11.208967173738992,21.77742193755004,8.246597277822257,371.75,16.344605475040257,8.16,69.163179916318,4.4,91.41129032258064,52.075859,9.818064,22.780433,58.973025,17.411542,55.524442,8.637337,0,7.416667
2,"Census Tract 9649, Delta County, Colorado",08,08029964900,0.080954,52.388664,5683.0,48.161182,88.539144,6.9,54.675964,2.9,48.496564,3.061763,70.854701,25.0,51.009693,0.190698,83.799830,49.091474,8.887110,0.061586285,21.822358346094948,0.019613113079959284,25.325172149961745,5.0344636612,16.33306645316253,6.3800145464538405,0.015454882741792537,0.17890386795864202,0.00015365421905775357,0.013025804677361724,6.885508406725379,9.527622097678144,32.42594075260208,27.702161729383512,8.646917534027223,448.83,24.55716586151369,6.84,48.32635983263599,4.7,93.50806451612902,66.229316,15.850643,26.138261,55.463863,24.563397,60.846590,13.353131,0,17.250000
3,"Census Tract 9648, Delta County, Colorado",08,08029964800,0.147783,66.720648,4000.0,35.550000,73.284907,17.9,86.792453,11.1,83.762887,2.100000,43.846154,31.0,62.520194,0.149606,66.412214,49.495988,10.728583,0.050748453,19.98468606431853,0.019936708679732216,25.860749808722268,5.21404617486,19.45556445156125,7.232443159742508,0.017959218670301477,0.3592918090394496,0.01595603518026216,0.015208373116843503,7.44595676541233,10.808646917534027,55.72457966373099,61.72938350680545,16.253002401921535,203.01,5.072463768115942,7.83,65.06276150627615,4.2,88.99193548387096,69.436468,16.695108,42.247252,53.042387,30.777525,61.239427,16.839271,0,25.166667
4,"Census Tract 9650, Delta County, Colorado",08,08029965000,0.163839,68.744939,4707.0,27.979605,58.111380,5.3,46.308450,1.8,32.216495,1.848311,36.837607,13.0,16.235864,0.138220,58.778626,48.775071,7.125701,0.017949119,5.819295558958652,0.014002736274802644,9.793420045906656,4.68310054645,11.128903122497999,0.0,0.012585639751227609,0.035731384068472496,5.639489452570328e-06,0.017800979376943468,1.120896717373899,7.686148919135308,5.844675740592474,15.772618094475579,28.74299439551641,322.14,12.640901771336555,5.7,27.280334728033473,3.5,76.04838709677419,41.414737,6.997643,25.358275,38.656541,15.450402,40.035639,5.526434,0,1.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244,"Census Tract 2, Summit County, Colorado",08,08117000200,0.033428,26.153846,7561.0,22.100251,45.439871,7.4,57.629204,14.7,89.733677,2.063219,42.478632,36.6,70.395800,0.177824,79.050042,51.522976,28.182546,0.044904368,17.993874425727412,0.011942411695752211,4.284621270084163,3.70491278689,1.2810248198558847,104.3431385133304,0.015355185773634096,0.02035366658217247,9.375247339414476e-05,0.0265865599212114,26.501200960768617,9.44755804643715,2.8823058446757406,25.540432345876702,35.22818254603683,263.13,7.890499194847021,9.54,85.89958158995816,1.2,5.201612903225807,64.121204,15.648654,19.850465,32.997231,22.265475,48.559218,9.659666,0,9.916667
1245,"Census Tract 1, Summit County, Colorado",08,08117000100,0.042711,31.578947,8408.0,25.618459,52.865214,11.9,74.528302,12.9,87.371134,1.034729,12.905983,21.0,40.306947,0.234432,93.723494,50.037261,12.970376,0.047638635,18.98928024502297,0.012384788832456072,5.279265493496557,3.78334032787,2.241793434747798,19.449840020160252,0.01289484362429214,0.02771590552496926,0.004127309705890178,0.029933265409871512,12.169735788630904,8.00640512409928,4.083266613290633,49.31945556445156,36.42914331465173,90.14,1.529790660225443,11.4,95.98326359832636,4.8,94.15322580645162,60.283512,10.330090,25.883444,63.888760,18.957905,62.086136,10.515827,0,12.083333
1246,"Census Tract 147, Clear Creek County, Colorado",08,08019014700,0.063248,44.210526,5114.0,12.514666,19.451170,0.0,0.533224,2.0,34.922680,1.134142,15.213675,7.1,3.311793,0.059211,6.615776,54.119733,89.751801,0.04135542,16.84532924961715,0.01288565032737857,7.03902065799541,4.15944923497,4.00320256204964,44.611063461044225,0.043597895689086286,0.04338778823234617,0.0020196084157539673,0.08678818910848106,17.934347477982385,17.293835068054445,7.686148919135308,43.39471577261809,58.60688550840673,539.0,37.11755233494364,5.88,30.83682008368201,8.4,99.43548387096774,13.341387,27.114740,34.238422,55.796619,38.527548,34.569003,11.899170,0,14.583333
1247,"Census Tract 148, Clear Creek County, Colorado",08,08019014800,0.443381,96.518219,1667.0,33.653269,70.137207,2.1,17.022149,0.0,1.288660,1.199760,17.692308,12.7,15.549273,0.112994,42.578456,54.186741,90.952762,0.080290988,25.114854517611025,0.01333840739331429,7.651109410864574,4.28366076503,5.604483586869495,1510.1069289457,0.0396001575175447,0.0395165290220553,0.0523444379651759,0.30613124956942606,90.9527622097678,16.573258606885506,6.5652522017614094,74.77982385908727,93.75500400320256,185.83,4.428341384863124,8.96,79.8326359832636,7.7,98.75,27.378009,44.055194,57.638311,61.003659,63.267965,44.190834,24.978969,0,44.666667


In [82]:
co_final.to_csv('data/Colorado_final.csv')

## Creating tables to input into database

In [119]:
co_final = pd.read_csv('data/Colorado_final.csv')
ct_df = pd.read_csv('data/Colorado_Census_Tract_Boundaries.csv')

In [120]:
co_final.drop(columns = ['Unnamed: 0', "state"], inplace = True)

In [121]:
co_final.head()

Unnamed: 0,NAME,FIPS_tract_id,lead_score,lead_rank,total_pop,poverty_score,poverty_rank,edu_score,edu_rank,lin_score,lin_rank,unemploy_score,unemploy_rank,nonwhitePerc,nonwhite_rank,HouseBurden_score,HouseBurden_rank,ozone_score,ozone_rank,diesel_score,diesel_rank,toxics_score,toxics_rank,PM25_score,PM25_rank,Avg_PTRAF,Avg_PTSDF,Avg_PRMP,Avg_PWDIS,Avg_PNPL,PTRAF_rank,PTSDF_rank,PRMP_rank,PWDIS_rank,PNPL_rank,asthma_score,asthma_rank,LB_score,LB_rank,HD_score,HD_rank,demographic_score,exposure_score,effects_score,sensitive_score,Pollution_Burden,Pop_Char,FinalScore,missing_values,Final_Rank
0,"Census Tract 9646, Delta County, Colorado",8029964600,0.234207,78.866397,3527.0,34.902183,72.639225,4.4,39.78671,1.4,25.945017,3.572441,80.17094,10.7,10.46042,0.122924,48.939779,48.664195,6.32506,0.026819,10.413476,0.014142,10.099464,4.489016,8.486789,0.0,0.012094,0.029878,2e-06,0.021012,1.120897,6.965572,4.803843,14.171337,32.826261,181.58,4.267311,11.03,94.309623,4.5,92.298387,46.323682,7.289137,27.526682,63.625107,16.464698,54.974395,8.086725,0,6.416667
1,"Census Tract 9647, Delta County, Colorado",8029964700,0.119621,61.700405,3815.0,35.989515,73.688458,10.2,70.016407,2.2,38.487973,1.625164,29.059829,16.6,27.66559,0.167164,73.536896,49.591225,11.208967,0.021452,7.886677,0.015238,12.700842,4.947428,14.491593,0.04484,0.018226,0.063214,4e-05,0.012734,2.802242,10.968775,11.208967,21.777422,8.246597,371.75,16.344605,8.16,69.16318,4.4,91.41129,52.075859,9.818064,22.780433,58.973025,17.411542,55.524442,8.637337,0,7.416667
2,"Census Tract 9649, Delta County, Colorado",8029964900,0.080954,52.388664,5683.0,48.161182,88.539144,6.9,54.675964,2.9,48.496564,3.061763,70.854701,25.0,51.009693,0.190698,83.79983,49.091474,8.88711,0.061586,21.822358,0.019613,25.325172,5.034464,16.333066,6.380015,0.015455,0.178904,0.000154,0.013026,6.885508,9.527622,32.425941,27.702162,8.646918,448.83,24.557166,6.84,48.32636,4.7,93.508065,66.229316,15.850643,26.138261,55.463863,24.563397,60.84659,13.353131,0,17.25
3,"Census Tract 9648, Delta County, Colorado",8029964800,0.147783,66.720648,4000.0,35.55,73.284907,17.9,86.792453,11.1,83.762887,2.1,43.846154,31.0,62.520194,0.149606,66.412214,49.495988,10.728583,0.050748,19.984686,0.019937,25.86075,5.214046,19.455564,7.232443,0.017959,0.359292,0.015956,0.015208,7.445957,10.808647,55.72458,61.729384,16.253002,203.01,5.072464,7.83,65.062762,4.2,88.991935,69.436468,16.695108,42.247252,53.042387,30.777525,61.239427,16.839271,0,25.166667
4,"Census Tract 9650, Delta County, Colorado",8029965000,0.163839,68.744939,4707.0,27.979605,58.11138,5.3,46.30845,1.8,32.216495,1.848311,36.837607,13.0,16.235864,0.13822,58.778626,48.775071,7.125701,0.017949,5.819296,0.014003,9.79342,4.683101,11.128903,0.0,0.012586,0.035731,6e-06,0.017801,1.120897,7.686149,5.844676,15.772618,28.742994,322.14,12.640902,5.7,27.280335,3.5,76.048387,41.414737,6.997643,25.358275,38.656541,15.450402,40.035639,5.526434,0,1.916667


In [70]:
ct_data = co_final[["NAME", "FIPS_tract_id"]]

In [71]:
ct_data.shape

(1249, 2)

In [73]:
ct_df.shape

(1249, 2)

asserting that our dataframe has the same amount of census tracts as the total census tracts in CO

In [76]:
ct_data.set_index("FIPS_tract_id")

Unnamed: 0_level_0,NAME
FIPS_tract_id,Unnamed: 1_level_1
8029964600,"Census Tract 9646, Delta County, Colorado"
8029964700,"Census Tract 9647, Delta County, Colorado"
8029964900,"Census Tract 9649, Delta County, Colorado"
8029964800,"Census Tract 9648, Delta County, Colorado"
8029965000,"Census Tract 9650, Delta County, Colorado"
...,...
8117000200,"Census Tract 2, Summit County, Colorado"
8117000100,"Census Tract 1, Summit County, Colorado"
8019014700,"Census Tract 147, Clear Creek County, Colorado"
8019014800,"Census Tract 148, Clear Creek County, Colorado"


In [122]:
co_final.head()

Unnamed: 0,NAME,FIPS_tract_id,lead_score,lead_rank,total_pop,poverty_score,poverty_rank,edu_score,edu_rank,lin_score,lin_rank,unemploy_score,unemploy_rank,nonwhitePerc,nonwhite_rank,HouseBurden_score,HouseBurden_rank,ozone_score,ozone_rank,diesel_score,diesel_rank,toxics_score,toxics_rank,PM25_score,PM25_rank,Avg_PTRAF,Avg_PTSDF,Avg_PRMP,Avg_PWDIS,Avg_PNPL,PTRAF_rank,PTSDF_rank,PRMP_rank,PWDIS_rank,PNPL_rank,asthma_score,asthma_rank,LB_score,LB_rank,HD_score,HD_rank,demographic_score,exposure_score,effects_score,sensitive_score,Pollution_Burden,Pop_Char,FinalScore,missing_values,Final_Rank
0,"Census Tract 9646, Delta County, Colorado",8029964600,0.234207,78.866397,3527.0,34.902183,72.639225,4.4,39.78671,1.4,25.945017,3.572441,80.17094,10.7,10.46042,0.122924,48.939779,48.664195,6.32506,0.026819,10.413476,0.014142,10.099464,4.489016,8.486789,0.0,0.012094,0.029878,2e-06,0.021012,1.120897,6.965572,4.803843,14.171337,32.826261,181.58,4.267311,11.03,94.309623,4.5,92.298387,46.323682,7.289137,27.526682,63.625107,16.464698,54.974395,8.086725,0,6.416667
1,"Census Tract 9647, Delta County, Colorado",8029964700,0.119621,61.700405,3815.0,35.989515,73.688458,10.2,70.016407,2.2,38.487973,1.625164,29.059829,16.6,27.66559,0.167164,73.536896,49.591225,11.208967,0.021452,7.886677,0.015238,12.700842,4.947428,14.491593,0.04484,0.018226,0.063214,4e-05,0.012734,2.802242,10.968775,11.208967,21.777422,8.246597,371.75,16.344605,8.16,69.16318,4.4,91.41129,52.075859,9.818064,22.780433,58.973025,17.411542,55.524442,8.637337,0,7.416667
2,"Census Tract 9649, Delta County, Colorado",8029964900,0.080954,52.388664,5683.0,48.161182,88.539144,6.9,54.675964,2.9,48.496564,3.061763,70.854701,25.0,51.009693,0.190698,83.79983,49.091474,8.88711,0.061586,21.822358,0.019613,25.325172,5.034464,16.333066,6.380015,0.015455,0.178904,0.000154,0.013026,6.885508,9.527622,32.425941,27.702162,8.646918,448.83,24.557166,6.84,48.32636,4.7,93.508065,66.229316,15.850643,26.138261,55.463863,24.563397,60.84659,13.353131,0,17.25
3,"Census Tract 9648, Delta County, Colorado",8029964800,0.147783,66.720648,4000.0,35.55,73.284907,17.9,86.792453,11.1,83.762887,2.1,43.846154,31.0,62.520194,0.149606,66.412214,49.495988,10.728583,0.050748,19.984686,0.019937,25.86075,5.214046,19.455564,7.232443,0.017959,0.359292,0.015956,0.015208,7.445957,10.808647,55.72458,61.729384,16.253002,203.01,5.072464,7.83,65.062762,4.2,88.991935,69.436468,16.695108,42.247252,53.042387,30.777525,61.239427,16.839271,0,25.166667
4,"Census Tract 9650, Delta County, Colorado",8029965000,0.163839,68.744939,4707.0,27.979605,58.11138,5.3,46.30845,1.8,32.216495,1.848311,36.837607,13.0,16.235864,0.13822,58.778626,48.775071,7.125701,0.017949,5.819296,0.014003,9.79342,4.683101,11.128903,0.0,0.012586,0.035731,6e-06,0.017801,1.120897,7.686149,5.844676,15.772618,28.742994,322.14,12.640902,5.7,27.280335,3.5,76.048387,41.414737,6.997643,25.358275,38.656541,15.450402,40.035639,5.526434,0,1.916667


## Uploading to MySQL

In [123]:
#conn = sqlite3.connect("EJMP.db")

engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
                       .format(user="root",
                               pw="envjust2020",
                               db="CO"))

In [124]:
ct_data.to_sql("CENSUS_TRACT", con = engine, if_exists = "append", index = False)

  result = self._query(query)


In [125]:
co_final.to_sql("CENSUS_TRACT", con = engine, if_exists = "append", index = False)

InternalError: (pymysql.err.InternalError) (1054, "Unknown column 'lead_score' in 'field list'") [SQL: 'INSERT INTO `CENSUS_TRACT` (`NAME`, `FIPS_tract_id`, lead_score, lead_rank, total_pop, poverty_score, poverty_rank, edu_score, edu_rank, lin_score, lin_rank, unemploy_score, unemploy_rank, `nonwhitePerc`, nonwhite_rank, `HouseBurden_score`, `HouseBurden_rank`, ozone_score, ozone_rank, diesel_score, diesel_rank, toxics_score, toxics_rank, `PM25_score`, `PM25_rank`, `Avg_PTRAF`, `Avg_PTSDF`, `Avg_PRMP`, `Avg_PWDIS`, `Avg_PNPL`, `PTRAF_rank`, `PTSDF_rank`, `PRMP_rank`, `PWDIS_rank`, `PNPL_rank`, asthma_score, asthma_rank, `LB_score`, `LB_rank`, `HD_score`, `HD_rank`, demographic_score, exposure_score, effects_score, sensitive_score, `Pollution_Burden`, `Pop_Char`, `FinalScore`, missing_values, `Final_Rank`) VALUES (%(NAME)s, %(FIPS_tract_id)s, %(lead_score)s, %(lead_rank)s, %(total_pop)s, %(poverty_score)s, %(poverty_rank)s, %(edu_score)s, %(edu_rank)s, %(lin_score)s, %(lin_rank)s, %(unemploy_score)s, %(unemploy_rank)s, %(nonwhitePerc)s, %(nonwhite_rank)s, %(HouseBurden_score)s, %(HouseBurden_rank)s, %(ozone_score)s, %(ozone_rank)s, %(diesel_score)s, %(diesel_rank)s, %(toxics_score)s, %(toxics_rank)s, %(PM25_score)s, %(PM25_rank)s, %(Avg_PTRAF)s, %(Avg_PTSDF)s, %(Avg_PRMP)s, %(Avg_PWDIS)s, %(Avg_PNPL)s, %(PTRAF_rank)s, %(PTSDF_rank)s, %(PRMP_rank)s, %(PWDIS_rank)s, %(PNPL_rank)s, %(asthma_score)s, %(asthma_rank)s, %(LB_score)s, %(LB_rank)s, %(HD_score)s, %(HD_rank)s, %(demographic_score)s, %(exposure_score)s, %(effects_score)s, %(sensitive_score)s, %(Pollution_Burden)s, %(Pop_Char)s, %(FinalScore)s, %(missing_values)s, %(Final_Rank)s)'] [parameters: ({'NAME': 'Census Tract 9646, Delta County, Colorado', 'FIPS_tract_id': 8029964600, 'lead_score': 0.2342071674030437, 'lead_rank': 78.8663967611336, 'total_pop': 3527.0, 'poverty_score': 34.90218315849163, 'poverty_rank': 72.63922518159806, 'edu_score': 4.400000000000006, 'edu_rank': 39.78671041837572, 'lin_score': 1.4, 'lin_rank': 25.945017182130588, 'unemploy_score': 3.572441168131556, 'unemploy_rank': 80.17094017094017, 'nonwhitePerc': 10.700000000000003, 'nonwhite_rank': 10.460420032310175, 'HouseBurden_score': 0.12292358803986712, 'HouseBurden_rank': 48.939779474130624, 'ozone_score': 48.66419542479999, 'ozone_rank': 6.325060048038432, 'diesel_score': 0.026819495, 'diesel_rank': 10.413476263399692, 'toxics_score': 0.014141931551856713, 'toxics_rank': 10.09946442234124, 'PM25_score': 4.48901560109, 'PM25_rank': 8.486789431545235, 'Avg_PTRAF': 0.0, 'Avg_PTSDF': 0.012093935268722524, 'Avg_PRMP': 0.029877849434167342, 'Avg_PWDIS': 2.2912835811505644e-06, 'Avg_PNPL': 0.021011898246688303, 'PTRAF_rank': 1.1208967173738993, 'PTSDF_rank': 6.965572457966374, 'PRMP_rank': 4.803843074459568, 'PWDIS_rank': 14.171337069655724, 'PNPL_rank': 32.82626100880705, 'asthma_score': 181.58, 'asthma_rank': 4.26731078904992, 'LB_score': 11.03, 'LB_rank': 94.30962343096235, 'HD_score': 4.5, 'HD_rank': 92.29838709677419, 'demographic_score': 46.323682076580894, 'exposure_score': 7.289137376539701, 'effects_score': 27.52668207440446, 'sensitive_score': 63.625107105595475, 'Pollution_Burden': 16.464698068007856, 'Pop_Char': 54.97439459108818, 'FinalScore': 8.08672511101665, 'missing_values': 0, 'Final_Rank': 6.416666666666666}, {'NAME': 'Census Tract 9647, Delta County, Colorado', 'FIPS_tract_id': 8029964700, 'lead_score': 0.11962125197264598, 'lead_rank': 61.70040485829959, 'total_pop': 3815.0, 'poverty_score': 35.98951507208388, 'poverty_rank': 73.68845843422115, 'edu_score': 10.200000000000003, 'edu_rank': 70.01640689089417, 'lin_score': 2.2, 'lin_rank': 38.48797250859107, 'unemploy_score': 1.6251638269986892, 'unemploy_rank': 29.059829059829067, 'nonwhitePerc': 16.599999999999994, 'nonwhite_rank': 27.665589660743127, 'HouseBurden_score': 0.16716417910447762, 'HouseBurden_rank': 73.53689567430025, 'ozone_score': 49.5912248366, 'ozone_rank': 11.20896717373899, 'diesel_score': 0.021451869, 'diesel_rank': 7.8866768759571215, 'toxics_score': 0.015238495691556143, 'toxics_rank': 12.700841622035195, 'PM25_score': 4.9474284153, 'PM25_rank': 14.491593274619696, 'Avg_PTRAF': 0.044839508876421116, 'Avg_PTSDF': 0.01822608060050309, 'Avg_PRMP': 0.06321439605168144, 'Avg_PWDIS': 3.9804797203147104e-05, 'Avg_PNPL': 0.012733519275653818, 'PTRAF_rank': 2.802241793434748, 'PTSDF_rank': 10.968775020016013, 'PRMP_rank': 11.208967173738992, 'PWDIS_rank': 21.777421937550038, 'PNPL_rank': 8.246597277822257, 'asthma_score': 371.75, 'asthma_rank': 16.344605475040257, 'LB_score': 8.16, 'LB_rank': 69.163179916318, 'HD_score': 4.4, 'HD_rank': 91.41129032258064, 'demographic_score': 52.075858704763135, 'exposure_score': 9.81806414795715, 'effects_score': 22.780433253485374, 'sensitive_score': 58.973025237979634, 'Pollution_Burden': 17.411541899118944, 'Pop_Char': 55.524441971371395, 'FinalScore': 8.637337484567011, 'missing_values': 0, 'Final_Rank': 7.416666666666668}, {'NAME': 'Census Tract 9649, Delta County, Colorado', 'FIPS_tract_id': 8029964900, 'lead_score': 0.08095380667236955, 'lead_rank': 52.38866396761134, 'total_pop': 5683.0, 'poverty_score': 48.16118247404541, 'poverty_rank': 88.53914447134787, 'edu_score': 6.900000000000006, 'edu_rank': 54.67596390484003, 'lin_score': 2.9, 'lin_rank': 48.49656357388315, 'unemploy_score': 3.061763153264121, 'unemploy_rank': 70.85470085470084, 'nonwhitePerc': 25.0, 'nonwhite_rank': 51.00969305331178, 'HouseBurden_score': 0.19069767441860466, 'HouseBurden_rank': 83.79983036471586, 'ozone_score': 49.0914738562, 'ozone_rank': 8.887109687750199, 'diesel_score': 0.061586285, 'diesel_rank': 21.822358346094948, 'toxics_score': 0.019613113079959284, 'toxics_rank': 25.325172149961748, 'PM25_score': 5.0344636612, 'PM25_rank': 16.33306645316253, 'Avg_PTRAF': 6.3800145464538405, 'Avg_PTSDF': 0.015454882741792537, 'Avg_PRMP': 0.17890386795864202, 'Avg_PWDIS': 0.00015365421905775357, 'Avg_PNPL': 0.013025804677361724, 'PTRAF_rank': 6.885508406725379, 'PTSDF_rank': 9.527622097678144, 'PRMP_rank': 32.425940752602074, 'PWDIS_rank': 27.702161729383512, 'PNPL_rank': 8.646917534027223, 'asthma_score': 448.83, 'asthma_rank': 24.557165861513692, 'LB_score': 6.84, 'LB_rank': 48.326359832635994, 'HD_score': 4.7, 'HD_rank': 93.50806451612901, 'demographic_score': 66.22931603713324, 'exposure_score': 15.850643008738961, 'effects_score': 26.138261216260453, 'sensitive_score': 55.463863403426245, 'Pollution_Burden': 24.563396747492448, 'Pop_Char': 60.84658972027974, 'FinalScore': 13.353131302926274, 'missing_values': 0, 'Final_Rank': 17.25}, {'NAME': 'Census Tract 9648, Delta County, Colorado', 'FIPS_tract_id': 8029964800, 'lead_score': 0.14778335724533714, 'lead_rank': 66.72064777327935, 'total_pop': 4000.0, 'poverty_score': 35.55, 'poverty_rank': 73.28490718321225, 'edu_score': 17.900000000000006, 'edu_rank': 86.7924528301887, 'lin_score': 11.1, 'lin_rank': 83.76288659793816, 'unemploy_score': 2.1, 'unemploy_rank': 43.846153846153854, 'nonwhitePerc': 31.0, 'nonwhite_rank': 62.520193861066225, 'HouseBurden_score': 0.14960629921259844, 'HouseBurden_rank': 66.41221374045801, 'ozone_score': 49.4959875817, 'ozone_rank': 10.728582866293037, 'diesel_score': 0.050748453, 'diesel_rank': 19.98468606431853, 'toxics_score': 0.019936708679732216, 'toxics_rank': 25.860749808722268, 'PM25_score': 5.21404617486, 'PM25_rank': 19.45556445156125, 'Avg_PTRAF': 7.232443159742508, 'Avg_PTSDF': 0.017959218670301477, 'Avg_PRMP': 0.3592918090394496, 'Avg_PWDIS': 0.01595603518026216, 'Avg_PNPL': 0.015208373116843503, 'PTRAF_rank': 7.445956765412331, 'PTSDF_rank': 10.808646917534027, 'PRMP_rank': 55.72457966373099, 'PWDIS_rank': 61.72938350680545, 'PNPL_rank': 16.253002401921535, 'asthma_score': 203.01, 'asthma_rank': 5.0724637681159415, 'LB_score': 7.83, 'LB_rank': 65.06276150627615, 'HD_score': 4.2, 'HD_rank': 88.99193548387095, 'demographic_score': 69.4364680098362, 'exposure_score': 16.69510799126149, 'effects_score': 42.24725205265427, 'sensitive_score': 53.042386919421006, 'Pollution_Burden': 30.77752534214624, 'Pop_Char': 61.23942746462861, 'FinalScore': 16.839270508751916, 'missing_values': 0, 'Final_Rank': 25.166666666666664}, {'NAME': 'Census Tract 9650, Delta County, Colorado', 'FIPS_tract_id': 8029965000, 'lead_score': 0.16383932176019378, 'lead_rank': 68.74493927125506, 'total_pop': 4707.0, 'poverty_score': 27.979604843849586, 'poverty_rank': 58.11138014527845, 'edu_score': 5.299999999999997, 'edu_rank': 46.3084495488105, 'lin_score': 1.8, 'lin_rank': 32.21649484536083, 'unemploy_score': 1.8483110261312936, 'unemploy_rank': 36.83760683760684, 'nonwhitePerc': 13.0, 'nonwhite_rank': 16.235864297253634, 'HouseBurden_score': 0.1382198952879581, 'HouseBurden_rank': 58.778625954198475, 'ozone_score': 48.775070588199995, 'ozone_rank': 7.125700560448358, 'diesel_score': 0.017949119, 'diesel_rank': 5.819295558958652, 'toxics_score': 0.014002736274802644, 'toxics_rank': 9.793420045906656, 'PM25_score': 4.6831005464499995, 'PM25_rank': 11.128903122498, 'Avg_PTRAF': 0.0, 'Avg_PTSDF': 0.012585639751227607, 'Avg_PRMP': 0.035731384068472496, 'Avg_PWDIS': 5.639489452570328e-06, 'Avg_PNPL': 0.017800979376943468, 'PTRAF_rank': 1.1208967173738993, 'PTSDF_rank': 7.686148919135307, 'PRMP_rank': 5.844675740592474, 'PWDIS_rank': 15.772618094475579, 'PNPL_rank': 28.74299439551641, 'asthma_score': 322.14, 'asthma_rank': 12.640901771336555, 'LB_score': 5.7, 'LB_rank': 27.280334728033477, 'HD_score': 3.5, 'HD_rank': 76.04838709677419, 'demographic_score': 41.414736938084786, 'exposure_score': 6.997643201037113, 'effects_score': 25.35827528419497, 'sensitive_score': 38.65654119871474, 'Pollution_Burden': 15.450401629102107, 'Pop_Char': 40.035639068399774, 'FinalScore': 5.526434064082168, 'missing_values': 0, 'Final_Rank': 1.9166666666666663}, {'NAME': 'Census Tract 2.01, Denver County, Colorado', 'FIPS_tract_id': 8031000201, 'lead_score': 0.3403755868544601, 'lead_rank': 90.04048582995951, 'total_pop': 3850.0, 'poverty_score': 30.181818181818183, 'poverty_rank': 63.2768361581921, 'edu_score': 22.8, 'edu_rank': 91.18129614438064, 'lin_score': 17.3, 'lin_rank': 92.09621993127148, 'unemploy_score': 3.922077922077922, 'unemploy_rank': 84.1025641025641, 'nonwhitePerc': 57.9, 'nonwhite_rank': 86.95476575121162, 'HouseBurden_score': 0.14012738853503184, 'HouseBurden_rank': 60.22052586938084, 'ozone_score': 52.1818294118, 'ozone_rank': 48.99919935948759, 'diesel_score': 0.6439806539999999, 'diesel_rank': 97.24349157733538, 'toxics_score': 0.05344240864985715, 'toxics_rank': 94.10864575363428, 'PM25_score': 7.9546172131100015, 'PM25_rank': 84.62770216172937, 'Avg_PTRAF': 2268.4568554630405, 'Avg_PTSDF': 2.4745680455736823, 'Avg_PRMP': 1.6568153397733911, 'Avg_PWDIS': 0.013796020179561386, 'Avg_PNPL': 0.6237893441857185, 'PTRAF_rank': 95.83666933546837, 'PTSDF_rank': 92.87429943955163, 'PRMP_rank': 91.11289031224979, 'PWDIS_rank': 60.1281024819856, 'PNPL_rank': 98.87910328262608, 'asthma_score': 1003.01, 'asthma_rank': 85.5877616747182, 'LB_score': 6.39, 'LB_rank': 41.004184100418406, 'HD_score': 2.1, 'HD_rank': 29.677419354838708, 'demographic_score': 79.63870132616678, 'exposure_score': 84.163141637531, 'effects_score': 86.60697626927454, 'sensitive_score': 52.08978837665844, 'Pollution_Burden': 113.03213372728918, 'Pop_Char': 65.86424485141261, 'FinalScore': 66.51354528070854, 'missing_values': 0, 'Final_Rank': 96.08333333333331}, {'NAME': 'Census Tract 6, Denver County, Colorado', 'FIPS_tract_id': 8031000600, 'lead_score': 0.1932681704260652, 'lead_rank': 73.92712550607287, 'total_pop': 3165.0, 'poverty_score': 33.08056872037915, 'poverty_rank': 68.84584342211461, 'edu_score': 12.599999999999996, 'edu_rank': 76.25102543068087, 'lin_score': 14.4, 'lin_rank': 89.26116838487972, 'unemploy_score': 2.6856240126382307, 'unemploy_rank': 61.965811965811966, 'nonwhitePerc': 37.0, 'nonwhite_rank': 70.84006462035542, 'HouseBurden_score': 0.17777777777777778, 'HouseBurden_rank': 78.96522476675148, 'ozone_score': 52.380609150299996, 'ozone_rank': 55.08406725380304, 'diesel_score': 0.7912817459999999, 'diesel_rank': 99.8468606431853, 'toxics_score': 0.05805862004278571, 'toxics_rank': 98.31675592960977, 'PM25_score': 7.7040505464499995, 'PM25_rank': 77.42193755004004, 'Avg_PTRAF': 3325.3530412433447, 'Avg_PTSDF': 1.4958495033336934, 'Avg_PRMP': 1.13581223192084, 'Avg_PWDIS': 0.05828111476518325, 'Avg_PNPL': 0.22626570395181045, 'PTRAF_rank': 98.55884707766212, 'PTSDF_rank': 85.18815052041633, 'PRMP_rank': 84.54763811048839, 'PWDIS_rank': 76.06084867894316, 'PNPL_rank': 90.0720576461169, 'asthma_score': 668.76, 'asthma_rank': 56.28019323671497, 'LB_score': 7.47, 'LB_rank': 59.456066945606686, 'HD_score': 1.0, 'HD_rank': 2.9838709677419355, 'demographic_score': 74.35485643176567, 'exposure_score': 85.84569369086005, 'effects_score': 81.95916409240752, 'sensitive_score': 39.57337705002121, 'Pollution_Burden': 113.16541505499588, 'Pop_Char': 56.96411674089343, 'FinalScore': 57.593509403474776, 'missing_values': 0, 'Final_Rank': 89.33333333333331}, {'NAME': 'Census Tract 8, Denver County, Colorado', 'FIPS_tract_id': 8031000800, 'lead_score': 0.3313111545988258, 'lead_rank': 89.23076923076924, 'total_pop': 1462.0, 'poverty_score': 94.04924760601915, 'poverty_rank': 100.0, 'edu_score': 41.5, 'edu_rank': 99.58982772764557, 'lin_score': 16.0, 'lin_rank': 91.28006872852234, 'unemploy_score': 7.045143638850889, 'unemploy_rank': 98.63247863247864, 'nonwhitePerc': 87.6, 'nonwhite_rank': 99.11147011308563, 'HouseBurden_score': 0.26136363636363635, 'HouseBurden_rank': 96.01357082273113, 'ozone_score': 52.5576045752, 'ozone_rank': 60.44835868694955, 'diesel_score': 0.6963461999999999, 'diesel_rank': 98.69831546707505, 'toxics_score': 0.055264395306857166, 'toxics_rank': 96.17444529456772, 'PM25_score': 7.613393169400001, 'PM25_rank': 74.77982385908727, 'Avg_PTRAF': 3154.97401841172, 'Avg_PTSDF': 2.54939981004455, 'Avg_PRMP': 3.05135677664668, 'Avg_PWDIS': 0.2277493115878, 'Avg_PNPL': 0.436095031618678, 'PTRAF_rank': 98.23859087269815, 'PTSDF_rank': 93.5148118494796, 'PRMP_rank': 97.51801441152922, 'PWDIS_rank': 90.39231385108086, 'PNPL_rank': 96.15692554043235, 'asthma_score': 2248.99, 'asthma_rank': 99.67793880837358, 'LB_score': 7.41, 'LB_rank': 58.451882845188294, 'HD_score': 2.2, 'HD_rank': 33.99193548387097, 'demographic_score': 97.43790267074387, 'exposure_score': 85.66790683607556, 'effects_score': 93.36256697665827, 'sensitive_score': 64.04058571247761, 'Pollution_Burden': 116.78876249496165, 'Pop_Char': 80.73924419161074, 'FinalScore': 84.24501083409197, 'missing_values': 0, 'Final_Rank': 99.91666666666669}  ... displaying 10 of 1249 total bound parameter sets ...  {'NAME': 'Census Tract 148, Clear Creek County, Colorado', 'FIPS_tract_id': 8019014800, 'lead_score': 0.4433809001097695, 'lead_rank': 96.51821862348177, 'total_pop': 1667.0, 'poverty_score': 33.653269346130784, 'poverty_rank': 70.13720742534302, 'edu_score': 2.0999999999999943, 'edu_rank': 17.022149302707135, 'lin_score': 0.0, 'lin_rank': 1.2886597938144329, 'unemploy_score': 1.199760047990402, 'unemploy_rank': 17.69230769230769, 'nonwhitePerc': 12.700000000000005, 'nonwhite_rank': 15.549273021001616, 'HouseBurden_score': 0.11299435028248588, 'HouseBurden_rank': 42.578456318914334, 'ozone_score': 54.18674117649999, 'ozone_rank': 90.9527622097678, 'diesel_score': 0.080290988, 'diesel_rank': 25.114854517611022, 'toxics_score': 0.013338407393314291, 'toxics_rank': 7.651109410864573, 'PM25_score': 4.2836607650300005, 'PM25_rank': 5.604483586869495, 'Avg_PTRAF': 1510.1069289457, 'Avg_PTSDF': 0.0396001575175447, 'Avg_PRMP': 0.0395165290220553, 'Avg_PWDIS': 0.0523444379651759, 'Avg_PNPL': 0.3061312495694261, 'PTRAF_rank': 90.9527622097678, 'PTSDF_rank': 16.573258606885506, 'PRMP_rank': 6.5652522017614094, 'PWDIS_rank': 74.77982385908727, 'PNPL_rank': 93.75500400320257, 'asthma_score': 185.83, 'asthma_rank': 4.428341384863124, 'LB_score': 8.96, 'LB_rank': 79.8326359832636, 'HD_score': 7.7, 'HD_rank': 98.75, 'demographic_score': 27.37800892568137, 'exposure_score': 44.05519438697614, 'effects_score': 57.638311458883706, 'sensitive_score': 61.0036591227089, 'Pollution_Burden': 63.26796487327072, 'Pop_Char': 44.190834024195134, 'FinalScore': 24.978969472253798, 'missing_values': 0, 'Final_Rank': 44.66666666666666}, {'NAME': 'Census Tract 149, Clear Creek County, Colorado', 'FIPS_tract_id': 8019014900, 'lead_score': 0.28979623025980644, 'lead_rank': 85.74898785425101, 'total_pop': 2598.0, 'poverty_score': 24.826789838337174, 'poverty_rank': 50.928167877320426, 'edu_score': 4.799999999999997, 'edu_rank': 42.616899097621, 'lin_score': 1.5, 'lin_rank': 27.49140893470791, 'unemploy_score': 0.8852963818321785, 'unemploy_rank': 9.059829059829061, 'nonwhitePerc': 16.799999999999994, 'nonwhite_rank': 27.98869143780291, 'HouseBurden_score': 0.07630522088353414, 'HouseBurden_rank': 16.20016963528414, 'ozone_score': 52.7558261438, 'ozone_rank': 64.93194555644516, 'diesel_score': 0.042482347, 'diesel_rank': 17.534456355283307, 'toxics_score': 0.011459320046292857, 'toxics_rank': 3.0604437643458304, 'PM25_score': 3.88446259563, 'PM25_rank': 2.5620496397117694, 'Avg_PTRAF': 825.9671322014491, 'Avg_PTSDF': 0.02489595051924168, 'Avg_PRMP': 0.02485910019569456, 'Avg_PWDIS': 0.06297011658311225, 'Avg_PNPL': 0.06618647903465867, 'PTRAF_rank': 80.22417934347477, 'PTSDF_rank': 12.810248198558845, 'PRMP_rank': 3.602882305844677, 'PWDIS_rank': 77.18174539631704, 'PNPL_rank': 48.99919935948759, 'asthma_score': 372.71, 'asthma_rank': 16.505636070853466, 'LB_score': 7.87, 'LB_rank': 65.43933054393305, 'HD_score': 7.6, 'HD_rank': 98.62903225806451, 'demographic_score': 29.04752767376091, 'exposure_score': 33.662614931852175, 'effects_score': 45.66861262289184, 'sensitive_score': 60.19133295761701, 'Pollution_Burden': 48.885485806149454, 'Pop_Char': 44.619430315688966, 'FinalScore': 19.487781908086703, 'missing_values': 0, 'Final_Rank': 31.166666666666664})] (Background on this error at: http://sqlalche.me/e/2j85)

## Additional Code

In [201]:
# co_final[['FIPS_tract_id']=='08029964600'] << ?

In [202]:
state = gpd.read_file('tl_2017_08_tract.shp')
state = state.join(co_final.set_index('FIPS_tract_id'), how = 'left', on = 'GEOID', rsuffix = 'r')

fig, (ax0) = plt.subplots(ncols=1, sharex = True, sharey = True, figsize = (20, 20))

state.plot(ax = ax0, 
               column = 'Final_Rank',  
               legend = True,
               vmin = 0,
               vmax = 100,
               cmap = 'rainbow')

state.to_file('colorado_sample.shp')

DriverError: tl_2017_08_tract.shp: No such file or directory

In [0]:
# all_states['FIPS_tract_id'] = all_states['FIPS_tract_id'].astype(int)
# demographic['FIPS'] = demographic['FIPS'].astype()
# x=all_states.FIPS_tract_id.astype(str)
#Census Data

#I'm trying to change the key column type here - it keeps not working. Some of the data is only for Colorado,
#so the box below can be used to check whether that data is merging correctly. all_states is the first dataset
#of lead in housing


all_states2 = pd.merge(all_states.assign(x=all_states['FIPS_tract_id'].astype(str)), 
                       demographic.assign(x=demographic['FIPS'].astype(str)), 
                       how = 'left',
                       left_on = 'FIPS_tract_id',
                       right_on = 'FIPS', 
                       validate="1:1")

# Housing Burden
all_states2 = pd.merge(all_states2, Housforcombine.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'geoid', validate="1:1")

#Ozone
all_states2 = pd.merge(all_states2, ozone_df.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'FIPS_tract_id', validate="1:1")

#Diesel
all_states2 = pd.merge(all_states2, diesel_for_combine.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract', validate="1:1")

#Air Toxics
all_states2 = pd.merge(all_states2, toxics_for_combine.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract', validate="1:1")

#Cancer
# all_states = pd.merge(all_states, cancer_for_combine, how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract', validate="1:1")

#Pm 2.5
all_states2 = pd.merge(all_states2, pm25_df.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'FIPS_tract_id', validate="1:1")

# Variety of environmental exposures
all_states2 = pd.merge(all_states2, ejscreen_indicators.astype(str), how = 'left', left_on = 'FIPS_tract_id',right_on = 'Tract_ID', validate="1:1")