In [1]:
import pandas as pd

In [4]:
data_path = 'data'

### PREDICTOR: ENVIRONMENTAL QUALITY
**Metric:** Air quality index

The air quality index, or AQI, is an index for reporting daily air quality. It tells how clean or polluted the
air is and what associated health effects might be a concern in the community. The AQI includes five
major air pollutants regulated by the Clean Air Act: ground-level ozone, particle pollution (also known
as particulate matter), carbon monoxide, sulfur dioxide, and nitrogen dioxide. For each of these
pollutants, the Environmental Protection Agency has established national air quality standards to
protect public health. Ground-level ozone and airborne particles are the two pollutants that pose the
greatest threat to human health in the US. Values range from 0 to 500 and are categorized into a six-
point scale: good, moderate, unhealthy for sensitive groups, unhealthy, very unhealthy, and hazardous.

**Source:** https://aqs.epa.gov/aqsweb/airdata/annual_aqi_by_county_2019.zip  from https://aqs.epa.gov/aqsweb/airdata/download_files.html 

**Documentation:** https://aqs.epa.gov/aqsweb/airdata/FileFormats.html#_daily_summary_files 

**Notes:** Calculated the average of scores across each county. 


In [5]:
aqi_df =  pd.read_csv(f'{data_path}/raw/annual_aqi_by_county_2019.csv')

In [6]:
aqi_fips_df = aqi_df.rename(columns={'Median AQI': 'AQI'})

In [7]:
aqi_fips_df['County'] = aqi_df['County'].str.upper()
aqi_fips_df.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,BALDWIN,2019,271,237,34,0,0,0,0,80,52,37,0,0,220,51,0
1,Alabama,CLAY,2019,107,97,10,0,0,0,0,67,50,30,0,0,0,107,0
2,Alabama,COLBERT,2019,263,252,11,0,0,0,0,61,47,37,0,0,228,35,0
3,Alabama,DEKALB,2019,361,324,37,0,0,0,0,90,51,39,0,0,331,30,0
4,Alabama,ELMORE,2019,228,208,20,0,0,0,0,100,50,39,0,0,228,0,0


In [10]:
aqi_fips_df.columns

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'AQI', 'Days CO', 'Days NO2', 'Days Ozone',
       'Days PM2.5', 'Days PM10'],
      dtype='object')

In [11]:
crosswalk_df = pd.read_csv(f'{data_path}/processed/state_county_fips.csv')

In [12]:
crosswalk_df.head()

Unnamed: 0.1,Unnamed: 0,NAME,state,county,County_Name,State_Name,State_Abbreviation,State_Abbreviation_County,FIPS
0,1,"Sebastian County, Arkansas",5,131,SEBASTIAN,Arkansas,AR,AR-SEBASTIAN,5131
1,2,"Sevier County, Arkansas",5,133,SEVIER,Arkansas,AR,AR-SEVIER,5133
2,3,"Sharp County, Arkansas",5,135,SHARP,Arkansas,AR,AR-SHARP,5135
3,4,"Stone County, Arkansas",5,137,STONE,Arkansas,AR,AR-STONE,5137
4,5,"Union County, Arkansas",5,139,UNION,Arkansas,AR,AR-UNION,5139


In [13]:
aqi_df = aqi_df.merge(crosswalk_df, how='left', left_on=['County', 'State'], right_on=['County_Name', 'State_Name'])
aqi_df.sample(5)

Unnamed: 0.1,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,...,Days PM10,Unnamed: 0,NAME,state,county,County_Name,State_Name,State_Abbreviation,State_Abbreviation_County,FIPS
889,Utah,WEBER,2019,308,243,63,2,0,0,0,...,0,1684.0,"Weber County, Utah",49.0,57.0,WEBER,Utah,UT,UT-WEBER,49057.0
953,Washington,SPOKANE,2019,365,301,63,1,0,0,0,...,25,2817.0,"Spokane County, Washington",53.0,63.0,SPOKANE,Washington,WA,WA-SPOKANE,53063.0
820,Tennessee,LAWRENCE,2019,330,306,24,0,0,0,0,...,0,1999.0,"Lawrence County, Tennessee",47.0,99.0,LAWRENCE,Tennessee,TN,TN-LAWRENCE,47099.0
614,North Carolina,HYDE,2019,102,100,2,0,0,0,0,...,0,757.0,"Hyde County, North Carolina",37.0,95.0,HYDE,North Carolina,NC,NC-HYDE,37095.0
673,Ohio,MONTGOMERY,2019,365,229,136,0,0,0,0,...,2,1129.0,"Montgomery County, Ohio",39.0,113.0,MONTGOMERY,Ohio,OH,OH-MONTGOMERY,39113.0


In [14]:
nonas_df = aqi_df[['FIPS', 'AQI']]
# print(len(aqi_fips_df))
nonas_df.dropna(subset=['FIPS'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonas_df.dropna(subset=['FIPS'], inplace=True)


In [15]:
# nonas_df.sample(10)
# nonas_df[['FIPS', 'AQI']].groupby('FIPS').count().sort_values(by='AQI', ascending=True).head()

In [16]:
nonas_df.to_csv(f'{data_path}/processed/fips_air_quality_index.csv', index=False)