In [None]:
import pandas as pd

In [None]:
import os

# Pick the data directory: a local relative folder by default, or the shared
# Google Drive folder (mounted at /drive) when the COLAB_GPU environment
# variable is present.
if 'COLAB_GPU' not in os.environ:
    data_path = 'data'
else:
    from google.colab import drive
    drive.mount('/drive')
    data_path = '/drive/Shared drives/Capstone/notebooks/data'


Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


### PREDICTOR: ENVIRONMENTAL QUALITY
**Metric:** Air quality index

The air quality index, or AQI, is an index for reporting daily air quality. It tells how clean or polluted the
air is and what associated health effects might be a concern in the community. The AQI includes five
major air pollutants regulated by the Clean Air Act: ground-level ozone, particle pollution (also known
as particulate matter), carbon monoxide, sulfur dioxide, and nitrogen dioxide. For each of these
pollutants, the Environmental Protection Agency has established national air quality standards to
protect public health. Ground-level ozone and airborne particles are the two pollutants that pose the
greatest threat to human health in the US. Values range from 0 to 500 and are categorized into a six-point
scale: good, moderate, unhealthy for sensitive groups, unhealthy, very unhealthy, and hazardous.

**Source:** https://aqs.epa.gov/aqsweb/airdata/annual_aqi_by_county_2019.zip  from https://aqs.epa.gov/aqsweb/airdata/download_files.html 

**Documentation:** https://aqs.epa.gov/aqsweb/airdata/FileFormats.html#_daily_summary_files 

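**Notes:** Uses the annual median AQI reported for each county (the `Median AQI` column of the annual county file, renamed to `AQI`). Counties that cannot be matched to a FIPS code in the crosswalk are dropped.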


In [None]:
# Load the 2019 annual AQI-by-county CSV from the raw data folder.
aqi_csv_path = f'{data_path}/raw/annual_aqi_by_county_2019.csv'
aqi_df = pd.read_csv(aqi_csv_path)

In [None]:
# Expose the file's 'Median AQI' column under the shorter label 'AQI',
# which is the name the later cells select.
aqi_df = aqi_df.rename({'Median AQI': 'AQI'}, axis='columns')

In [None]:
# Upper-case the county names; the crosswalk merge further down matches
# 'County' against the crosswalk's upper-case 'County_Name'.
county_upper = aqi_df['County'].str.upper()
aqi_df = aqi_df.assign(County=county_upper)
aqi_df.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,BALDWIN,2019,271,237,34,0,0,0,0,80,52,37,0,0,220,51,0
1,Alabama,CLAY,2019,107,97,10,0,0,0,0,67,50,30,0,0,0,107,0
2,Alabama,COLBERT,2019,263,252,11,0,0,0,0,61,47,37,0,0,228,35,0
3,Alabama,DEKALB,2019,361,324,37,0,0,0,0,90,51,39,0,0,331,30,0
4,Alabama,ELMORE,2019,228,208,20,0,0,0,0,100,50,39,0,0,228,0,0


In [None]:
# Work on a copy so the FIPS experiment does not touch aqi_df itself.
aqi_fips_df = aqi_df.copy()

# Build a 5-character FIPS string (2-digit state code + 3-digit county code),
# but only when the loaded file actually carries numeric code columns. The
# annual county file previewed earlier in this notebook shows only 'State' and
# 'County' name columns, so indexing the code columns unconditionally raises
# KeyError on a fresh Restart & Run All. In that case the copy is left without
# a FIPS column.
fips_code_columns = ['State Code', 'County Code']
if set(fips_code_columns).issubset(aqi_df.columns):
    state_part = aqi_df['State Code'].astype(int).astype(str).str.zfill(2)
    county_part = aqi_df['County Code'].astype(int).astype(str).str.zfill(3)
    aqi_fips_df['FIPS'] = state_part + county_part

In [None]:
# Show the column labels of the copied frame to check what it contains.
aqi_fips_df.columns

Index(['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC',
       'Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Sample Duration',
       'Pollutant Standard', 'Date Local', 'Units of Measure', 'Event Type',
       'Observation Count', 'Observation Percent', 'Arithmetic Mean',
       '1st Max Value', '1st Max Hour', 'AQI', 'Method Code', 'Method Name',
       'Local Site Name', 'Address', 'State Name', 'County Name', 'City Name',
       'CBSA Name', 'Date of Last Change', 'FIPS'],
      dtype='object')

In [None]:
# Load the processed state/county crosswalk that supplies the FIPS codes.
crosswalk_csv_path = f'{data_path}/processed/state_county_fips.csv'
crosswalk_df = pd.read_csv(crosswalk_csv_path)

In [None]:
# Preview the first rows of the crosswalk to see its columns and key format.
crosswalk_df.head()

Unnamed: 0.1,Unnamed: 0,NAME,state,county,County_Name,State_Name,State_Abbreviation,State_Abbreviation_County,FIPS
0,1,"Sebastian County, Arkansas",5,131,SEBASTIAN,Arkansas,AR,AR-SEBASTIAN,5131
1,2,"Sevier County, Arkansas",5,133,SEVIER,Arkansas,AR,AR-SEVIER,5133
2,3,"Sharp County, Arkansas",5,135,SHARP,Arkansas,AR,AR-SHARP,5135
3,4,"Stone County, Arkansas",5,137,STONE,Arkansas,AR,AR-STONE,5137
4,5,"Union County, Arkansas",5,139,UNION,Arkansas,AR,AR-UNION,5139


In [None]:
# Attach the crosswalk attributes (including FIPS) to each AQI row by matching
# the upper-cased county name and the state name. This is a left join, so AQI
# rows with no crosswalk match are kept with NaN in the crosswalk columns.
aqi_df = (
    aqi_df
    # This cell rebinds aqi_df to its own output. Dropping any crosswalk columns
    # left over from a previous run keeps it idempotent: without this, a re-run
    # yields suffixed duplicates (e.g. FIPS_x / FIPS_y) and the later
    # aqi_df[['FIPS', 'AQI']] selection fails. On a first run nothing is dropped.
    .drop(columns=crosswalk_df.columns, errors='ignore')
    .merge(
        crosswalk_df,
        how='left',
        left_on=['County', 'State'],
        right_on=['County_Name', 'State_Name'],
    )
)
aqi_df.sample(5)

Unnamed: 0.1,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,...,Days PM10,Unnamed: 0,NAME,state,county,County_Name,State_Name,State_Abbreviation,State_Abbreviation_County,FIPS
700,Oklahoma,OKLAHOMA,2019,365,227,137,1,0,0,0,...,1,1634.0,"Oklahoma County, Oklahoma",40.0,109.0,OKLAHOMA,Oklahoma,OK,OK-OKLAHOMA,40109.0
443,Minnesota,BECKER,2019,360,336,23,1,0,0,0,...,0,1846.0,"Becker County, Minnesota",27.0,5.0,BECKER,Minnesota,MN,MN-BECKER,27005.0
785,South Carolina,BERKELEY,2019,216,189,27,0,0,0,0,...,0,2357.0,"Berkeley County, South Carolina",45.0,15.0,BERKELEY,South Carolina,SC,SC-BERKELEY,45015.0
353,Louisiana,ASCENSION,2019,361,319,41,1,0,0,0,...,0,,,,,,,,,
75,California,MONTEREY,2019,365,330,34,1,0,0,0,...,14,37.0,"Monterey County, California",6.0,53.0,MONTEREY,California,CA,CA-MONTEREY,6053.0


In [None]:
# Keep only the join key and the metric, discarding rows whose FIPS is missing
# (counties that found no match in the crosswalk during the left merge).
# Chaining dropna on the selection returns a new frame, which avoids the
# SettingWithCopyWarning raised by calling dropna(inplace=True) on a slice.
# NOTE(review): FIPS is a float here (e.g. 40109.0) because the left merge
# introduced NaN values; confirm that downstream readers expect that format.
nonas_df = aqi_df[['FIPS', 'AQI']].dropna(subset=['FIPS'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [None]:
# Write the FIPS/AQI table to the processed data folder without the index.
aqi_output_path = f'{data_path}/processed/air_quality_index.csv'
nonas_df.to_csv(aqi_output_path, index=False)