## Initialize AQS Dataset

In [None]:
pip install uszipcode

In [1]:
#import libraries
import requests
import json
import time
import pandas as pd
from uszipcode import SearchEngine

Data from the Air Quality System (AQS) API is avaliavble for anyone to use as long as you sign up. Tables of variable description, variable values needed for service requests and required variables for specific requests.

https://aqs.epa.gov/aqsweb/documents/data_api.html

The API has the following limits imposed on request size:
- Length of time: end date must be in the same year as begin date (except monitors)
- Number of params: 5 max
- Limit the queries to 1mil rows of data
- Limit the frequency of queries; they request a 5 second pause between requests and to not make 10 requests per minute.

In [2]:
#setting up credentials
email = 'jacyyang04@gmail.com'
key = 'ochreosprey79'
creds = 'email={}&key={}'.format(email, key)

Obtaining daily data summaries from CA, we need:
- email
- key
- base url (api endpoint)
- param
- bdate
- edate
- state

In [3]:
#function sets up url and specific parameters needed to call api in year given

def api_url(year):
    #set up mmddyyyy format
    bdate = year + '0101'
    edate = year + '1231'
    date_range = 'bdate={}&edate={}'.format(bdate, edate)
    
    #PM2.5 is fine inhalable particles, with diameters that are generally 2.5 micrometers and smaller
    param = 'param=88101'
    
    #california code: 06
    state = 'state=06'
    
    base_url = 'https://aqs.epa.gov/data/api/dailyData/byState'
    
    #generates api request url
    url = '{}?{}&{}&{}&{}'.format(base_url, creds, param, date_range, state)
    
    return url

In [4]:
#calls for api request and returns api in json
def request_aqs(year):
    #calls set_url function which returns api url
    response = requests.get(api_url(year))
    return response

In [6]:
#set required years
years = ['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']

#will hold list of df
raw_frames = []

#calls api request functions for each year
for year in years:
    #calls request_aqs function and returns json
    response = request_aqs(year)
    
    raw_data = response.json()
    
    #grab values with Data key and turns dictionary into dataframe
    df = pd.DataFrame(raw_data['Data'])
    
    #add df to raw_frames list
    raw_frames.append(df)
    #AQS API size restrictions
    time.sleep(8)
    
print('Success!')

Success!


In [7]:
#create dataframe by concating all items in raw_frames list
raw_df = pd.concat(raw_frames)

In [8]:
aqs_df = raw_df.copy()

In [9]:
aqs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1349571 entries, 0 to 21195
Data columns (total 31 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   state_code           1349571 non-null  object 
 1   county_code          1349571 non-null  object 
 2   site_number          1349571 non-null  object 
 3   parameter_code       1349571 non-null  object 
 4   poc                  1349571 non-null  int64  
 5   latitude             1349571 non-null  float64
 6   longitude            1349571 non-null  float64
 7   datum                1349571 non-null  object 
 8   parameter            1349571 non-null  object 
 9   sample_duration      1349571 non-null  object 
 10  pollutant_standard   1125128 non-null  object 
 11  date_local           1349571 non-null  object 
 12  units_of_measure     1349571 non-null  object 
 13  event_type           1349571 non-null  object 
 14  observation_count    1349571 non-null  int64  
 15  

In [10]:
#reset index
aqs_df = aqs_df.reset_index()

In [11]:
columns_desired = ['date_local',
                   'county',
                   'city',
                   'latitude',
                   'longitude',
                   'aqi',
                   'units_of_measure',
                   'first_max_value',
                   'first_max_hour',
]

In [12]:
pd.options.display.max_columns = 35
aqs_df.head()

Unnamed: 0,index,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,pollutant_standard,date_local,units_of_measure,event_type,observation_count,observation_percent,validity_indicator,arithmetic_mean,first_max_value,first_max_hour,aqi,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,0,6,23,1005,88101,1,40.71528,-124.20139,WGS84,PM2.5 - Local Conditions,1 HOUR,,2013-09-30,Micrograms/cubic meter (LC),,24,100.0,Y,10.083333,24.1,18,,195,GRIMM EDM Model 180 with naphion dryer - Laser...,Humboldt Hill,170 meters SE of Donna Dr. & Humboldt Hill Rd....,California,Humboldt,Not in a city,21700,"Eureka-Arcata-Fortuna, CA",2021-03-12
1,1,6,23,1005,88101,1,40.71528,-124.20139,WGS84,PM2.5 - Local Conditions,1 HOUR,,2013-09-30,Micrograms/cubic meter (LC),Included,24,100.0,Y,10.083333,24.1,18,,195,GRIMM EDM Model 180 with naphion dryer - Laser...,Humboldt Hill,170 meters SE of Donna Dr. & Humboldt Hill Rd....,California,Humboldt,Not in a city,21700,"Eureka-Arcata-Fortuna, CA",2021-03-12
2,2,6,23,1005,88101,1,40.71528,-124.20139,WGS84,PM2.5 - Local Conditions,1 HOUR,,2013-09-29,Micrograms/cubic meter (LC),,24,100.0,Y,1.766667,3.5,11,,195,GRIMM EDM Model 180 with naphion dryer - Laser...,Humboldt Hill,170 meters SE of Donna Dr. & Humboldt Hill Rd....,California,Humboldt,Not in a city,21700,"Eureka-Arcata-Fortuna, CA",2021-03-12
3,3,6,23,1005,88101,1,40.71528,-124.20139,WGS84,PM2.5 - Local Conditions,1 HOUR,,2013-09-29,Micrograms/cubic meter (LC),Included,24,100.0,Y,1.766667,3.5,11,,195,GRIMM EDM Model 180 with naphion dryer - Laser...,Humboldt Hill,170 meters SE of Donna Dr. & Humboldt Hill Rd....,California,Humboldt,Not in a city,21700,"Eureka-Arcata-Fortuna, CA",2021-03-12
4,4,6,23,1005,88101,1,40.71528,-124.20139,WGS84,PM2.5 - Local Conditions,1 HOUR,,2013-09-28,Micrograms/cubic meter (LC),,24,100.0,Y,1.5125,4.5,14,,195,GRIMM EDM Model 180 with naphion dryer - Laser...,Humboldt Hill,170 meters SE of Donna Dr. & Humboldt Hill Rd....,California,Humboldt,Not in a city,21700,"Eureka-Arcata-Fortuna, CA",2021-03-12


In [13]:
#dropduplicates
aqs_df = aqs_df.drop_duplicates()

In [16]:
len(aqs_df)

1349571

In [14]:
#change date_local values to datetime object
aqs_df['date_local'] = pd.to_datetime(aqs_df['date_local'])

In [15]:
#create new dataframe wth desired columns
aqs_filter_df = aqs_df[columns_desired]

In [17]:
len(aqs_filter_df)

1349571

In [18]:
aqs_filter_df

Unnamed: 0,date_local,county,city,latitude,longitude,aqi,units_of_measure,first_max_value,first_max_hour
0,2013-09-30,Humboldt,Not in a city,40.71528,-124.20139,,Micrograms/cubic meter (LC),24.1,18
1,2013-09-30,Humboldt,Not in a city,40.71528,-124.20139,,Micrograms/cubic meter (LC),24.1,18
2,2013-09-29,Humboldt,Not in a city,40.71528,-124.20139,,Micrograms/cubic meter (LC),3.5,11
3,2013-09-29,Humboldt,Not in a city,40.71528,-124.20139,,Micrograms/cubic meter (LC),3.5,11
4,2013-09-28,Humboldt,Not in a city,40.71528,-124.20139,,Micrograms/cubic meter (LC),4.5,14
...,...,...,...,...,...,...,...,...,...
1349566,2021-01-02,San Luis Obispo,Arroyo Grande,35.04673,-120.58777,39.0,Micrograms/cubic meter (LC),9.4,0
1349567,2021-01-01,San Luis Obispo,Arroyo Grande,35.04673,-120.58777,40.0,Micrograms/cubic meter (LC),9.6,0
1349568,2021-01-01,San Luis Obispo,Arroyo Grande,35.04673,-120.58777,40.0,Micrograms/cubic meter (LC),9.6,0
1349569,2021-01-01,San Luis Obispo,Arroyo Grande,35.04673,-120.58777,40.0,Micrograms/cubic meter (LC),9.6,0


In [19]:
#save into json
aqs_filter_df.to_json('aqs_PM2.5_data_clean.json')

In [20]:
#save into csv
aqs_filter_df.to_csv('aqs_PM2.5_data_clean.csv')

## Obtaining Zipcodes ##

**If we want zipcodes..**

In [None]:
#set library to access the bigger zipcode database
search = SearchEngine(simple_zipcode=False)

In [None]:
#attempt 1 in search for zipcode
attempt_1 = search.by

from uszipcode import Zipcode

# Search zipcode within 30 miles, ordered from closest to farthest
result = search.by_coordinates(39.122229, -77.133578, radius=30, returns=5)