## Initialize AQS Dataset

In [None]:
pip install uszipcode

In [1]:
#import libraries
import json
import time
from urllib.parse import urlencode

import pandas as pd
import requests

Data from the Air Quality System (AQS) API is available for anyone to use once you sign up. The documentation linked below provides tables of variable descriptions, the variable values needed for service requests, and the required variables for specific requests.

https://aqs.epa.gov/aqsweb/documents/data_api.html

The API has the following limits imposed on request size:
- Length of time: end date must be in the same year as begin date (except monitors)
- Number of params: 5 max
- Limit the queries to 1mil rows of data
- Limit the frequency of queries; they request a 5 second pause between requests and no more than 10 requests per minute.

Obtain credentials from: https://aqs.epa.gov/aqsweb/documents/data_api.html#signup

In [2]:
#setting up credentials
#replace the placeholders with the email and key issued at sign-up;
#avoid committing real values to version control
email = 'email'
key = 'key'
#urlencode percent-escapes reserved characters (e.g. '+' or '@' in an email)
#so the values survive being embedded in the request query string
creds = urlencode({'email': email, 'key': key})

To obtain daily data summaries for California (CA), we need:
- email
- key
- base url (api endpoint)
- param
- bdate
- edate
- state

In [3]:
#function sets up url and specific parameters needed to call api in year given

def api_url(year):
    """Build the AQS dailyData/byState request URL for one calendar year.

    Parameters
    ----------
    year : str or int
        Four-digit year, e.g. '2015' or 2015.

    Returns
    -------
    str
        URL covering January 1 through December 31 of ``year`` for
        param 88101 and state 06, with the module-level ``creds`` query
        string (email and key) included.
    """
    #accept ints as well as strings; plain concatenation below needs a str
    year = str(year)

    #dates are sent in yyyymmdd format
    bdate = year + '0101'
    edate = year + '1231'
    date_range = 'bdate={}&edate={}'.format(bdate, edate)

    #PM2.5 is fine inhalable particles, with diameters that are generally 2.5 micrometers and smaller
    param = 'param=88101'

    #california code: 06
    state = 'state=06'

    base_url = 'https://aqs.epa.gov/data/api/dailyData/byState'

    #generates api request url
    url = '{}?{}&{}&{}&{}'.format(base_url, creds, param, date_range, state)

    return url

In [4]:
#sends the api request for one year and returns the HTTP response
def request_aqs(year, timeout=None):
    """Request one year of data from the AQS API.

    Parameters
    ----------
    year : str
        Year passed through to ``api_url`` to build the request URL.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. The default ``None`` waits
        indefinitely, which is what the call did before this parameter existed.

    Returns
    -------
    requests.Response
        The response object; call ``.json()`` on it to get the payload.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    """
    #api_url builds the full request url for the given year
    response = requests.get(api_url(year), timeout=timeout)
    #fail loudly on an error status instead of handing an error payload to the caller
    response.raise_for_status()
    return response

In [5]:
#years to download, as strings: 2011 through 2021 inclusive
years = [str(y) for y in range(2011, 2022)]

#collects one dataframe per downloaded year
raw_frames = []

#one request per year, since begin and end date must fall in the same year
for year in years:
    #response object for this year's request
    response = request_aqs(year)

    #decoded json payload
    raw_data = response.json()

    #the records live under the 'Data' key; turn them into a dataframe
    df = pd.DataFrame(raw_data['Data'])
    raw_frames.append(df)

    #pause between requests to respect the AQS API frequency limits
    time.sleep(8)

print('Success!')

Success!


In [6]:
#create dataframe by concatenating all items in raw_frames list;
#ignore_index=True gives the result a fresh 0..n-1 index instead of
#repeating each yearly frame's own row labels
raw_df = pd.concat(raw_frames, ignore_index=True)

In [7]:
#renumber the rows from 0 and discard the previous index labels
raw_df = raw_df.reset_index(drop=True)

In [8]:
raw_df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
1,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
2,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
3,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
4,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590497,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-06-05
1590498,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-06-05
1590499,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-06-05
1590500,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-05-23


In [9]:
#write the raw combined frame to disk as json
raw_df.to_json(path_or_buf='aqs_raw.json', date_format='iso')

# Cleaning the data #

In [10]:
#work on a copy so the cleaning steps do not modify raw_df
aqs_df = raw_df.copy()

In [11]:
aqs_df.head()

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
1,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
2,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
3,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
4,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29


In [12]:
#NOTE(review): same statement as the earlier reset_index cell; running it
#again leaves the contents of raw_df unchanged
raw_df = raw_df.reset_index(drop=True)

In [13]:
raw_df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
1,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
2,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
3,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
4,06,079,2007,88101,1,35.046730,-120.587770,NAD83,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590497,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-06-05
1590498,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-06-05
1590499,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-06-05
1590500,06,019,0500,88101,1,36.985119,-119.658339,WGS84,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Table Mountain Air Monitoring Site,Milerton Road and Winchell Cove Road,California,Fresno,Not in a city,23420,"Fresno, CA",2021-05-23


In [14]:
#save to json file; this rewrites the file produced by the earlier save cell,
#so use the same date_format='iso' setting to keep the two writes consistent
raw_df.to_json('aqs_raw.json', date_format='iso')

# Cleaning the data #

In [15]:
#NOTE(review): repeats the earlier copy cell; rebinding aqs_df here starts
#the cleaning again from raw_df
aqs_df = raw_df.copy()

In [16]:
aqs_df.head()

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
1,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
2,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
3,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
4,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29


In [17]:
aqs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1590502 entries, 0 to 1590501
Data columns (total 31 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   state_code           1590502 non-null  object 
 1   county_code          1590502 non-null  object 
 2   site_number          1590502 non-null  object 
 3   parameter_code       1590502 non-null  object 
 4   poc                  1590502 non-null  int64  
 5   latitude             1590502 non-null  float64
 6   longitude            1590502 non-null  float64
 7   datum                1590502 non-null  object 
 8   parameter            1590502 non-null  object 
 9   sample_duration      1590502 non-null  object 
 10  pollutant_standard   1331133 non-null  object 
 11  date_local           1590502 non-null  object 
 12  units_of_measure     1590502 non-null  object 
 13  event_type           1590502 non-null  object 
 14  observation_count    1590502 non-null  int64  
 15

In [18]:
#columns retained in the cleaned dataset
columns_keep = [
    'date_local',
    'county',
    'city',
    'latitude',
    'longitude',
    'aqi',
    'units_of_measure',
    'first_max_value',
    'first_max_hour',
]

In [19]:
#drop rows with any missing value, then exact duplicate rows, then renumber the index from 0
aqs_df = (
    aqs_df
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [20]:
#pd.options.display.max_columns = 35
aqs_df.head()

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
1,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
2,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
3,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29
4,6,79,2007,88101,1,35.04673,-120.58777,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Arroyo Grande CDF,"2391 Willow Road, Arroyo Grande, California",California,San Luis Obispo,Arroyo Grande,42020,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",2014-06-29


In [21]:
#convert the date_local column to datetime values with pd.to_datetime
aqs_df['date_local'] = pd.to_datetime(aqs_df['date_local'])

In [22]:
#create new dataframe with only the columns listed in columns_keep
aqs_filter_df = aqs_df[columns_keep]

In [23]:
aqs_filter_df

Unnamed: 0,date_local,county,city,latitude,longitude,aqi,units_of_measure,first_max_value,first_max_hour
0,2011-12-31,San Luis Obispo,Arroyo Grande,35.046730,-120.587770,55.0,Micrograms/cubic meter (LC),13.8,0
1,2011-12-31,San Luis Obispo,Arroyo Grande,35.046730,-120.587770,55.0,Micrograms/cubic meter (LC),13.8,0
2,2011-12-31,San Luis Obispo,Arroyo Grande,35.046730,-120.587770,55.0,Micrograms/cubic meter (LC),13.8,0
3,2011-12-31,San Luis Obispo,Arroyo Grande,35.046730,-120.587770,55.0,Micrograms/cubic meter (LC),13.8,0
4,2011-12-30,San Luis Obispo,Arroyo Grande,35.046730,-120.587770,48.0,Micrograms/cubic meter (LC),11.5,0
...,...,...,...,...,...,...,...,...,...
1272933,2021-01-02,Fresno,Not in a city,36.985119,-119.658339,23.0,Micrograms/cubic meter (LC),5.6,0
1272934,2021-01-01,Fresno,Not in a city,36.985119,-119.658339,55.0,Micrograms/cubic meter (LC),13.8,0
1272935,2021-01-01,Fresno,Not in a city,36.985119,-119.658339,55.0,Micrograms/cubic meter (LC),13.8,0
1272936,2021-01-01,Fresno,Not in a city,36.985119,-119.658339,55.0,Micrograms/cubic meter (LC),13.8,0


In [24]:
#save the filtered dataframe to json; date_format='iso' writes datetimes as ISO 8601 strings
aqs_filter_df.to_json('aqs_PM2.5_data_clean.json', date_format='iso')

In [25]:
len(aqs_filter_df)

1272938