#### Import Dependencies & needed API Keys

In [152]:
import pandas as pd
import requests
import json
from datetime import datetime, timedelta
from time import sleep, time

from config import airNowApiKeyShane, airNowApiKeyAudelia, airNowApiKeyCenez, airNowApiKeyJoseph, airNowApiKeyJoey

from largest_combined_statistical_areas_v4_t25 import largest_combined_statistical_areas

# from daylight_savings_time_dates_v1 import daylight_savings_time_dates
from daylight_savings_time_dates_v1_2015_to_2020 import daylight_savings_time_dates


#### Set output data file filepath variable

In [161]:
# Relative directory prefix (trailing slash included) that exported files are written under;
# it is concatenated directly with the file name, so the trailing "/" is required.
output_data_filepath = "output_data/"

#### Set up counters, an empty list to hold data, and the API key variables to be used

In [3]:
# Request bookkeeping:
#   counterAttempts       - total number of API requests attempted
#   counterAttemptsPerAPI - requests attempted with the key currently in use
#   counterAPI            - 1-based number of the key currently in use
counterAttempts, counterAttemptsPerAPI, counterAPI = 0, 0, 1

# Accumulates every observation record returned by the API
apiData = list()

# Numbered aliases for the five imported API keys, so a key can be selected by number
(
    airNowApiKey1,
    airNowApiKey2,
    airNowApiKey3,
    airNowApiKey4,
    airNowApiKey5,
) = (
    airNowApiKeyShane,
    airNowApiKeyAudelia,
    airNowApiKeyCenez,
    airNowApiKeyJoseph,
    airNowApiKeyJoey,
)

#### Create a list of date ranges to iterate through (due to API limitations, list for each month: [monthStartDate, monthEndDate] -or- quarter [quarterStartDate, quarterEndDate]; January 2015 to April 2020)

In [4]:
# # Set Date Range to use for API
# # startDate = "2015-01-01"
# startDate = "2019-12-01" # Used for testing
# # endDate = "2020-04-30"
# endDate = "2019-12-31" # Used for testing

# # Create empty list to start
# datesToUseList = []
# monthStartDateList = pd.date_range(startDate,endDate, freq = '1M') - pd.offsets.MonthBegin(1)
# monthEndDateList = pd.date_range(startDate,endDate, freq = '1M')

# for date in monthEndDateList:
#     datesToUseList.append([(date - pd.offsets.MonthBegin(1)).strftime("%Y-%m-%d"), date.strftime("%Y-%m-%d")])

# Build one [quarterStartDate, quarterEndDate] pair per calendar quarter,
# from 2015 Q1 through 2020 Q2 (22 quarters in total)
datesToUseList = [
    [f"{year}-{quarterStart}", f"{year}-{quarterEnd}"]
    for year in range(2015, 2021)
    for quarterStart, quarterEnd in (
        ("01-01", "03-31"),
        ("04-01", "06-30"),
        ("07-01", "09-30"),
        ("10-01", "12-31"),
    )
][:22]  # six full years would give 24 quarters; stop after 2020 Q2


# datesToUseList
len(datesToUseList)

22

#### Create a list of dates that fall within the Daylight Saving Time period, used to determine which UTC offset to apply for each city (January 2015 to December 2020)

In [5]:
# Flatten every DST period into one list of "YYYY-MM-DD" strings:
# each entry of daylight_savings_time_dates supplies a start and end date,
# and every calendar day from start to end (inclusive) is added.
dstDateList = [
    dstDay
    for dstPeriod in daylight_savings_time_dates
    for dstDay in pd.date_range(
        dstPeriod["dates"]["start"],
        dstPeriod["dates"]["end"],
    ).strftime("%Y-%m-%d").tolist()
]

# dstDateList
len(dstDateList)

1434

#### Set up the API call function, with the URL, API key, and variables to pass through in the requests.get() API call

In [129]:
def apiAirNowObsByMonitoringSite(csaRank, csaName, csaPrimaryCity, csaPrimaryCityState, csaPopulation2018Estimate, csaPopulation2010Census, csaPrimaryCityLat, csaPrimaryCityLong, csaPrimaryCityZip, csaTimeZone, csaStandardTimeUtcOffset, csaDaylightSavingsTimeUtcOffset, csaMonitoringStationLat, csaMonitoringStationLong, csaSearchRadius, csaBboxVar, obsStartDate, obsStartHour, obsEndDate, ObsEndHour, apiKey, apiKeyUsed, counterAttempts, counterAttemptsPerAPI, counterAPI):
    """Request AirNow observations for one CSA / date range and append them to ``apiData``.

    A bounding box extending ``csaBboxVar`` degrees in every direction from the
    monitoring station is queried for the "ozone,pm25,pm10" parameters. Each
    observation in the response is tagged with the CSA values passed in and
    appended to the module-level ``apiData`` list. Progress and failures are
    printed; nothing is returned, and a failed request is reported rather than
    raised so that a calling batch loop can carry on with the next request.

    Parameters:
        csaRank, csaName, csaPrimaryCity, csaPrimaryCityState,
        csaPopulation2018Estimate, csaPopulation2010Census, csaPrimaryCityLat,
        csaPrimaryCityLong, csaPrimaryCityZip, csaTimeZone,
        csaStandardTimeUtcOffset, csaDaylightSavingsTimeUtcOffset,
        csaSearchRadius: CSA values copied unchanged onto every observation
            under a key of the same name.
        csaMonitoringStationLat, csaMonitoringStationLong: centre of the
            bounding box (also copied onto every observation).
        csaBboxVar: half-width of the bounding box, in degrees of
            latitude/longitude (also copied onto every observation).
        obsStartDate, obsStartHour: start of the requested period; sent as
            "<obsStartDate>T<obsStartHour>".
        obsEndDate, ObsEndHour: end of the requested period; sent as
            "<obsEndDate>T<ObsEndHour>".
        apiKey: AirNow API key sent with the request.
        apiKeyUsed: label for the key; only printed.
        counterAttempts, counterAttemptsPerAPI, counterAPI: progress counters;
            only printed.
    """
    # Set variables to pass through to API parameters
    particulates = "ozone,pm25,pm10"
    longMin = round(csaMonitoringStationLong - csaBboxVar, 3)
    longMax = round(csaMonitoringStationLong + csaBboxVar, 3)
    latMin = round(csaMonitoringStationLat - csaBboxVar, 3)
    latMax = round(csaMonitoringStationLat + csaBboxVar, 3)

    # Set API Parameters to be passed through to API requests.get
    params = {
        "baseURL": "http://www.airnowapi.org/aq/data/",
        "obsStartDate": f"{obsStartDate}",
        "obsStartHour": f"{obsStartHour}",
        "obsEndDate": f"{obsEndDate}",
        # Use the function's own argument (the parameter is spelled ObsEndHour)
        # instead of silently picking up a global named obsEndHour
        "obsEndHour": f"{ObsEndHour}",
        "particulates": f"{particulates}",
        "bbox": f"{longMin},{latMin},{longMax},{latMax}",
        "dataType": "A",  # A = AQI (C = Concentrations; B = AQI & Concentrations)
        "format": "application/json",
        "verbose": "1",  # 1 = True; adds Site Name, Agency Name, AQS ID, and Full AQS ID (0 = False)
        "nowCastOnly": "0",  # 0 = False; values transition to midpoint averages as data becomes available (1 = always Nowcast)
        "includeRawConcentrations": "0",  # 0 = False (1 = add a raw-concentration field to the output)
        "apiKey": apiKey,
    }

    # Build API Request URL, passing through parameters - by GEOGRAPHIC BOUNDING BOX
    requestURL = (
        params["baseURL"]
        + "?startDate=" + params["obsStartDate"] + "T" + params["obsStartHour"]
        + "&endDate=" + params["obsEndDate"] + "T" + params["obsEndHour"]
        + "&parameters=" + params["particulates"]
        + "&BBOX=" + params["bbox"]
        + "&dataType=" + params["dataType"]
        + "&format=" + params["format"]
        + "&verbose=" + params["verbose"]
        + "&nowcastonly=" + params["nowCastOnly"]
        + "&includerawconcentrations=" + params["includeRawConcentrations"]
        + "&API_KEY=" + params["apiKey"]
    )

    # Logger: Print status message
    print("--------------------------------------------------")
    print("Requesting AirNow API Data for...")
    print("     **************************************************")
    print(datetime.now().strftime('%Y-%m-%d.%H.%M.%S'))
    print(f"CSA: {csaName} | CITY: {csaPrimaryCity}, {csaPrimaryCityState} | minLatLong: [{latMin},{longMin}] | maxLatLong: [{latMax},{longMax}] | DATE RANGE: {obsStartDate} - {obsEndDate}")
    print(f"ATTEMPT:{counterAttempts} | ATTEMPT PER BATCH/API: {counterAttemptsPerAPI} | BATCH/API: {counterAPI} | API Key Used: {apiKeyUsed}")
    print("     **************************************************")

    # CSA values stamped onto every observation; the insertion order here is the
    # order in which the keys are added to each record
    csaFields = {
        "csaRank": csaRank,
        "csaName": csaName,
        "csaPrimaryCity": csaPrimaryCity,
        "csaPrimaryCityState": csaPrimaryCityState,
        "csaPopulation2018Estimate": csaPopulation2018Estimate,
        "csaPopulation2010Census": csaPopulation2010Census,
        "csaPrimaryCityLat": csaPrimaryCityLat,
        "csaPrimaryCityLong": csaPrimaryCityLong,
        "csaPrimaryCityZip": csaPrimaryCityZip,
        "csaTimeZone": csaTimeZone,
        "csaStandardTimeUtcOffset": csaStandardTimeUtcOffset,
        "csaDaylightSavingsTimeUtcOffset": csaDaylightSavingsTimeUtcOffset,
        "csaMonitoringStationLat": csaMonitoringStationLat,
        "csaMonitoringStationLong": csaMonitoringStationLong,
        "csaSearchRadius": csaSearchRadius,
        "csaBboxVar": csaBboxVar,
    }

    # Bound before the try so the error report below can always print it,
    # even when the request itself fails before any response exists
    responseCode = None

    # Error handling is deliberately broad: any failure of a single request
    # (network error, non-JSON body, unexpected payload) is reported and skipped
    try:
        # Issue the request exactly once; the same response object provides
        # both the status shown in the log and the decoded JSON payload
        responseCode = requests.get(requestURL)
        response = responseCode.json()

        # A successful call yields a list of observation dictionaries;
        # anything else is treated as a failed request
        if not isinstance(response, list):
            raise ValueError(f"Unexpected API payload: {response}")

        # Results sum and response Code:
        print("     **************************************************")
        print(f"Number of results found: {len(response)}")
        print(f"Response Code: {responseCode}")
        print("     **************************************************")

        # Tag each observation with the CSA values and collect it in apiData
        for observation in response:
            observation.update(csaFields)
            apiData.append(observation)

    except Exception as e:
        print("     **************************************************")
        print(f"Response Code: {responseCode}")
        print("     **************************************************")
        print(f"ERROR: Unable to perform AirNow API request for CSA: {csaName} | CITY: {csaPrimaryCity}, {csaPrimaryCityState} | minLatLong: [{latMin},{longMin}] | maxLatLong: [{latMax},{longMax}] | DATE RANGE: {obsStartDate} - {obsEndDate}")
        print("%s" % e)
        print("--------------------------------------------------")


#### Loop through cities and dates, while updating variables and passing them through to the apiAirNowObsByMonitoringSite Function

In [154]:
# Lookup from key number (the value of counterAPI) to (key, label printed in the log)
apiKeyChoices = {
    1: (airNowApiKey1, "airNowApiKey1"),
    2: (airNowApiKey2, "airNowApiKey2"),
    3: (airNowApiKey3, "airNowApiKey3"),
    4: (airNowApiKey4, "airNowApiKey4"),
    5: (airNowApiKey5, "airNowApiKey5"),
}

# One request per (CSA, date range) pair: every CSA is queried for every date range
for csa in largest_combined_statistical_areas:
    for date in datesToUseList:

        # Pull the nested sections of the CSA entry once, then read the values
        # that are handed to apiAirNowObsByMonitoringSite
        csaPopulation = csa["population"]
        csaCityLocation = csa["primary_city_location"]
        csaTimezoneParams = csa["timezone_params"]
        csaStation = csa["search_params"]["closest_monitoring_station"]

        csaRank = csa["csa_rank"]
        csaName = csa["csa_name"]
        csaPrimaryCity = csa["primary_city"]
        csaPrimaryCityState = csa["primary_city_state"]
        csaPopulation2018Estimate = csaPopulation["2018_estimate"]
        csaPopulation2010Census = csaPopulation["2010_census"]
        csaPrimaryCityLat = csaCityLocation["lat"]
        csaPrimaryCityLong = csaCityLocation["long"]
        csaPrimaryCityZip = csaCityLocation["zip_code"]
        csaTimeZone = csaTimezoneParams["timezone"]
        csaStandardTimeUtcOffset = csaTimezoneParams["utc_offset"]["standard_time"]
        csaDaylightSavingsTimeUtcOffset = csaTimezoneParams["utc_offset"]["daylight_savings_time"]
        csaMonitoringStationLat = csaStation["lat"]
        csaMonitoringStationLong = csaStation["long"]
        csaSearchRadius = csaStation["search_radius"]
        csaBboxVar = csaStation["bbox_latlong_var"]

        # Request whole days: hour 00 of the first date through hour 23 of the last
        obsStartDate, obsEndDate = date[0], date[1]
        obsStartHour, obsEndHour = "00", "23"

        # Count this attempt
        counterAttempts += 1

        # Rotate keys after 250 attempts with the current one
        if counterAttemptsPerAPI < 250:
            # Current key still has attempts left
            counterAttemptsPerAPI += 1
        elif counterAPI >= 5:
            # All five keys have been used up: start over with the first key and pause.
            # NOTE(review): the original comment described a one-hour (3600 second)
            # pause, but the code sleeps for 3 seconds - confirm which is intended.
            counterAttemptsPerAPI = 1
            counterAPI = 1
            print("*****  250 per API for all APIs avialable reached | RESET counterAttemptsPerAPI, counterAPI, apiKey & SLEEP  *****")
            sleep(3)
        else:
            # Move on to the next key
            counterAPI += 1
            counterAttemptsPerAPI = 1
            print("*****  250 per current API reached | RESET counterAttemptsPerAPI; UPDATE counterAPI & apiKey  *****")

        # Pick the key (and its log label) that matches counterAPI
        if counterAPI in apiKeyChoices:
            apiKey, apiKeyUsed = apiKeyChoices[counterAPI]

        # Call apiAirNowObsByMonitoringSite with the values prepared above (positional order matters)
        apiAirNowObsByMonitoringSite(
            csaRank,
            csaName,
            csaPrimaryCity,
            csaPrimaryCityState,
            csaPopulation2018Estimate,
            csaPopulation2010Census,
            csaPrimaryCityLat,
            csaPrimaryCityLong,
            csaPrimaryCityZip,
            csaTimeZone,
            csaStandardTimeUtcOffset,
            csaDaylightSavingsTimeUtcOffset,
            csaMonitoringStationLat,
            csaMonitoringStationLong,
            csaSearchRadius,
            csaBboxVar,
            obsStartDate,
            obsStartHour,
            obsEndDate,
            obsEndHour,
            apiKey,
            apiKeyUsed,
            counterAttempts,
            counterAttemptsPerAPI,
            counterAPI,
        )

--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.29.50
CSA: Washington-Baltimore-Arlington, DC-MD-VA-WV-PA Combined Statistical Area | CITY: Washington, DC | minLatLong: [38.869,-77.063] | maxLatLong: [38.969,-76.963] | DATE RANGE: 2015-01-01 - 2015-03-31
ATTEMPT:551 | ATTEMPT PER BATCH/API: 51 | BATCH/API: 3 | API Key Used: airNowApiKey3
     **************************************************
     **************************************************
Number of results found: 2142
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.30.00
CSA: Washington-Baltimore-Arlington, DC-MD-VA-WV-PA Combined Statistical Area | CITY: Washington, DC | minLatLong: [38.869,-77.063] | maxLatLong: [38.969,-76.963]

     **************************************************
Number of results found: 5488
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.33.37
CSA: Washington-Baltimore-Arlington, DC-MD-VA-WV-PA Combined Statistical Area | CITY: Washington, DC | minLatLong: [38.869,-77.063] | maxLatLong: [38.969,-76.963] | DATE RANGE: 2018-04-01 - 2018-06-30
ATTEMPT:564 | ATTEMPT PER BATCH/API: 64 | BATCH/API: 3 | API Key Used: airNowApiKey3
     **************************************************
     **************************************************
Number of results found: 5749
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-

     **************************************************
Number of results found: 13210
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.40.58
CSA: Phoenix-Mesa, AZ Combined Statistical Area | CITY: Phoenix, AZ | minLatLong: [33.493,-112.121] | maxLatLong: [33.593,-112.021] | DATE RANGE: 2016-01-01 - 2016-03-31
ATTEMPT:577 | ATTEMPT PER BATCH/API: 77 | BATCH/API: 3 | API Key Used: airNowApiKey3
     **************************************************
     **************************************************
Number of results found: 12556
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.41.49
CSA: Phoenix-Mesa

     **************************************************
Number of results found: 12874
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.51.41
CSA: Phoenix-Mesa, AZ Combined Statistical Area | CITY: Phoenix, AZ | minLatLong: [33.493,-112.121] | maxLatLong: [33.593,-112.021] | DATE RANGE: 2019-04-01 - 2019-06-30
ATTEMPT:590 | ATTEMPT PER BATCH/API: 90 | BATCH/API: 3 | API Key Used: airNowApiKey3
     **************************************************
     **************************************************
Number of results found: 13068
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.52.30
CSA: Phoenix-Mesa

     **************************************************
Number of results found: 7167
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.59.04
CSA: Sacramento-Roseville, CA Combined Statistical Area | CITY: Sacramento, CA | minLatLong: [38.517,-121.517] | maxLatLong: [38.617,-121.417] | DATE RANGE: 2017-01-01 - 2017-03-31
ATTEMPT:603 | ATTEMPT PER BATCH/API: 103 | BATCH/API: 3 | API Key Used: airNowApiKey3
     **************************************************
     **************************************************
Number of results found: 6331
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.22.59.30
CSA: Sa

     **************************************************
Number of results found: 9746
Response Code: <Response [200]>
     **************************************************
--------------------------------------------------
Requesting AirNow API Data for...
     **************************************************
2020-04-16.23.06.19
CSA: Sacramento-Roseville, CA Combined Statistical Area | CITY: Sacramento, CA | minLatLong: [38.517,-121.517] | maxLatLong: [38.617,-121.417] | DATE RANGE: 2020-04-01 - 2020-06-30
ATTEMPT:616 | ATTEMPT PER BATCH/API: 116 | BATCH/API: 3 | API Key Used: airNowApiKey3
     **************************************************
     **************************************************
Number of results found: 2308
Response Code: <Response [200]>
     **************************************************


#### Check the number of records in the apiData list

In [155]:
# Number of observation records collected in apiData
# (uncomment the next line to display the records themselves)
# apiData
len(apiData)

5043803

#### Loop through results and only use the records that are for a chosen datetime (noon local time for each city), checking whether each date falls within a Standard Time (ST) or Daylight Saving Time (DST) period and then using the correct UTC offset

In [177]:
# Set an empty list to hold the records that match the desired local time
selectedDateTimeApiData = []

# Local wall-clock time (per city) of the single daily observation to keep
desiredLocalTime = "12:00"

# Layout of the record's "UTC" field and of the local timestamp written back.
# A plain ":" is used between hour and minute: "%:" is not a valid strftime
# directive and its output differs between platforms.
dateTimeFormat = "%Y-%m-%dT%H:%M"

# Set copy of the DST dates so each membership test is O(1) rather than a list scan
dstDateSet = set(dstDateList)


def _utcOffsetToTimedelta(utcOffset):
    """Convert a UTC offset string such as "-05:00" or "+05:30" to a signed timedelta.

    The hours and minutes are parsed as numbers, so offsets whose digits include
    a significant zero (for example "-10:00") keep their real value.
    """
    offsetText = utcOffset.strip()
    sign = -1 if offsetText.startswith("-") else 1
    hoursText, _, minutesText = offsetText.lstrip("+-").partition(":")
    return sign * timedelta(hours=int(hoursText), minutes=int(minutesText or 0))


# Loop through the apiData
for record in apiData:

    # Isolate just the date of the "UTC" datetime stamp string
    recordDate = pd.Timestamp(record["UTC"]).strftime("%Y-%m-%d")

    # The city's local datetime we want to keep for that date
    desiredLocalDateTime = pd.Timestamp(f"{recordDate}T{desiredLocalTime}")

    # Choose the UTC offset that applies on the record's date
    if recordDate in dstDateSet:
        timeMode = "DaylightSavings"
        utcOffset = record["csaDaylightSavingsTimeUtcOffset"]
    else:
        timeMode = "Standard"
        utcOffset = record["csaStandardTimeUtcOffset"]

    # local time = UTC + offset, so the matching UTC time = local time - offset
    desiredUTCDateTime = desiredLocalDateTime - _utcOffsetToTimedelta(utcOffset)

    # Ignore the record unless its UTC stamp is exactly the desired local time
    if record["UTC"] != desiredUTCDateTime.strftime(dateTimeFormat):
        continue

    # Annotate the record (in place) with the local observation time and keep it
    record["st_dst"] = timeMode
    record["obsDateTime"] = desiredLocalDateTime.strftime(dateTimeFormat)
    record["dateObserved"] = desiredLocalDateTime.strftime("%Y-%m-%d")
    record["timeObserved"] = desiredLocalDateTime.strftime("%H:%M")
    selectedDateTimeApiData.append(record)

# See how many records/result are included
# selectedDateTimeApiData
len(selectedDateTimeApiData)

209198

#### Create DataFrame holding the values from the selectedDateTimeApiData lists

In [179]:
# Build a DataFrame from the list of selected record dictionaries
# (each dictionary becomes one row; its keys become the columns)
selectedDateTimeApiData_df = pd.DataFrame(selectedDateTimeApiData)

# Visualize the DataFrame
selectedDateTimeApiData_df

Unnamed: 0,Latitude,Longitude,UTC,Parameter,Unit,AQI,Category,SiteName,AgencyName,FullAQSCode,...,csaStandardTimeUtcOffset,csaDaylightSavingsTimeUtcOffset,csaMonitoringStationLat,csaMonitoringStationLong,csaSearchRadius,csaBboxVar,st_dst,obsDateTime,dateObserved,timeObserved
0,40.853550,-73.966100,2015-01-01T17:00,PM2.5,UG/M3,51,2,Fort Lee Near Road,New Jersey Dept. of Environmental Protection,340030010,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
1,40.819700,-73.948100,2015-01-01T17:00,PM2.5,UG/M3,50,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
2,40.819700,-73.948100,2015-01-01T17:00,OZONE,PPB,21,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
3,40.849200,-73.931900,2015-01-01T17:00,PM2.5,UG/M3,63,2,Manhattan/IS143,New York Dept. of Environmental Conservation,360610115,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
4,40.694401,-73.928596,2015-01-01T17:00,PM2.5,UG/M3,44,1,Bklyn - PS274,New York Dept. of Environmental Conservation,360470118,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209193,38.561357,-121.508961,2020-04-16T19:00,PM2.5,UG/M3,24,1,MMCA82044,California Air Resources Board,840MMCA82044,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209194,38.558761,-121.506429,2020-04-16T19:00,PM2.5,UG/M3,32,1,MMCA81033,California Air Resources Board,MMCA81033,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209195,38.568440,-121.493110,2020-04-16T19:00,OZONE,PPB,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209196,38.568440,-121.493110,2020-04-16T19:00,PM2.5,UG/M3,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00


#### Convert & export raw DataFrame to CSV files (in the event we need to update/manipulate later from this point)

In [180]:
# Current local time, to the minute, used as a suffix in the export's file name
timestamp = f"{datetime.now():%Y%m%d_%H%M}"

# Write the raw (uncleaned) DataFrame to CSV under the output directory, without the index column
rawDataCsvPath = output_data_filepath + "AirNowApiRawData_" + timestamp + ".csv"
selectedDateTimeApiData_df.to_csv(rawDataCsvPath, encoding="utf-8", index=False)

In [181]:
# Start the cleaning steps from the selected data.
# NOTE: this is a plain assignment, so both names refer to the same DataFrame
# object (no copy is made here).
cleanedApiData_df = selectedDateTimeApiData_df

# Visualize the DataFrame
cleanedApiData_df

Unnamed: 0,Latitude,Longitude,UTC,Parameter,Unit,AQI,Category,SiteName,AgencyName,FullAQSCode,...,csaStandardTimeUtcOffset,csaDaylightSavingsTimeUtcOffset,csaMonitoringStationLat,csaMonitoringStationLong,csaSearchRadius,csaBboxVar,st_dst,obsDateTime,dateObserved,timeObserved
0,40.853550,-73.966100,2015-01-01T17:00,PM2.5,UG/M3,51,2,Fort Lee Near Road,New Jersey Dept. of Environmental Protection,340030010,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
1,40.819700,-73.948100,2015-01-01T17:00,PM2.5,UG/M3,50,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
2,40.819700,-73.948100,2015-01-01T17:00,OZONE,PPB,21,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
3,40.849200,-73.931900,2015-01-01T17:00,PM2.5,UG/M3,63,2,Manhattan/IS143,New York Dept. of Environmental Conservation,360610115,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
4,40.694401,-73.928596,2015-01-01T17:00,PM2.5,UG/M3,44,1,Bklyn - PS274,New York Dept. of Environmental Conservation,360470118,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209193,38.561357,-121.508961,2020-04-16T19:00,PM2.5,UG/M3,24,1,MMCA82044,California Air Resources Board,840MMCA82044,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209194,38.558761,-121.506429,2020-04-16T19:00,PM2.5,UG/M3,32,1,MMCA81033,California Air Resources Board,MMCA81033,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209195,38.568440,-121.493110,2020-04-16T19:00,OZONE,PPB,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209196,38.568440,-121.493110,2020-04-16T19:00,PM2.5,UG/M3,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00


#### Drop reporting results that have Category of 0 (AQI = -999; null/bad values)

In [182]:
# Keep only the rows whose Category is not 0
# (per the section heading, Category 0 marks null/bad values reported with AQI = -999)
hasValidCategory = cleanedApiData_df["Category"] != 0
cleanedApiData_df = cleanedApiData_df.loc[hasValidCategory]

# Visualize the DataFrame
cleanedApiData_df

Unnamed: 0,Latitude,Longitude,UTC,Parameter,Unit,AQI,Category,SiteName,AgencyName,FullAQSCode,...,csaStandardTimeUtcOffset,csaDaylightSavingsTimeUtcOffset,csaMonitoringStationLat,csaMonitoringStationLong,csaSearchRadius,csaBboxVar,st_dst,obsDateTime,dateObserved,timeObserved
0,40.853550,-73.966100,2015-01-01T17:00,PM2.5,UG/M3,51,2,Fort Lee Near Road,New Jersey Dept. of Environmental Protection,340030010,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
1,40.819700,-73.948100,2015-01-01T17:00,PM2.5,UG/M3,50,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
2,40.819700,-73.948100,2015-01-01T17:00,OZONE,PPB,21,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
3,40.849200,-73.931900,2015-01-01T17:00,PM2.5,UG/M3,63,2,Manhattan/IS143,New York Dept. of Environmental Conservation,360610115,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
4,40.694401,-73.928596,2015-01-01T17:00,PM2.5,UG/M3,44,1,Bklyn - PS274,New York Dept. of Environmental Conservation,360470118,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209193,38.561357,-121.508961,2020-04-16T19:00,PM2.5,UG/M3,24,1,MMCA82044,California Air Resources Board,840MMCA82044,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209194,38.558761,-121.506429,2020-04-16T19:00,PM2.5,UG/M3,32,1,MMCA81033,California Air Resources Board,MMCA81033,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209195,38.568440,-121.493110,2020-04-16T19:00,OZONE,PPB,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209196,38.568440,-121.493110,2020-04-16T19:00,PM2.5,UG/M3,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00


In [183]:
# See what Columns are included and how they are named
# (shown as a plain Python list so every column name is visible in full)
list(cleanedApiData_df.columns)

['Latitude',
 'Longitude',
 'UTC',
 'Parameter',
 'Unit',
 'AQI',
 'Category',
 'SiteName',
 'AgencyName',
 'FullAQSCode',
 'IntlAQSCode',
 'csaRank',
 'csaName',
 'csaPrimaryCity',
 'csaPrimaryCityState',
 'csaPopulation2018Estimate',
 'csaPopulation2010Census',
 'csaPrimaryCityLat',
 'csaPrimaryCityLong',
 'csaPrimaryCityZip',
 'csaTimeZone',
 'csaStandardTimeUtcOffset',
 'csaDaylightSavingsTimeUtcOffset',
 'csaMonitoringStationLat',
 'csaMonitoringStationLong',
 'csaSearchRadius',
 'csaBboxVar',
 'st_dst',
 'obsDateTime',
 'dateObserved',
 'timeObserved']

#### Rename Columns (for relevancy downstream)

In [184]:
# Old column name -> new column name; columns not listed here keep their names
columnRenames = {
    "Latitude": "SiteLatitude",
    "Longitude": "SiteLongitude",
    "UTC": "DateTimeObservedUTC",
    "Parameter": "ParameterName",
    "Category": "AQICategoryNumber",
    "AgencyName": "SiteAgencyName",
    "FullAQSCode": "SiteAQSCode",
    "IntlAQSCode": "SiteIntlAQSCode",
    "st_dst": "TimeMode",
    "obsDateTime": "DateTimeObserved",
    "dateObserved": "DateObserved",
    "timeObserved": "TimeObserved",
}
cleanedApiData_df = cleanedApiData_df.rename(columns=columnRenames)

# Visualize the updated DataFrame
cleanedApiData_df

Unnamed: 0,SiteLatitude,SiteLongitude,DateTimeObservedUTC,ParameterName,Unit,AQI,AQICategoryNumber,SiteName,SiteAgencyName,SiteAQSCode,...,csaStandardTimeUtcOffset,csaDaylightSavingsTimeUtcOffset,csaMonitoringStationLat,csaMonitoringStationLong,csaSearchRadius,csaBboxVar,TimeMode,DateTimeObserved,DateObserved,TimeObserved
0,40.853550,-73.966100,2015-01-01T17:00,PM2.5,UG/M3,51,2,Fort Lee Near Road,New Jersey Dept. of Environmental Protection,340030010,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
1,40.819700,-73.948100,2015-01-01T17:00,PM2.5,UG/M3,50,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
2,40.819700,-73.948100,2015-01-01T17:00,OZONE,PPB,21,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
3,40.849200,-73.931900,2015-01-01T17:00,PM2.5,UG/M3,63,2,Manhattan/IS143,New York Dept. of Environmental Conservation,360610115,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
4,40.694401,-73.928596,2015-01-01T17:00,PM2.5,UG/M3,44,1,Bklyn - PS274,New York Dept. of Environmental Conservation,360470118,...,-05:00,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209193,38.561357,-121.508961,2020-04-16T19:00,PM2.5,UG/M3,24,1,MMCA82044,California Air Resources Board,840MMCA82044,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209194,38.558761,-121.506429,2020-04-16T19:00,PM2.5,UG/M3,32,1,MMCA81033,California Air Resources Board,MMCA81033,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209195,38.568440,-121.493110,2020-04-16T19:00,OZONE,PPB,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00
209196,38.568440,-121.493110,2020-04-16T19:00,PM2.5,UG/M3,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-08:00,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00


#### Add AQICategory (Good, Moderate, Unhealthy for Sensitive Groups, Unhealthy, Very Unhealthy, Hazardous), based on AQICategoryNumber

In [187]:
# Create function to determine AQICategory value based on AQICategoryNumber
def fAQICategory(row):
    """Return the AQI category label for ``row["AQICategoryNumber"]``.

    Numbers 1 through 6 map to "Good", "Moderate",
    "Unhealthy for Sensitive Groups", "Unhealthy", "Very Unhealthy" and
    "Hazardous"; any other value yields "(invalid)".
    """
    categoryNumber = row["AQICategoryNumber"]

    # Checked in ascending order with ==, so values that compare equal to the
    # number (for example 3.0) receive the same label
    labelsByNumber = (
        (1, "Good"),
        (2, "Moderate"),
        (3, "Unhealthy for Sensitive Groups"),
        (4, "Unhealthy"),
        (5, "Very Unhealthy"),
        (6, "Hazardous"),
    )
    for number, label in labelsByNumber:
        if categoryNumber == number:
            return label

    return "(invalid)"

# Create new column: AQICategory, by calling fAQICategory once per row (axis=1)
cleanedApiData_df['AQICategory'] = cleanedApiData_df.apply(fAQICategory, axis=1)

# Visualize the DataFrame (the new AQICategory column is appended as the last column)
cleanedApiData_df

Unnamed: 0,SiteLatitude,SiteLongitude,DateTimeObservedUTC,ParameterName,Unit,AQI,AQICategoryNumber,SiteName,SiteAgencyName,SiteAQSCode,...,csaDaylightSavingsTimeUtcOffset,csaMonitoringStationLat,csaMonitoringStationLong,csaSearchRadius,csaBboxVar,TimeMode,DateTimeObserved,DateObserved,TimeObserved,AQICategory
0,40.853550,-73.966100,2015-01-01T17:00,PM2.5,UG/M3,51,2,Fort Lee Near Road,New Jersey Dept. of Environmental Protection,340030010,...,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00,Moderate
1,40.819700,-73.948100,2015-01-01T17:00,PM2.5,UG/M3,50,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00,Good
2,40.819700,-73.948100,2015-01-01T17:00,OZONE,PPB,21,1,CCNY,New York Dept. of Environmental Conservation,360610135,...,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00,Good
3,40.849200,-73.931900,2015-01-01T17:00,PM2.5,UG/M3,63,2,Manhattan/IS143,New York Dept. of Environmental Conservation,360610115,...,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00,Moderate
4,40.694401,-73.928596,2015-01-01T17:00,PM2.5,UG/M3,44,1,Bklyn - PS274,New York Dept. of Environmental Conservation,360470118,...,-04:00,40.8419,-73.8359,25,0.15,Standard,2015-01-01T12:00,2015-01-01,12:00,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209193,38.561357,-121.508961,2020-04-16T19:00,PM2.5,UG/M3,24,1,MMCA82044,California Air Resources Board,840MMCA82044,...,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,Good
209194,38.558761,-121.506429,2020-04-16T19:00,PM2.5,UG/M3,32,1,MMCA81033,California Air Resources Board,MMCA81033,...,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,Good
209195,38.568440,-121.493110,2020-04-16T19:00,OZONE,PPB,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,Good
209196,38.568440,-121.493110,2020-04-16T19:00,PM2.5,UG/M3,33,1,Downtown Sacramento - T Street,California Air Resources Board,060670010,...,-07:00,38.5670,-121.4670,25,0.05,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,Good


In [188]:
# List every column name in cleanedApiData_df, in its current order
list(cleanedApiData_df.columns)

['SiteLatitude',
 'SiteLongitude',
 'DateTimeObservedUTC',
 'ParameterName',
 'Unit',
 'AQI',
 'AQICategoryNumber',
 'SiteName',
 'SiteAgencyName',
 'SiteAQSCode',
 'SiteIntlAQSCode',
 'csaRank',
 'csaName',
 'csaPrimaryCity',
 'csaPrimaryCityState',
 'csaPopulation2018Estimate',
 'csaPopulation2010Census',
 'csaPrimaryCityLat',
 'csaPrimaryCityLong',
 'csaPrimaryCityZip',
 'csaTimeZone',
 'csaStandardTimeUtcOffset',
 'csaDaylightSavingsTimeUtcOffset',
 'csaMonitoringStationLat',
 'csaMonitoringStationLong',
 'csaSearchRadius',
 'csaBboxVar',
 'TimeMode',
 'DateTimeObserved',
 'DateObserved',
 'TimeObserved',
 'AQICategory']

#### Drop unneeded columns | Reorder columns (for efficiency downstream)

In [189]:
# Keep only the columns listed here, in exactly this order; every other column is dropped
cleanedApiData_df = cleanedApiData_df[
    [
        # CSA identification and location
        "csaRank", "csaName", "csaPrimaryCity", "csaPrimaryCityState",
        "csaPrimaryCityLat", "csaPrimaryCityLong", "csaTimeZone",
        # Observation time
        "TimeMode", "DateTimeObserved", "DateObserved", "TimeObserved",
        "DateTimeObservedUTC",
        # Measurement
        "ParameterName", "Unit", "AQI", "AQICategoryNumber", "AQICategory",
        # Monitoring site
        "SiteName", "SiteAgencyName",
    ]
]

# Visualize the updated DataFrame
cleanedApiData_df

Unnamed: 0,csaRank,csaName,csaPrimaryCity,csaPrimaryCityState,csaPrimaryCityLat,csaPrimaryCityLong,csaTimeZone,TimeMode,DateTimeObserved,DateObserved,TimeObserved,DateTimeObservedUTC,ParameterName,Unit,AQI,AQICategoryNumber,AQICategory,SiteName,SiteAgencyName
0,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,12:00,2015-01-01T17:00,PM2.5,UG/M3,51,2,Moderate,Fort Lee Near Road,New Jersey Dept. of Environmental Protection
1,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,12:00,2015-01-01T17:00,PM2.5,UG/M3,50,1,Good,CCNY,New York Dept. of Environmental Conservation
2,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,12:00,2015-01-01T17:00,OZONE,PPB,21,1,Good,CCNY,New York Dept. of Environmental Conservation
3,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,12:00,2015-01-01T17:00,PM2.5,UG/M3,63,2,Moderate,Manhattan/IS143,New York Dept. of Environmental Conservation
4,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,12:00,2015-01-01T17:00,PM2.5,UG/M3,44,1,Good,Bklyn - PS274,New York Dept. of Environmental Conservation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209193,22,"Sacramento-Roseville, CA Combined Statistical ...",Sacramento,CA,38.5816,-121.4944,Pacific,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,2020-04-16T19:00,PM2.5,UG/M3,24,1,Good,MMCA82044,California Air Resources Board
209194,22,"Sacramento-Roseville, CA Combined Statistical ...",Sacramento,CA,38.5816,-121.4944,Pacific,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,2020-04-16T19:00,PM2.5,UG/M3,32,1,Good,MMCA81033,California Air Resources Board
209195,22,"Sacramento-Roseville, CA Combined Statistical ...",Sacramento,CA,38.5816,-121.4944,Pacific,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,2020-04-16T19:00,OZONE,PPB,33,1,Good,Downtown Sacramento - T Street,California Air Resources Board
209196,22,"Sacramento-Roseville, CA Combined Statistical ...",Sacramento,CA,38.5816,-121.4944,Pacific,DaylightSavings,2020-04-16T12:00,2020-04-16,12:00,2020-04-16T19:00,PM2.5,UG/M3,33,1,Good,Downtown Sacramento - T Street,California Air Resources Board


#### Export the cleaned DataFrame to a timestamped CSV file

In [190]:
# Stamp the file name with the current date and time (YYYYMMDD_HHMM)
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
cleanedCsvPath = f"{output_data_filepath}AirNowApiData_{timestamp}.csv"

# Write the cleaned DataFrame as UTF-8 CSV, leaving out the row index
cleanedApiData_df.to_csv(cleanedCsvPath, index=False, encoding="utf-8")

#### See the column names we are working with

In [191]:
# List the columns of cleanedApiData_df that are available for grouping and aggregating below
list(cleanedApiData_df.columns)

['csaRank',
 'csaName',
 'csaPrimaryCity',
 'csaPrimaryCityState',
 'csaPrimaryCityLat',
 'csaPrimaryCityLong',
 'csaTimeZone',
 'TimeMode',
 'DateTimeObserved',
 'DateObserved',
 'TimeObserved',
 'DateTimeObservedUTC',
 'ParameterName',
 'Unit',
 'AQI',
 'AQICategoryNumber',
 'AQICategory',
 'SiteName',
 'SiteAgencyName']

#### Create Grouped DataFrame (that groups all columns except values that are aggregated)

In [217]:
# Group on every descriptive column, leaving AQI, AQICategoryNumber, AQICategory,
# SiteName and SiteAgencyName as the values to aggregate
groupedApiData_df = cleanedApiData_df.groupby(
    [
        "csaRank", "csaName", "csaPrimaryCity", "csaPrimaryCityState",
        "csaPrimaryCityLat", "csaPrimaryCityLong", "csaTimeZone",
        "TimeMode", "DateTimeObserved", "DateObserved", "TimeObserved",
        "DateTimeObservedUTC", "ParameterName", "Unit",
    ]
)

# Visualize the grouping (.count() turns the GroupBy object into a viewable table)
groupedApiData_df.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,AQI,AQICategoryNumber,AQICategory,SiteName,SiteAgencyName
csaRank,csaName,csaPrimaryCity,csaPrimaryCityState,csaPrimaryCityLat,csaPrimaryCityLong,csaTimeZone,TimeMode,DateTimeObserved,DateObserved,TimeObserved,DateTimeObservedUTC,ParameterName,Unit,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,"New York-Newark, NY-NJ-CT-PA Combined Statistical Area",New York City,NY,40.7128,-74.0060,Eastern,DaylightSavings,2015-03-08T12:00,2015-03-08,12:00,2015-03-08T16:00,OZONE,PPB,4,4,4,4,4
1,"New York-Newark, NY-NJ-CT-PA Combined Statistical Area",New York City,NY,40.7128,-74.0060,Eastern,DaylightSavings,2015-03-08T12:00,2015-03-08,12:00,2015-03-08T16:00,PM10,UG/M3,1,1,1,1,1
1,"New York-Newark, NY-NJ-CT-PA Combined Statistical Area",New York City,NY,40.7128,-74.0060,Eastern,DaylightSavings,2015-03-08T12:00,2015-03-08,12:00,2015-03-08T16:00,PM2.5,UG/M3,5,5,5,5,5
1,"New York-Newark, NY-NJ-CT-PA Combined Statistical Area",New York City,NY,40.7128,-74.0060,Eastern,DaylightSavings,2015-03-09T12:00,2015-03-09,12:00,2015-03-09T16:00,OZONE,PPB,4,4,4,4,4
1,"New York-Newark, NY-NJ-CT-PA Combined Statistical Area",New York City,NY,40.7128,-74.0060,Eastern,DaylightSavings,2015-03-09T12:00,2015-03-09,12:00,2015-03-09T16:00,PM10,UG/M3,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,"San Antonio-New Braunfels-Pearsall, TX Combined Statistical Area",San Antonio,TX,29.4241,-98.4936,Central,Standard,2020-03-05T12:00,2020-03-05,12:00,2020-03-05T18:00,PM2.5,UG/M3,3,3,3,3,3
25,"San Antonio-New Braunfels-Pearsall, TX Combined Statistical Area",San Antonio,TX,29.4241,-98.4936,Central,Standard,2020-03-06T12:00,2020-03-06,12:00,2020-03-06T18:00,OZONE,PPB,1,1,1,1,1
25,"San Antonio-New Braunfels-Pearsall, TX Combined Statistical Area",San Antonio,TX,29.4241,-98.4936,Central,Standard,2020-03-06T12:00,2020-03-06,12:00,2020-03-06T18:00,PM2.5,UG/M3,4,4,4,4,4
25,"San Antonio-New Braunfels-Pearsall, TX Combined Statistical Area",San Antonio,TX,29.4241,-98.4936,Central,Standard,2020-03-07T12:00,2020-03-07,12:00,2020-03-07T18:00,OZONE,PPB,1,1,1,1,1


#### Aggregate values (aqiCount, aqiAvg, etc...)

In [218]:
# Select the two numeric columns once; every statistic below is computed per group
groupedAqi = groupedApiData_df["AQI"]
groupedAqiCategoryNumber = groupedApiData_df["AQICategoryNumber"]

# AQI readings per group: how many there are, and their mean / lowest / highest value
aqiCount = groupedAqi.count()
aqiAvg = groupedAqi.mean()
aqiMin = groupedAqi.min()
aqiMax = groupedAqi.max()

# AQI Category Number per group: mean / lowest / highest value
aqiCategoryNumberAvg = groupedAqiCategoryNumber.mean()
aqiCategoryNumberMin = groupedAqiCategoryNumber.min()
aqiCategoryNumberMax = groupedAqiCategoryNumber.max()

# Number of distinct AQI categories, sites and agencies per group
aqiCategoryCount = groupedApiData_df["AQICategory"].nunique()
sitesCount = groupedApiData_df["SiteName"].nunique()
agenciesCount = groupedApiData_df["SiteAgencyName"].nunique()


#### Create Summary DataFrame (that holds the grouped and aggregated values)

In [219]:
# One column per aggregate; the shared group keys form the index of every Series
aggregatedColumns = {
    "aqiCount": aqiCount,
    "aqiAvg": aqiAvg,
    "aqiMin": aqiMin,
    "aqiMax": aqiMax,
    "aqiCategoryNumberAvg": aqiCategoryNumberAvg,
    "aqiCategoryNumberMin": aqiCategoryNumberMin,
    "aqiCategoryNumberMax": aqiCategoryNumberMax,
    "aqiCategoryCount": aqiCategoryCount,
    "sitesCount": sitesCount,
    "agenciesCount": agenciesCount,
}

summaryApiData_df = (
    pd.DataFrame(aggregatedColumns)
    # Turn the group keys back into ordinary columns
    .reset_index()
    # Sort by CSA, Parameter and DateObserved (all ascending)
    .sort_values(["csaRank", "ParameterName", "DateObserved"], ascending=[True, True, True])
    # Renumber the rows 0..n-1 and discard the pre-sort row numbers
    .reset_index(drop=True)
)

# Visualize the DataFrame
summaryApiData_df


Unnamed: 0,csaRank,csaName,csaPrimaryCity,csaPrimaryCityState,csaPrimaryCityLat,csaPrimaryCityLong,csaTimeZone,TimeMode,DateTimeObserved,DateObserved,...,aqiCount,aqiAvg,aqiMin,aqiMax,aqiCategoryNumberAvg,aqiCategoryNumberMin,aqiCategoryNumberMax,aqiCategoryCount,sitesCount,agenciesCount
0,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,...,4,23.250000,21,26,1.0,1,1,1,4,1
1,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-02T12:00,2015-01-02,...,4,18.500000,17,20,1.0,1,1,1,4,1
2,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-03T12:00,2015-01-03,...,4,15.250000,13,20,1.0,1,1,1,4,1
3,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-04T12:00,2015-01-04,...,4,3.000000,2,5,1.0,1,1,1,4,1
4,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-05T12:00,2015-01-05,...,3,24.666667,23,26,1.0,1,1,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98839,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-12T12:00,2020-04-12,...,4,49.000000,44,55,1.5,1,2,2,4,1
98840,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-13T12:00,2020-04-13,...,4,25.750000,14,49,1.0,1,1,1,4,1
98841,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-14T12:00,2020-04-14,...,4,25.000000,20,33,1.0,1,1,1,4,1
98842,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-15T12:00,2020-04-15,...,3,28.666667,24,32,1.0,1,1,1,3,1


#### Add AQICategoryNumber & AQICategory (1 = Good, 2 = Moderate, 3 = Unhealthy for Sensitive Groups, 4 = Unhealthy, 5 = Very Unhealthy, 6 = Hazardous; based on aqiAvg)

In [220]:
# AQI category bands applied to the rounded aqiAvg, lowest band first:
# (exclusive upper bound, AQICategoryNumber, AQICategory).
# A rounded average of 301 or more is above every band and is "Hazardous" (6).
aqiAvgCategoryBands = (
    (51, 1, "Good"),
    (101, 2, "Moderate"),
    (151, 3, "Unhealthy for Sensitive Groups"),
    (201, 4, "Unhealthy"),
    (301, 5, "Very Unhealthy"),
)


# Shared helper so the number and the label can never disagree for the same row
def fClassifyAqiAvg(row):
    """Return (AQICategoryNumber, AQICategory) for the row's aqiAvg.

    aqiAvg is rounded with round(value, 0) before it is compared to the
    bands, so the band edges are whole numbers (0-50, 51-100, 101-150,
    151-200, 201-300, 301 and above).

    A negative or NaN average belongs to no band. Instead of failing with
    an UnboundLocalError it yields (0, "(invalid)"); "(invalid)" is the same
    fallback label fAQICategory uses for an unknown AQICategoryNumber.
    """
    roundedAvg = round(row["aqiAvg"], 0)

    # "not >= 0" (rather than "< 0") so that NaN is also treated as invalid
    if not roundedAvg >= 0:
        return 0, "(invalid)"

    for upperBound, categoryNumber, category in aqiAvgCategoryBands:
        if roundedAvg < upperBound:
            return categoryNumber, category

    return 6, "Hazardous"


# Create function to determine AQICategoryNumber value based on aqiAvg
def faqiAvgAQICategoryNumber(row):
    """Return the AQICategoryNumber (1-6, or 0 when aqiAvg is negative/NaN) for the row's aqiAvg."""
    return fClassifyAqiAvg(row)[0]


# Create function to determine AQICategory value based on aqiAvg
def faqiAvgAQICategory(row):
    """Return the AQICategory label ("Good" ... "Hazardous", or "(invalid)") for the row's aqiAvg."""
    return fClassifyAqiAvg(row)[1]


# The functions are defined before they are used so the cell also runs on a fresh kernel.
# Create new column: AQICategory, using above function
summaryApiData_df['AQICategory'] = summaryApiData_df.apply(faqiAvgAQICategory, axis=1)

# Create new column: AQICategoryNumber, using above function
summaryApiData_df['AQICategoryNumber'] = summaryApiData_df.apply(faqiAvgAQICategoryNumber, axis=1)

# Visualize the DataFrame
summaryApiData_df

Unnamed: 0,csaRank,csaName,csaPrimaryCity,csaPrimaryCityState,csaPrimaryCityLat,csaPrimaryCityLong,csaTimeZone,TimeMode,DateTimeObserved,DateObserved,...,aqiMin,aqiMax,aqiCategoryNumberAvg,aqiCategoryNumberMin,aqiCategoryNumberMax,aqiCategoryCount,sitesCount,agenciesCount,AQICategory,AQICategoryNumber
0,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-01T12:00,2015-01-01,...,21,26,1.0,1,1,1,4,1,Good,1
1,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-02T12:00,2015-01-02,...,17,20,1.0,1,1,1,4,1,Good,1
2,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-03T12:00,2015-01-03,...,13,20,1.0,1,1,1,4,1,Good,1
3,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-04T12:00,2015-01-04,...,2,5,1.0,1,1,1,4,1,Good,1
4,1,"New York-Newark, NY-NJ-CT-PA Combined Statisti...",New York City,NY,40.7128,-74.0060,Eastern,Standard,2015-01-05T12:00,2015-01-05,...,23,26,1.0,1,1,1,3,1,Good,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98839,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-12T12:00,2020-04-12,...,44,55,1.5,1,2,2,4,1,Good,1
98840,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-13T12:00,2020-04-13,...,14,49,1.0,1,1,1,4,1,Good,1
98841,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-14T12:00,2020-04-14,...,20,33,1.0,1,1,1,4,1,Good,1
98842,25,"San Antonio-New Braunfels-Pearsall, TX Combine...",San Antonio,TX,29.4241,-98.4936,Central,DaylightSavings,2020-04-15T12:00,2020-04-15,...,24,32,1.0,1,1,1,3,1,Good,1


#### Export the summary DataFrame to a timestamped CSV file

In [221]:
# Stamp the file name with the current date and time (YYYYMMDD_HHMM)
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
summaryCsvPath = f"{output_data_filepath}AirNowApiData_summary_{timestamp}.csv"

# Write the summary DataFrame as UTF-8 CSV, leaving out the row index
summaryApiData_df.to_csv(summaryCsvPath, index=False, encoding="utf-8")