In [1]:
# Dependencies
import requests
import json
import pandas as pd
from config import api_key

baycrane76


In [2]:
# EPA AQS endpoint for daily summaries by CBSA. The account e-mail and the API
# key travel as query parameters; further parameters are appended by the caller.
base_url = f"https://aqs.epa.gov/data/api/dailyData/byCBSA?email=rob_mm@msn.com&key={api_key}"

# Query window (begin date, end date) and the CBSA code to request.
bdate, edate, cbsa = 20230601, 20231231, 16740

In [3]:
def pull_data(param, param_code, timeout=120):
    """Download the daily rows for one parameter code and return them as a DataFrame.

    The request URL is built from the module-level ``base_url`` (dailyData/byCBSA
    endpoint) plus the module-level ``bdate``, ``edate`` and ``cbsa`` settings.

    Parameters
    ----------
    param : str
        Human-readable label for the parameter (e.g. 'CO'). It is not sent to
        the API and does not affect the result; it only documents the call site.
    param_code : int or str
        Value placed in the ``param`` query field of the request.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per element of the response's "Data" list, with the columns
        date, state, county, latitude, longitude, sample_duration_code,
        parameter, arithmetic_mean and units_of_measure. An empty "Data" list
        yields an empty frame that still has these columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the JSON body has no "Data" entry or a row lacks an expected field.
    """
    # Output column -> key in each API row (only the date is renamed).
    # The dict order is the column order of the returned frame.
    field_map = {
        "date": "date_local",
        "state": "state",
        "county": "county",
        "latitude": "latitude",
        "longitude": "longitude",
        "sample_duration_code": "sample_duration_code",
        "parameter": "parameter",
        "arithmetic_mean": "arithmetic_mean",
        "units_of_measure": "units_of_measure",
    }

    url = base_url + f"&param={param_code}&bdate={bdate}&edate={edate}&cbsa={cbsa}" # dailyData/byCBSA
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    rows = response.json()["Data"]

    # Build one list per column (same dict-of-lists shape the DataFrame
    # constructor received before, so dtype inference is unchanged).
    df = pd.DataFrame({
        column: [row[key] for row in rows]
        for column, key in field_map.items()
    })

    return df

In [4]:
# Fetch one DataFrame per measured parameter. The (label, code) pairs are listed
# in the same order as the target names, and the requests run in that order.
(
    CO_df,
    SO2_df,
    NO2_df,
    PM10_df,
    PM25_df,
    Temperature_df,
    Humidity_df,
) = (
    pull_data(label, param_code=code)
    for label, code in [
        ('CO', 42101),
        ('SO2', 42401),
        ('NO2', 42602),
        ('PM10', 81102),
        ('PM2.5', 88101),
        ('Temperature', 68105),
        ('Humidity', 62201),
    ]
)

In [5]:
# Strip exact duplicate rows from every per-parameter frame.
CO_df, SO2_df, NO2_df, PM10_df, PM25_df, Humidity_df, Temperature_df = (
    frame.drop_duplicates()
    for frame in (CO_df, SO2_df, NO2_df, PM10_df, PM25_df, Humidity_df, Temperature_df)
)

In [6]:
# Combine the per-parameter frames into one wide table with outer joins.

# Date/site identity shared by every frame; the pollutant merges additionally
# match on the sample duration code.
site_keys = ['date', 'state', 'county', 'latitude', 'longitude']
pollutant_keys = site_keys + ['sample_duration_code']

# CO + SO2, then NO2. Nothing collides with NO2's value columns at that point,
# so they keep their unsuffixed names (e.g. 'arithmetic_mean').
Oxides_df = pd.merge(CO_df, SO2_df, how='outer', on=pollutant_keys, suffixes=('_CO', '_SO2'))
Oxides_df = pd.merge(Oxides_df, NO2_df, how='outer', on=pollutant_keys)

# PM10 + PM2.5
PM_df = pd.merge(PM10_df, PM25_df, how='outer', on=pollutant_keys, suffixes=('_PM10', '_PM25'))

# Temperature + Humidity: joined on date/site only, so their sample duration
# columns are carried along with the suffixes below.
TH_df = pd.merge(Temperature_df, Humidity_df, how='outer', on=site_keys, suffixes=('_Temperature', '_Humidity'))

# Oxides + PM on the full key, then Temperature/Humidity on date/site only.
tmp_df = pd.merge(Oxides_df, PM_df, how='outer', on=pollutant_keys)
pollution_df = pd.merge(tmp_df, TH_df, how='outer', on=site_keys)


In [7]:
# Sanity check on units: count the distinct combinations of unit labels across
# the merged table. A single row in the result means each parameter is reported
# in one unit throughout. The unsuffixed 'units_of_measure' column is NO2's.
unit_columns = [
    'units_of_measure_CO',
    'units_of_measure_SO2',
    'units_of_measure',
    'units_of_measure_PM25',
    'units_of_measure_PM10',
    'units_of_measure_Temperature',
    'units_of_measure_Humidity',
]
pollution_df.value_counts(subset=unit_columns)

units_of_measure_CO  units_of_measure_SO2  units_of_measure   units_of_measure_PM25        units_of_measure_PM10          units_of_measure_Temperature  units_of_measure_Humidity
Parts per million    Parts per billion     Parts per billion  Micrograms/cubic meter (LC)  Micrograms/cubic meter (25 C)  Degrees Centigrade            Percent relative humidity    68
Name: count, dtype: int64

In [8]:
# Keep the date/location columns plus each parameter's arithmetic mean, and
# rename every mean column to the parameter it holds. The unsuffixed
# 'arithmetic_mean' column is the NO2 one.
mean_labels = {
    'arithmetic_mean_Temperature': 'Temperature',
    'arithmetic_mean_Humidity': 'Humidity',
    'arithmetic_mean_PM25': 'PM2.5',
    'arithmetic_mean_PM10': 'PM10',
    'arithmetic_mean': 'NO2',
    'arithmetic_mean_SO2': 'SO2',
    'arithmetic_mean_CO': 'CO',
}
kept_columns = ['date', 'state', 'county', *mean_labels, 'latitude', 'longitude']

pollution_df = pollution_df[kept_columns].rename(columns=mean_labels)
pollution_df

Unnamed: 0,date,state,county,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,latitude,longitude
0,2023-06-01,North Carolina,Mecklenburg,,,,10.705882,,,,35.123954,-80.907577
1,2023-06-01,North Carolina,Mecklenburg,,64.625,10.791667,,11.508333,,0.293957,35.213171,-80.874084
2,2023-06-01,North Carolina,Mecklenburg,,64.625,10.700000,,,,,35.213171,-80.874084
3,2023-06-01,North Carolina,Mecklenburg,,64.625,,,,,0.304167,35.213171,-80.874084
4,2023-06-01,North Carolina,Mecklenburg,,67.875,9.166667,16.916667,5.245833,0.187500,0.236435,35.240100,-80.785683
...,...,...,...,...,...,...,...,...,...,...,...,...
3510,2023-12-31,South Carolina,York,,,7.900000,,,,,34.912700,-80.874500
3511,2023-12-31,South Carolina,York,,,,,,-0.129167,,34.977000,-81.207000
3512,2023-12-31,South Carolina,York,,,,,,-0.128819,,34.977000,-81.207000
3513,2023-12-31,South Carolina,York,,,,,,-0.100000,,34.977000,-81.207000


In [9]:
# Keep only the rows in which every column has a value, and renumber the
# surviving rows from 0.
filtered_df = pollution_df.dropna(
    axis=0,
    how='any',
    ignore_index=True,
)


# For Cross-Checking Only

In [None]:
# Cross-check helper: pull the source rows for one date / site / sample duration
# out of a per-parameter frame so they can be compared with filtered_df.
df = PM25_df

same_day = df['date'].eq('2023-06-02')
same_area = df['state'].eq('North Carolina') & df['county'].eq('Mecklenburg')
same_duration = df['sample_duration_code'].eq('1')
same_site = df['latitude'].eq(35.2401) & df['longitude'].eq(-80.785683)

df.loc[same_day & same_area & same_duration & same_site]

In [None]:
# Rows of the filtered table for a single date.
filtered_df.loc[filtered_df['date'].eq('2023-06-02')]

# Population Density

In [10]:
# Load the CBSA table and keep only the identifier, name and POP_SQMI columns.
cbsa_columns = ['CBSA_ID', 'NAME', 'POP_SQMI']
CBSA_df = pd.read_csv('USA_Core_Based_Statistical_Area.csv').loc[:, cbsa_columns]

In [11]:
# Index the CBSA table by CBSA_ID so the density can be looked up by code.
# Guarded and not done in place: once CBSA_ID has become the index it is no
# longer a column, so an unconditional set_index would raise KeyError when this
# cell is run a second time.
if CBSA_df.index.name != 'CBSA_ID':
    CBSA_df = CBSA_df.set_index('CBSA_ID')

SQKM_PER_SQMI = 2.58998811  # square kilometres in one square mile

# Density for the CBSA selected by `cbsa`, in people per square mile and per km2.
Pop_SqMi = CBSA_df.at[cbsa, 'POP_SQMI']
Pop_SqKm = Pop_SqMi / SQKM_PER_SQMI
print(f"Pop_SqKm: {Pop_SqKm} people/km2")
print(f"Pop_SqMi: {Pop_SqMi} people/mi2")

Pop_SqKm: 173.36373022963411 people/km2
Pop_SqMi: 449.01 people/mi2


In [14]:
import numpy as np

# Attach the CBSA-wide population density (people/km2) to every row.
# assign() returns a new frame, which avoids writing through .loc into the
# result of dropna() (pandas can flag that as a SettingWithCopyWarning) and
# keeps this cell safe to re-run.
filtered_df = filtered_df.assign(Population_Density=Pop_SqKm)

# Persist the final table; the file name encodes the CBSA code and date window.
filtered_df.to_csv(f'EPA_Data_{cbsa}-{bdate}to{edate}.csv', index=False)

# Preview as the cell's last expression. A bare expression in the middle of a
# cell is never displayed, so the preview has to come last.
filtered_df.head()