In [1]:
# Dependencies
import requests
import json
import pandas as pd
from config import api_key

In [7]:
import param

# Base URL of the EPA AQS "dailyData/byCBSA" endpoint, already carrying the
# account email and API key. It is shared by every pollutant request, not
# just CO; the per-request query fields are appended to it later.
base_url = (
    "https://aqs.epa.gov/data/api/dailyData/byCBSA"
    "?email=rob_mm@msn.com"
    f"&key={api_key}"
)

In [8]:
def pull_data(param, param_code, bdate="20240101", edate="20240229", cbsa="16740", timeout=60):
    """Download EPA AQS daily-summary records for one pollutant as a DataFrame.

    The request goes to the ``dailyData/byCBSA`` endpoint held in the
    module-level ``base_url``. (The ``dailyData/byCounty`` variant would take
    ``state=37&county=183`` instead of ``cbsa``.)

    Parameters
    ----------
    param : str
        Human-readable pollutant label such as ``'CO'``. It is not sent to the
        API and does not influence the result; it is accepted only so that
        existing calls keep working.
    param_code : int or str
        Value placed in the ``param`` query field of the request.
    bdate, edate : str, optional
        Values for the ``bdate`` / ``edate`` query fields. The defaults
        reproduce the previously hard-coded 20240101-20240229 window.
    cbsa : str or int, optional
        Value for the ``cbsa`` query field (default ``16740``, as before).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``"Data"`` list, with the columns
        ``date``, ``state``, ``county``, ``latitude``, ``longitude``,
        ``sample_duration_code``, ``parameter``, ``arithmetic_mean`` and
        ``units_of_measure`` (in that order). An empty ``"Data"`` list yields
        an empty frame that still has these columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the response has no ``"Data"`` field, or a record lacks one of the
        expected keys.
    """
    # Output column -> key inside each AQS record. Insertion order defines the
    # column order of the resulting frame; only "date" is renamed on the way.
    field_map = {
        "date": "date_local",
        "state": "state",
        "county": "county",
        "latitude": "latitude",
        "longitude": "longitude",
        "sample_duration_code": "sample_duration_code",
        "parameter": "parameter",
        "arithmetic_mean": "arithmetic_mean",
        "units_of_measure": "units_of_measure",
    }

    url = base_url + f"&param={param_code}&bdate={bdate}&edate={edate}&cbsa={cbsa}"
    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures here instead of parsing an error body as data.
    response.raise_for_status()
    payload = response.json()

    try:
        records = payload["Data"]
    except KeyError:
        # Same exception type as before, but with the response header attached
        # so the reason for the missing data is visible.
        raise KeyError(
            f"AQS response for param={param_code} has no 'Data' field; "
            f"Header: {payload.get('Header')!r}"
        ) from None

    # Build one list per output column, then hand them to pandas in one go.
    return pd.DataFrame({
        column: [record[key] for record in records]
        for column, key in field_map.items()
    })

In [9]:
# Fetch one frame per pollutant; each pair is (label, parameter code) and the
# requests are issued in the order listed.
CO_df, SO2_df, NO2_df, PM10_df, PM25_df = [
    pull_data(label, param_code=code)
    for label, code in (
        ('CO', 42101),
        ('SO2', 42401),
        ('NO2', 42602),
        ('PM10', 81102),
        ('PM2.5', 88101),
    )
]

In [10]:
# Remove exact duplicate rows from every pollutant frame
CO_df, SO2_df, NO2_df, PM10_df, PM25_df = [
    frame.drop_duplicates()
    for frame in (CO_df, SO2_df, NO2_df, PM10_df, PM25_df)
]

In [11]:
# Merge the five pollutant frames into one wide frame.
# Every join is an inner join on the same measurement keys.
merge_keys = ['date', 'state', 'county', 'latitude', 'longitude', 'sample_duration_code']

PM_df = PM10_df.merge(PM25_df, how='inner', on=merge_keys, suffixes=('_PM10', '_PM25'))

pollution_df = (
    CO_df
    .merge(SO2_df, how='inner', on=merge_keys, suffixes=('_CO', '_SO2'))
    # The suffixes on this join are inert: after the joins above, the only
    # column names the two sides share are the merge keys.
    .merge(PM_df, how='inner', on=merge_keys, suffixes=('_CO', '_SO2'))
    # NO2 is joined last without suffixes, so its columns keep their plain
    # names: 'parameter', 'arithmetic_mean', 'units_of_measure'.
    .merge(NO2_df, how='inner', on=merge_keys)
)

In [None]:
# Show the merged rows whose date is 2024-01-01
on_first_of_january = pollution_df['date'] == '2024-01-01'
pollution_df.loc[on_first_of_january]


In [None]:
# Count how many times each distinct row occurs in the CO frame
CO_df.value_counts()

In [12]:
# Verify the units of measure: each pollutant should be reported in a single
# unit, so this should show one row only. The unsuffixed 'units_of_measure'
# column belongs to NO2.
unit_columns = [
    'units_of_measure_CO',
    'units_of_measure_SO2',
    'units_of_measure',
    'units_of_measure_PM25',
    'units_of_measure_PM10',
]
pollution_df.value_counts(subset=unit_columns)

units_of_measure_CO  units_of_measure_SO2  units_of_measure   units_of_measure_PM25        units_of_measure_PM10        
Parts per million    Parts per billion     Parts per billion  Micrograms/cubic meter (LC)  Micrograms/cubic meter (25 C)    51
Name: count, dtype: int64

In [13]:
# Summarise the merged frame: index range, column names, non-null counts and dtypes
pollution_df.info()

# Alternative checks (disabled): number of rows per date
# pollution_df.value_counts(subset=['date'])
# pollution_df['date'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   51 non-null     object 
 1   state                  51 non-null     object 
 2   county                 51 non-null     object 
 3   latitude               51 non-null     float64
 4   longitude              51 non-null     float64
 5   sample_duration_code   51 non-null     object 
 6   parameter_CO           51 non-null     object 
 7   arithmetic_mean_CO     51 non-null     float64
 8   units_of_measure_CO    51 non-null     object 
 9   parameter_SO2          51 non-null     object 
 10  arithmetic_mean_SO2    51 non-null     float64
 11  units_of_measure_SO2   51 non-null     object 
 12  parameter_PM10         51 non-null     object 
 13  arithmetic_mean_PM10   51 non-null     float64
 14  units_of_measure_PM10  51 non-null     object 
 15  paramete

In [14]:
# Keep the location/date columns plus one mean per pollutant, and give each
# mean a short pollutant name. The unsuffixed 'arithmetic_mean' column is NO2.
pollutant_names = {
    "arithmetic_mean_PM25": "PM2.5",
    "arithmetic_mean_PM10": "PM10",
    "arithmetic_mean": "NO2",
    "arithmetic_mean_SO2": "SO2",
    "arithmetic_mean_CO": "CO",
}
site_columns = ['date', 'state', 'county', 'latitude', 'longitude']

tmp_df = (
    pollution_df[site_columns + list(pollutant_names)]
    .rename(columns=pollutant_names)
)
tmp_df

Unnamed: 0,date,state,county,latitude,longitude,PM2.5,PM10,NO2,SO2,CO
0,2024-01-01,North Carolina,Mecklenburg,35.2401,-80.785683,7.791667,9.625,4.4625,0.120833,0.201826
1,2024-01-02,North Carolina,Mecklenburg,35.2401,-80.785683,5.708333,11.125,13.227273,0.226087,0.3465
2,2024-01-03,North Carolina,Mecklenburg,35.2401,-80.785683,9.666667,12.875,14.025,0.304167,0.389826
3,2024-01-04,North Carolina,Mecklenburg,35.2401,-80.785683,6.375,10.666667,8.868182,0.121739,0.225375
4,2024-01-05,North Carolina,Mecklenburg,35.2401,-80.785683,6.333333,11.375,10.2,0.233333,0.269261
5,2024-01-06,North Carolina,Mecklenburg,35.2401,-80.785683,4.125,5.75,4.154545,0.043478,0.205417
6,2024-01-07,North Carolina,Mecklenburg,35.2401,-80.785683,2.916667,5.958333,3.820833,0.033333,0.189957
7,2024-01-08,North Carolina,Mecklenburg,35.2401,-80.785683,4.458333,7.416667,10.622727,0.182609,0.275167
8,2024-01-09,North Carolina,Mecklenburg,35.2401,-80.785683,2.458333,5.458333,1.683333,0.008333,0.166913
9,2024-01-10,North Carolina,Mecklenburg,35.2401,-80.785683,2.0,7.238095,3.85,0.004348,0.184042


In [None]:
# Summarise the renamed frame: column names, non-null counts and dtypes
tmp_df.info()

Unnamed: 0,Age,City
0,25,New York
1,30,Los Angeles
2,35,Chicago
