### Producer: 2021 Air Quality USA by County

In [0]:
### imports

import requests
import json

In [0]:
# Error callback functions

def error_cb(err):
    """Handle a generic client-level error reported by the Kafka client.

    These errors are mostly informational: the client tries to recover on
    its own and the application normally has nothing to do. In this notebook
    two cases are treated as fatal and escalated as a KafkaException:
    no broker can be reached (_ALL_BROKERS_DOWN) and authentication failed
    (_AUTHENTICATION).

    Parameters:
        err: Error object passed in by the client; must expose code().
    """
    print("Client error: {}".format(err))

    fatal_codes = (KafkaError._ALL_BROKERS_DOWN, KafkaError._AUTHENTICATION)
    if err.code() in fatal_codes:
        # An exception raised from this callback is re-raised from the
        # flush() or poll() call that triggered it.
        raise KafkaException(err)


def acked(err, msg):
    """Print a one-line report for a single produce attempt.

    Parameters:
        err: None when the message was delivered; otherwise the error to
            report.
        msg: The message the report refers to; rendered with str().
    """
    if err is None:
        print("Message produced: %s" % (str(msg)))
        return
    print("Failed to deliver message: %s: %s" % (str(msg), str(err)))

In [0]:
# Connection strings

from confluent_kafka import Consumer
from time import sleep
import uuid
from confluent_kafka import Producer, Consumer, KafkaError, KafkaException
import json
from confluent_kafka.admin import AdminClient, NewTopic


# Confluent Cloud connection settings.
# Names and endpoints are plain literals; API keys and secrets are read from
# the "jadr_blob" secret scope through dbutils instead of being stored in the
# notebook.
confluentClusterName = "stage3talent"
confluentBootstrapServers = "pkc-ldvmy.centralus.azure.confluent.cloud:9092"
confluentTopicName = "jadr-AQI"
schemaRegistryUrl = "https://psrc-gq7pv.westus2.azure.confluent.cloud"

# Kafka cluster credentials
confluentApiKey = dbutils.secrets.get(scope="jadr_blob", key="confluentApiKey")
confluentSecret = dbutils.secrets.get(scope="jadr_blob", key="confluentSecret")

# Schema registry credentials
confluentRegistryApiKey = dbutils.secrets.get(scope="jadr_blob", key="confluentRegistryApiKey")
confluentRegistrySecret = dbutils.secrets.get(scope="jadr_blob", key="confluentRegistrySecret")

# Kafka admin client setup.
# Only connection and authentication settings are passed. 'group.id' and
# 'auto.offset.reset' are consumer-only properties that have no effect on an
# admin client, so they are not set here.
admin_client = AdminClient({
    'bootstrap.servers': confluentBootstrapServers,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': confluentApiKey,
    'sasl.password': confluentSecret,
    'error_cb': error_cb,  # escalates broker-down / authentication errors
})

In [0]:
# Kafka Producer setup.
# Only connection and authentication settings are passed. 'group.id' and
# 'auto.offset.reset' are consumer-only properties that a producer ignores,
# so they are not set here.
p = Producer({
    'bootstrap.servers': confluentBootstrapServers,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': confluentApiKey,
    'sasl.password': confluentSecret,
    'error_cb': error_cb,  # escalates broker-down / authentication errors
})

In [0]:
# Create the Kafka topic (1 partition, replication factor 3).
# create_topics() is asynchronous: it returns a dict of {topic name: future},
# and each future has to be resolved to learn whether creation succeeded.
futures = admin_client.create_topics(
    [NewTopic(confluentTopicName, num_partitions=1, replication_factor=3)]
)

for topic_name, future in futures.items():
    try:
        future.result()  # returns None on success, raises on failure
        print("Topic {} created".format(topic_name))
    except KafkaException as exc:
        # args[0] of a KafkaException is the underlying KafkaError.
        if exc.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS:
            # Expected when the notebook is re-run; the topic is reused.
            print("Topic {} already exists".format(topic_name))
        else:
            print("Failed to create topic {}: {}".format(topic_name, exc))
            raise

In [0]:
### The Producer Itself
### 2021 Air Quality data, all counties in US, annual data
# Documentation for API: https://aqs.epa.gov/aqsweb/documents/data_api.html#annual
# Example request: https://aqs.epa.gov/data/api/list/countiesByState?email=test@aqs.api&key=test&state=06


def fetch_aqs_rows(url, label, timeout=120, pause=5):
    """Request one AQS API URL and return the rows under its 'Data' key.

    A failed request is reported and treated as "no rows" so that one bad
    response does not abort the whole run.

    Parameters:
        url (str): Fully built request URL.
        label (str): Short description of the request, used in the failure
            message instead of the URL (the URL contains the API key).
        timeout (float): Seconds to wait for the HTTP response.
        pause (float): Seconds to sleep after every request, successful or
            not, so that consecutive requests are spaced out.

    Returns:
        list: The value stored under 'Data', or an empty list when the
        request failed, the body is not a JSON object, or 'Data' is
        missing or empty.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is printed: the message may embed the URL.
        print("AQS request failed ({}): {}".format(label, type(exc).__name__))
        return []
    finally:
        sleep(pause)

    rows = payload.get('Data') if isinstance(payload, dict) else None
    return rows or []


# Variable declarations
email = "dgerber@dev-10.com"
# AQS API key. Assigned to `key` only, so that confluentRegistrySecret keeps
# the schema-registry secret instead of being overwritten with this value.
key = dbutils.secrets.get(scope="jadr_blob", key="AQKey_dg")
date = '2021'
bdate = f"{date}0101"
edate = f"{date}1231"
col_to_keep = ["state_code", "county_code", "latitude", "longitude", "parameter", "metric_used", "method", "year", "units_of_measure",
               "state", "county", "city", "arithmetic_mean", "standard_deviation", "first_max_value", "second_max_value",
               "ninety_ninth_percentile", "cbsa_code"]

# This returns Lead (TSP) LC, Carbon monoxide, Sulfur dioxide, Nitrogen dioxide (NO2), Ozone, PM10 Total 0-10um STP, Lead PM10 LC FRM/FEM,  and PM2.5 - Local Conditions
# NOTE(review): the list above names eight pollutants but only seven codes
# follow (there appears to be no carbon monoxide code) - confirm the intended
# set, and check the API's limit on parameter codes per request.
params = "14129,42401,42602,44201,81102,85129,88101"

# To get all states: keep rows whose code is numeric and below 66, stored as
# zero-padded two-character strings.
states = []
for staterow in fetch_aqs_rows(f'https://aqs.epa.gov/data/api/list/states?email={email}&key={key}', label="states"):
    try:
        code = int(staterow['code'])
    except (KeyError, TypeError, ValueError):
        # Rows without a numeric code are skipped.
        continue
    if code < 66:
        states.append(str(code).zfill(2))

# Get data for each county from API, push to producer
for state in states:
    stateurl = f'https://aqs.epa.gov/data/api/list/countiesByState?email={email}&key={key}&state={state}'
    for countyrow in fetch_aqs_rows(stateurl, label=f"counties of state {state}"):
        county = countyrow.get('code')
        if county is None:
            continue
        URL = f'https://aqs.epa.gov/data/api/annualData/byCounty?email={email}&key={key}&param={params}&bdate={bdate}&edate={edate}&state={state}&county={county}'
        for aDict in fetch_aqs_rows(URL, label=f"annual data {state}-{county}"):
            # Keep only the wanted columns; a column missing from the row is
            # sent as null instead of aborting the run.
            record = {k: aDict.get(k) for k in col_to_keep}
            p.produce(confluentTopicName, json.dumps(record), callback=acked)
            p.poll(0)  # serve delivery callbacks without blocking
        # One flush per county: wait until that county's messages are delivered.
        p.flush()