In [0]:
import requests
import pyspark.sql.functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.functions import *
from pyspark.sql.types import StringType,DecimalType
from pyspark.sql.functions import input_file_name, substring
from pyspark.sql.functions import isnan, when, count, col

# Kafka producer and consumer imports
from confluent_kafka import Consumer
from time import sleep
import uuid
from confluent_kafka import Producer, Consumer, KafkaError, KafkaException
import json
from confluent_kafka.admin import AdminClient, NewTopic



**Mount the `healthcare-capstone-group3` storage container and pull in the CSV files**

In [0]:
###### Mount point 1, authenticated through OAuth (client credentials).
# NOTE(security): the client id / client secret below are hard-coded in the
# notebook source. They should be rotated and read from a secret store
# (e.g. a Databricks secret scope via dbutils.secrets.get) instead.
storageAccount = "gen10datafund2207"
storageContainer = "healthcare-capstone-group3"
clientSecret = "Cty8Q~AvEO_qC-MjvPvosYauiNsffOHKnMpj7cmd"
clientid = "2ca50102-5717-4373-b796-39d06568588d"
mount_point = "/mnt/healthcare/cleandataIn" # the mount point will be unique to you

# Hadoop ABFS settings passed to the mount call as extra_configs.
configs = {"fs.azure.account.auth.type": "OAuth",
       "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
       "fs.azure.account.oauth2.client.id": clientid,
       "fs.azure.account.oauth2.client.secret": clientSecret,
       "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
       "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

# Remount from scratch so the cell can be re-run. Only unmount when the mount
# point actually exists, instead of swallowing every error with a bare except.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

dbutils.fs.mount(
    source=f"abfss://{storageContainer}@{storageAccount}.dfs.core.windows.net/",
    mount_point=mount_point,
    extra_configs=configs)

/mnt/healthcare/cleandataIn has been unmounted.
Out[2]: True

*Loading the health insurance characteristics CSV file (`HealthInsuranceChar.csv`) into a DataFrame*

In [0]:
# Read the CSV from the mounted container, treating the first line as the header.
health = (
    spark.read
    .option('header', 'True')
    .csv('/mnt/healthcare/cleandataIn/CleanedData/HealthInsuranceChar.csv')
)

**Kafka Producer**

In [0]:
# Error functions
def error_cb(err):
    """Client-level error callback shared by the Kafka clients.

    Every error is printed. Most client errors are informational because the
    client tries to recover on its own, so nothing else is done for them.
    The two exceptions are "all brokers down" and authentication failures:
    for those a KafkaException is raised to stop the application.

    Parameters:
        err: Error object handed over by the client; its code() is inspected.

    Raises:
        KafkaException: when the error code is _ALL_BROKERS_DOWN or
            _AUTHENTICATION.
    """
    print("Client error: {}".format(err))

    fatal_codes = (KafkaError._ALL_BROKERS_DOWN, KafkaError._AUTHENTICATION)
    if err.code() in fatal_codes:
        # An exception raised here is re-raised from the flush() or poll()
        # call that triggered the callback.
        raise KafkaException(err)


def acked(err, msg):
    """
        Delivery callback for produced messages: prints whether the message
        was delivered or not.

        Parameters:
            err: None when delivery succeeded, otherwise the delivery error.
            msg: The message the report refers to (printed via str()).
    """
    if err is None:
        print("Message produced: %s" % (str(msg)))
        return

    print("Failed to deliver message: %s: %s" % (str(msg), str(err)))

In [0]:
# Kafka variables. Move to the OS variables or configuration.
# This will work in a local Jupyter Notebook, but in Databricks hiding a config.py is tougher.
# NOTE(security): the API keys and secrets below are hard-coded in the notebook
# source. They should be rotated and loaded from a secret store (e.g. a
# Databricks secret scope via dbutils.secrets.get) instead.
confluentClusterName = "stage3talent"
confluentBootstrapServers = "pkc-ldvmy.centralus.azure.confluent.cloud:9092"

# Topic name
confluentTopicName = "healthcare-insurance"

schemaRegistryUrl = "https://psrc-gq7pv.westus2.azure.confluent.cloud"
confluentApiKey = "YHMHG7E54LJA55XZ"
confluentSecret = "/XYn+w3gHGMqpe9l0TWvA9FznMYNln2STI+dytyPqtZ9QktH0TbGXUqepEsJ/nR0"
confluentRegistryApiKey = "YHMHG7E54LJA55XZ"
confluentRegistrySecret = "/XYn+w3gHGMqpe9l0TWvA9FznMYNln2STI+dytyPqtZ9QktH0TbGXUqepEsJ/nR0"

# Connection/authentication settings shared by the admin client and the producer.
# 'group.id' and 'auto.offset.reset' are consumer-only properties: on an admin
# client or a producer they have no effect and only trigger a configuration
# warning, so they are intentionally not set here.
kafka_client_config = {
    'bootstrap.servers': confluentBootstrapServers,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': confluentApiKey,
    'sasl.password': confluentSecret,
    'error_cb': error_cb,
}

### Admin client (used to delete and create the topic)
admin_client = AdminClient(dict(kafka_client_config))

# Kafka producer (used to publish the records)
p = Producer(dict(kafka_client_config))

**Health Insurance Status Producer**

In [0]:
### Deleting topic
# delete_topics() hands back one future per topic; each future is resolved
# separately so the outcome is reported per topic.
try:
    topics = [confluentTopicName]
    fs = admin_client.delete_topics(topics, request_timeout=30)

    for topic_name, delete_future in fs.items():
        try:
            delete_future.result()  # The result itself is None
        except Exception as delete_error:
            print("Failed to delete topic {}: {}".format(topic_name, delete_error))
        else:
            print("Topic {} deleted".format(topic_name))

except Exception as request_error:
    # The delete request itself could not be issued.
    print(request_error)

Topic healthcare-insurance deleted


In [0]:
# Keys of the JSON records, in the positional order of the DataFrame columns.
# NOTE(review): 'High_School_or_Equivalet' looks like a typo, but it is the key
# that gets published, so it is kept as-is — confirm with consumers before renaming.
health_record_keys = [
    'State', 'County', 'Population_Category', 'Total_Population',
    'Under_6_Y', '_6_to_18_Y', '_19_to_25_Y', '_26_to_34_Y', '_35_to_44_Y',
    '_45_to_54_Y', '_55_to_64_Y', '_65_to_74_Y', '_75_and_Older',
    'Male', 'Female',
    'White', 'African_American', 'American_Indian', 'Asian',
    'Pacific_Islander', 'Some_Other_Race', 'Hispanic',
    'Native_Born', 'Foreign_Born', 'Naturalized', 'Not_a_Citizen',
    'Less_than_High_School', 'High_School_or_Equivalet', 'Some_College',
    'Bachelors_Degree_or_Higher',
    'Under_25000S', '_25000_to_49999S', '_50000_to_74999S',
    '_75000_to_99999S', 'Over_100000S',
]

# Collect the DataFrame to the driver exactly once. The previous version called
# health.collect() inside the loop, re-collecting every row for each record.
health_rows = health.collect()

# One dict per row; values are taken by column position, so a row with fewer
# columns than keys still raises IndexError.
healthdict = [
    {key: row[position] for position, key in enumerate(health_record_keys)}
    for row in health_rows
]

In [0]:
### Creating topic
# Imported here because only this cell's error path needs it; the original cell
# called traceback.format_exc() without the module ever being imported.
import traceback

# One topic with 1 partition and a replication factor of 3.
topic_list = [NewTopic(confluentTopicName, num_partitions=1, replication_factor=3)]
futures = admin_client.create_topics(topic_list)

try:
    record_metadata = []
    for k, future in futures.items():
        print(f"type(k): {type(k)}")
        print(f"type(v): {type(future)}")
        # result() returns None on success and raises on failure.
        print(future.result())

except KafkaException:
    # Failures of the admin futures surface as KafkaException (which wraps a
    # KafkaError), so that is the type that has to be caught here.
    print(traceback.format_exc())
    result = 'Fail'
finally:
    print("finally")

type(k): <class 'str'>
type(v): <class 'concurrent.futures._base.Future'>
None
finally


In [0]:
## Health Insurance Producer
# Publishes every record of healthdict to the topic as a JSON string.
# Iterating over the already-built list avoids running health.count() (a Spark
# job) on every loop iteration, and flushing once at the end lets the producer
# batch messages instead of blocking on a broker round-trip per record.

for record in healthdict:
    payload = json.dumps(record)
    try:
        # produce(topic, message in JSON)
        p.produce(confluentTopicName, payload)
    except BufferError:
        # Local producer queue is full: give it time to drain, then retry once.
        p.poll(1)
        p.produce(confluentTopicName, payload)
    # Serve delivery/error callbacks without blocking.
    p.poll(0)

# Block until every queued message has been delivered (or has failed).
p.flush()