In [None]:
# import statements
import uuid
from time import sleep
import datetime
from kafka import KafkaProducer
import random
import copy as copy_util


def publish_message(producer_instance, topic, key, value):
    """
    Publish message to Kafka
    :param producer_instance: Object exposing ``send(topic, value=..., key=...)``
    :param topic: Name of the topic the record is sent to
    :param key: Should be unique to achieve parallelism
    :param value: record to send

    Publishing is best-effort: any exception raised while sending is printed
    and swallowed, so the caller's loop keeps running.
    """
    try:
        producer_instance.send(topic, value=value, key=key)
        # str() keeps the log line working for non-string payloads; a bare
        # concatenation would raise TypeError here and be misreported below as
        # a publishing failure even though send() had already succeeded.
        # NOTE(review): if producer_instance is a kafka-python KafkaProducer,
        # send() only queues the record, so "successfully" means accepted,
        # not delivered -- confirm against the library docs.
        print('Message published successfully. ' + str(value))
    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex))


def connect_kafka_producer():
    """
    Create a Kafka producer connected to the broker at localhost:9092.

    Keys and values are expected to be ``str``; both serializers encode them
    as UTF-8 before sending.

    :return: The connected ``KafkaProducer``, or ``None`` when the connection
             attempt raised an exception (the error is printed).
    """
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                  value_serializer=lambda x: x.encode('utf-8'),
                                  key_serializer=lambda x: x.encode('utf-8'),
                                  api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
    # Return after the try block rather than from a ``finally`` clause: a
    # ``return`` inside ``finally`` silently discards any in-flight exception,
    # including KeyboardInterrupt and SystemExit.
    return _producer


def get_record(header, data_row):
    """
    Build a single typed record from one row of raw values
    :param header: A list of field names; surrounding whitespace is ignored.
    :param data_row: A list of raw values, positionally aligned with header.
    :return: A dict mapping each stripped field name to its converted value
             (latitude/longitude as float, temperature/confidence as int).
    :raises ValueError: If the lengths differ or a field name is not recognised.
    """
    if len(header) != len(data_row):
        raise ValueError('Data might be corrupted -> header length does not match record length')

    # One converter per supported field; anything else is rejected below.
    converters = {
        "latitude": float,
        "longitude": float,
        "surface_temperature_celcius": int,
        "confidence": int,
    }

    parsed = {}
    for raw_name, raw_value in zip(header, data_row):
        field = raw_name.strip()
        convert = converters.get(field)
        if convert is None:
            raise ValueError("Failed to catch all the data fields")
        parsed[field] = convert(raw_value)

    return parsed


def parse_file(file_location):
    """
    Get raw data from the file and parse it to a list of dictionary

    The first line is treated as the comma-separated header; every following
    non-blank line is split on commas and converted with ``get_record``.

    :param file_location: CSV file where the new table is derived from

    :return: A list of dictionary that represents the records; empty when the
             file is empty or contains only a header
    :raises ValueError: Propagated from ``get_record`` for malformed rows
    """
    # The context manager closes the handle even when a row fails to parse.
    with open(file_location) as hotspot_AQUA_file:
        file_as_list = hotspot_AQUA_file.readlines()

    if not file_as_list:
        return []  # empty file: no header, therefore no records

    header = file_as_list[0].replace("\n", "").split(",")  # get a list of field name

    data_to_insert = []  # a list of hotspot records to insert to the database
    for raw_data in file_as_list[1:]:  # loop all rows except for the header
        if not raw_data.strip():
            # Blank lines (commonly a trailing newline at end of file) carry
            # no data and would otherwise be reported as corrupted rows.
            continue
        data_row = raw_data.replace("\n", "").split(",")
        data_to_insert.append(get_record(header, data_row))

    return data_to_insert


def start_streaming(producer, data_list, topic, interval, sender_id):
    """
    Initiate the streaming process by sending records from the given list to kafka in an arbitrary order

    Runs until interrupted: each iteration picks one record at random, tags a
    copy of it with the sender id and the current local time, publishes it and
    then sleeps for ``interval`` seconds.

    :param producer: Producer instance handed to ``publish_message``
    :param data_list: A list of records to stream. Records are randomly selected to push to kafka
    :param topic: The topic of the record
    :param interval: The time interval of streaming data arrival, in seconds
    :param sender_id: A id for the consumer to identify the producers
    :raises ValueError: If ``data_list`` is empty
    """
    # Fail fast with a clear message instead of the opaque "empty range"
    # error the random draw would raise on the first iteration.
    if len(data_list) == 0:
        raise ValueError('data_list must contain at least one record to stream')

    while True:
        # Copy so the added metadata never leaks back into the source record.
        selected_record = copy_util.copy(random.choice(data_list))

        selected_record["sender_id"] = sender_id
        selected_record["created_time"] = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        # push the data to kafka; the payload is the dict's str() form and the
        # key is a fresh UUID per message
        publish_message(producer_instance=producer, topic=topic, key=str(uuid.uuid4()), value=str(selected_record))

        sleep(interval)




print("Publishing hostpot AQUA steaming...")

# get data from the file
data_list_to_stream = parse_file("/Users/frank/Desktop/Sem3/FIT5148/Assignment/data/hotspot_AQUA_streaming.csv")

producer2 = connect_kafka_producer()  # connect to kafka as a producer; None when the connection failed

# The interval is drawn once, so this run sends one record every `interval`
# seconds, where `interval` is a whole number between 10 and 30 inclusive.
interval = random.randrange(10, 30 + 1)

if producer2 is None:
    # Without a producer every publish attempt would fail and the streaming
    # loop would print the same error forever, so do not start it.
    print('Kafka producer is unavailable; streaming was not started.')
else:
    start_streaming(producer2, data_list_to_stream, "temperature_analysis", interval, "AQUA")  # initiate the streaming process



Publishing hostpot AQUA steaming...
Message published successfully. {'latitude': -34.8368, 'longitude': 141.5197, 'confidence': 83, 'surface_temperature_celcius': 56, 'sender_id': 'AQUA', 'created_time': '2019-05-24T10:38:46'}
Message published successfully. {'latitude': -37.4583, 'longitude': 148.2459, 'confidence': 81, 'surface_temperature_celcius': 39, 'sender_id': 'AQUA', 'created_time': '2019-05-24T10:39:13'}
Message published successfully. {'latitude': -37.1089, 'longitude': 143.0783, 'confidence': 82, 'surface_temperature_celcius': 55, 'sender_id': 'AQUA', 'created_time': '2019-05-24T10:39:39'}
Message published successfully. {'latitude': -36.6136, 'longitude': 142.5022, 'confidence': 81, 'surface_temperature_celcius': 54, 'sender_id': 'AQUA', 'created_time': '2019-05-24T10:40:05'}
Message published successfully. {'latitude': -37.0827, 'longitude': 143.8836, 'confidence': 72, 'surface_temperature_celcius': 47, 'sender_id': 'AQUA', 'created_time': '2019-05-24T10:40:31'}
Message p