# Big data management and processing with Apache Spark & Kafka
***

Name: Wong Kai Lin
***


## Processing Data Stream 
### Event Producer 2:

Write a Python program that loads all the data from hotspot_AQUA_streaming.csv and 
randomly (with replacement) feeds the data to the stream every 2 seconds. 

AQUA is the satellite from NASA that reports latitude, longitude, confidence and surface temperature of a location. 

You will need to append additional information, 
such as producer information to identify the producer and 
the creation date & time. 

In [1]:
# import statements
from time import sleep
from json import dumps
from kafka import KafkaProducer
from kafka import KafkaConsumer
import datetime as dt
import random
import datetime as dt
from datetime import datetime
import pymongo
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import json

In [2]:
def connect_kafka_producer():
    """Create a Kafka producer connected to the broker at 127.0.0.1:9092.

    Returns:
        The ``KafkaProducer`` instance, or ``None`` when constructing it
        raised an exception (the error is printed, not re-raised).
    """
    try:
        return KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        # Return from the handler rather than from a ``finally`` clause: a
        # ``return`` inside ``finally`` would also discard KeyboardInterrupt
        # and SystemExit raised while connecting.
        return None

In [9]:
# retrieve current climate date in Kafka

def climate_date():
    """Return the ``date`` field of the last message on the ``climate`` topic.

    A consumer is opened on ``localhost:9092`` starting from the earliest
    available offset and iterated until no message arrives for 10 seconds;
    only the last message seen is decoded. Because every retained message is
    read, the call gets slower as the topic grows.

    Returns:
        The value stored under ``'date'`` in the last message, or ``None``
        when the consumer could not be created, no message was received, or
        the last message could not be decoded (the error is printed).
    """
    topic = 'climate'

    def connect_kafka_consumer():
        """Open a consumer on ``topic``; return ``None`` if that fails."""
        try:
            return KafkaConsumer(topic,
                                 consumer_timeout_ms=10000,  # stop iteration if no message after 10 sec
                                 auto_offset_reset='earliest',  # remove to skip already-published messages
                                 bootstrap_servers=['localhost:9092'],
                                 api_version=(0, 10))
        except Exception as ex:
            print('Exception while connecting Kafka')
            print(str(ex))
            return None

    def consume_messages(consumer):
        """Drain ``consumer`` and return the ``date`` of its last message."""
        latest = None
        try:
            # only keep the latest climate record
            for msg in consumer:
                latest = msg

            if latest is None:
                print('No message received from topic ' + topic)
                return None

            # The message value is the UTF-8 encoded JSON document, so it can
            # be parsed directly without being split and re-joined first.
            climate_json = json.loads(latest.value.decode('utf-8'))
            return climate_json['date']

        except Exception as ex:
            print(str(ex))
            return None

        finally:
            # Release the broker connection whether or not decoding succeeded.
            consumer.close()

    consumer = connect_kafka_consumer()
    if consumer is None:
        return None
    return consume_messages(consumer)
        
#climate_date()        

In [4]:
def publish_message(producer_instance, topic_name, data):
    """Send ``data`` as a UTF-8 JSON message to ``topic_name`` with key ``P2``.

    Args:
        producer_instance: object exposing ``send(topic, value=..., key=...)``.
        topic_name: name of the topic to publish to.
        data: JSON-serialisable record to publish.

    Any exception raised while serialising or sending is printed and
    swallowed; nothing is returned.
    """
    try:
        payload = json.dumps(data)  # record rendered as a JSON string

        producer_instance.send(
            topic_name,
            value=payload.encode('utf-8'),
            key="P2".encode('utf-8'),  # fixed key identifying this producer
        )

        print('Message published successfully. Data: ' + payload)

    except Exception as err:
        print('Exception in publishing message.')
        print(str(err))

In [17]:
if __name__ == '__main__':
    # Publish a randomly chosen row (sampled with replacement) of the AQUA
    # hotspot CSV to the 'hotspot' topic every 2 seconds until interrupted.

    topic = 'hotspot'
    print('Publishing records..')
    producer2 = connect_kafka_producer()
    if producer2 is None:
        # The loop below never reconnects, so without a producer every
        # publish attempt would fail forever; stop instead.
        raise SystemExit('Kafka producer unavailable; nothing was published.')

    # reading csv
    hotspot_aqua = pd.read_csv("hotspot_AQUA_streaming.csv")
    head = list(hotspot_aqua.columns)  # column names

    try:
        while True:
            # randrange() already excludes its upper bound, so pass the row
            # count itself; subtracting 1 would make the last row unreachable.
            row = random.randrange(len(hotspot_aqua))

            # column name -> value for the selected row
            hotspot_aqua_json = {cols: hotspot_aqua.loc[row, cols] for cols in head}

            # convert numpy.int64 column values to int for JSON
            hotspot_aqua_json['confidence'] = int(hotspot_aqua_json['confidence'])
            hotspot_aqua_json['surface_temperature_celcius'] = int(hotspot_aqua_json['surface_temperature_celcius'])

            # The record is stamped with the local wall-clock date/time.
            # climate_date() is not used here because it loops to the last
            # message of the climate topic and takes too long.
            now = datetime.now()
            date = now.strftime("%Y-%m-%d")
            current_time = now.strftime("%H:%M:%S")

            data = {'producer': 2, 'climate': hotspot_aqua_json, 'date': date, 'time': current_time}

            publish_message(producer2, topic, data)

            sleep(2)  # feed data to spark streaming every 2 seconds
    finally:
        # Runs on KeyboardInterrupt too: deliver anything still buffered and
        # release the broker connection before the exception propagates.
        producer2.flush()
        producer2.close()

Publishing records..
Message published successfully. Data: {"climate": {"surface_temperature_celcius": 49, "longitude": 142.2303, "confidence": 76, "latitude": -36.4483}, "date": "2021-05-23", "producer": 2, "time": "21:40:01"}
Message published successfully. Data: {"climate": {"surface_temperature_celcius": 67, "longitude": 141.618, "confidence": 90, "latitude": -36.7903}, "date": "2021-05-23", "producer": 2, "time": "21:40:03"}
Message published successfully. Data: {"climate": {"surface_temperature_celcius": 75, "longitude": 146.2035, "confidence": 95, "latitude": -37.8906}, "date": "2021-05-23", "producer": 2, "time": "21:40:05"}
Message published successfully. Data: {"climate": {"surface_temperature_celcius": 58, "longitude": 141.4541, "confidence": 84, "latitude": -36.7381}, "date": "2021-05-23", "producer": 2, "time": "21:40:07"}
Message published successfully. Data: {"climate": {"surface_temperature_celcius": 93, "longitude": 143.5753, "confidence": 100, "latitude": -37.297}, "d

KeyboardInterrupt: 