# Big Data Management and Processing with Apache Spark & Kafka
***

Name: Wong Kai Lin
***


## Processing Data Stream 
### Event Producer 1:

Write a Python program that loads all the data from climate_streaming.csv and 
feeds one randomly chosen record (sampled with replacement) to the stream every 10 seconds. 

You will need to append additional information to each record, 
such as a producer identifier (to tell the producers apart) and 
the created date. 

In [25]:
# import statements
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
import pymongo
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import json

In [26]:
def connect_kafka_producer():
    """Create a KafkaProducer connected to the local broker.

    Connects to ``127.0.0.1:9092`` using ``api_version=(0, 10)``.

    Returns:
        The ``KafkaProducer`` instance, or ``None`` if constructing it raised
        an ``Exception`` (the error is printed instead of being propagated).
    """
    # NOTE: deliberately no ``finally: return ...`` here. A return inside
    # ``finally`` discards any in-flight exception, including
    # KeyboardInterrupt and SystemExit, which made the connection attempt
    # impossible to interrupt cleanly.
    try:
        return KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                             api_version=(0, 10))
    except Exception as ex:
        # Best-effort: failure is reported by printing and returning None.
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None

In [27]:
# retrieve last date from MongoDB collection

def last_date():
    """Return the latest ``date`` stored in the ``climate_hotspot`` collection.

    Connects to MongoDB with the default ``MongoClient()`` settings and reads
    from ``fit3182_assignment_db.climate_hotspot`` the single document that
    sorts first when ordered by ``date`` descending.

    Returns:
        datetime.datetime: the stored ``date`` value parsed with the
        ``%Y-%m-%d`` format.

    Raises:
        IndexError: if the collection contains no documents.
        ValueError: if the stored date string does not match ``%Y-%m-%d``.
    """
    # Making a connection with MongoDB
    client = MongoClient()
    try:
        # database -> collection
        climate_hotspot = client.fit3182_assignment_db.climate_hotspot

        # Project only the 'date' field and keep the first document in
        # descending 'date' order. The cursor is materialised into a list
        # before the client is closed.
        newest_docs = list(
            climate_hotspot.find({}, {'date': 1, '_id': 0})
            .sort('date', -1)
            .limit(1)
        )
    finally:
        # Always release the connection, even if the query fails.
        client.close()

    if not newest_docs:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that explains what is actually wrong.
        raise IndexError(
            'climate_hotspot collection is empty; no last date available.')

    return dt.datetime.strptime(newest_docs[0]['date'], "%Y-%m-%d")

In [28]:
def publish_message(producer_instance, topic_name, data, timeout=10):
    """Serialise ``data`` to JSON and publish it to a Kafka topic.

    The message value is the UTF-8 encoded JSON string and the message key is
    the fixed bytes ``b"P1"`` identifying this producer.

    Args:
        producer_instance: object exposing ``send(topic, value=..., key=...)``
            that returns a future with ``get(timeout=...)`` (as kafka-python's
            ``KafkaProducer`` does).
        topic_name: name of the topic to publish to.
        data: JSON-serialisable object to send.
        timeout: seconds to wait for the delivery result before treating the
            publish as failed.

    Returns:
        None. Any ``Exception`` raised while serialising, sending or waiting
        for delivery is caught and printed rather than propagated.
    """
    try:
        data = json.dumps(data)  # dictionary data of climate converted to json string

        value_bytes = data.encode('utf-8')  # encode json string data to bytes

        key_bytes = "P1".encode('utf-8')  # specific key for each producer

        future = producer_instance.send(topic_name, value=value_bytes, key=key_bytes)

        # send() only queues the record. Waiting on the returned future makes
        # delivery errors surface here, so the success message below is only
        # printed once the send has actually completed.
        future.get(timeout=timeout)

        print('Message published successfully. Data: ' + str(data))

    except Exception as ex:
        print('Exception in publishing message.')
        print(str(ex))

In [32]:
if __name__ == '__main__':
    # Event producer 1: every 10 seconds, publish one randomly chosen row of
    # climate_streaming.csv to the 'climate' topic, tagged with a producer id
    # and a date that advances by one day per message.

    topic = 'climate'

    print('Publishing records..')
    producer1 = connect_kafka_producer()
    if producer1 is None:
        # connect_kafka_producer() signals failure by returning None; without
        # this check the loop below would fail on every iteration forever.
        raise RuntimeError('Could not connect to Kafka; no records published.')

    # reading csv
    climate_csv = pd.read_csv("climate_streaming.csv")

    # Assign explicit column names (positional, so the CSV must have exactly
    # these eight columns). NOTE: 'precipitation ' keeps its trailing space;
    # it is emitted as a JSON key, so changing it would alter the message
    # schema.
    climate_csv.columns = ['latitude', 'longitude', 'air_temperature_celcius', 'relative_humidity', 'windspeed_knots', 'max_wind_speed', 'precipitation ', 'GHI']
    head = list(climate_csv.columns)  # column names

    # getting last date stored in climate data of Part A
    new_date = last_date()

    try:
        while True:
            # randrange(n) yields 0..n-1, so every row (including the last
            # one) can be selected; sampling is with replacement.
            row = random.randrange(len(climate_csv))

            # column name -> value for the selected row
            climate_json = {cols: climate_csv.loc[row, cols] for cols in head}

            # convert numpy.int64 column values to int for JSON
            # (assumes these two columns hold whole numbers; int() would
            # truncate fractional values -- TODO confirm against the CSV)
            climate_json['GHI'] = int(climate_json['GHI'])
            climate_json['air_temperature_celcius'] = int(climate_json['air_temperature_celcius'])

            # each 10-second message represents the next calendar day
            new_date += dt.timedelta(days=1)
            new_date_str = new_date.date().isoformat()

            data = {'date': new_date_str, 'producer': 1, 'climate': climate_json}  # creating json data

            publish_message(producer1, topic, data)

            sleep(10)  # feed data to spark streaming every 10 seconds
    finally:
        # Runs on KeyboardInterrupt (or any error) so the producer's
        # connection is released when the stream is stopped.
        producer1.close()

Publishing records..
Message published successfully. Data: {"date": "2019-01-01", "climate": {"windspeed_knots": 8.5, "precipitation ": " 0.08G", "air_temperature_celcius": 13, "longitude": 143.7132, "GHI": 111, "relative_humidity": 50.1, "latitude": -36.369, "max_wind_speed": 12.0}, "producer": 1}
Message published successfully. Data: {"date": "2019-01-02", "climate": {"windspeed_knots": 4.1, "precipitation ": " 0.00I", "air_temperature_celcius": 21, "longitude": 143.281, "GHI": 176, "relative_humidity": 52.8, "latitude": -36.94, "max_wind_speed": 11.1}, "producer": 1}
Message published successfully. Data: {"date": "2019-01-03", "climate": {"windspeed_knots": 7.7, "precipitation ": " 0.00I", "air_temperature_celcius": 17, "longitude": 143.1847, "GHI": 143, "relative_humidity": 52.5, "latitude": -36.0005, "max_wind_speed": 16.9}, "producer": 1}
Message published successfully. Data: {"date": "2019-01-04", "climate": {"windspeed_knots": 5.7, "precipitation ": " 0.00I", "air_temperature_c

KeyboardInterrupt: 