# FIT3182 - Big data management and processing

Name: Cheong Karr Kei

Student ID: 30091497

Email: kche0070@student.monash.edu


# Assignment Part B #

**Task 1. Processing Stream Data**

**Event Producer 1**

This notebook implements Event Producer 1, which loads climate data from `climate_streaming.csv`. It acts as one of the producers for our stream, publishing one climate record every 10 seconds to the topic `FIT3182_Assignment`. The key for this producer is set to `P1`.


In [None]:
# import statements
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
from datetime import timedelta
import csv
from pymongo import MongoClient
from pprint import pprint

 
def publish_message(producer_instance, topic_name, key, data):
    """Send data to topic_name under key and print the outcome.

    key is converted to UTF-8 bytes before sending; data is handed to the
    producer unchanged. Any exception raised along the way is printed
    instead of being propagated to the caller.
    """
    try:
        #The producer is given the key as bytes, so encode it first
        encoded_key = bytes(key, 'utf-8')

        #Hand the record over to the producer
        producer_instance.send(topic_name, value=data, key=encoded_key)
        print(f'Message published successfully. {data!s}')
    except Exception as error:
        print('Exception in publishing message.')
        print(f'{error!s}')
        
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at 127.0.0.1:9092.

    Values are serialised as ASCII-encoded JSON before being sent.

    Returns the producer, or None if creating it raised an exception (the
    error is printed in that case).
    """
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                                  value_serializer=lambda x: dumps(x).encode('ascii'),
                                  api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
    #Return after the try/except instead of from a finally clause: a return
    #inside finally would also discard KeyboardInterrupt and SystemExit
    return _producer
    
#Format of the "date" field read from the historic collection and written
#to every streamed record
DATE_FORMAT = "%d/%m/%Y"


def find_latest_date(date_values):
    """Return the most recent date in date_values as a datetime, or None.

    Each value may be a datetime or a string in DATE_FORMAT. Strings are
    parsed before they are compared because dd/mm/YYYY strings do not sort
    chronologically (as text, "31/01/2021" comes after "01/12/2021").
    Values that are missing, of another type, or not parseable are ignored.
    """
    latest = None
    for value in date_values:
        if isinstance(value, str):
            try:
                value = dt.datetime.strptime(value, DATE_FORMAT)
            except ValueError:
                continue
        elif not isinstance(value, dt.datetime):
            continue
        if latest is None or value > latest:
            latest = value
    return latest


def build_climate_record(header, row, station_id):
    """Convert one CSV row into a climate document.

    The first eight entries of header are used as keys for the first eight
    entries of row; station_id is stored under "station".
    """
    return {
        #latitude
        header[0]: float(row[0]),
        #longitude
        header[1]: float(row[1]),
        #air_temperature_celcius
        header[2]: int(row[2]),
        #relative_humidity
        header[3]: float(row[3]),
        #windspeed_knots
        header[4]: float(row[4]),
        #max_wind_speed
        header[5]: float(row[5]),
        #precipitation (kept as text)
        header[6]: str(row[6]),
        #GHI_w/m2
        header[7]: int(row[7]),
        #station
        "station": station_id,
    }


if __name__ == '__main__':

    #Set topic
    topic = 'FIT3182_Assignment'
    #Set key as P1 for Producer 1
    key = "P1"
    print('Publishing records..')
    producer01 = connect_kafka_producer()
    if producer01 is None:
        #connect_kafka_producer has already printed the reason; without a
        #producer every publish would fail, so stop instead of looping
        raise SystemExit('Could not connect to Kafka, no records were published.')

    try:
        #initialise date as latest date in the climate data in Part A Task 2
        client = MongoClient()
        try:
            #Get DB
            db = client.fit3182_assignment_db
            #Get climate collection
            climate = db.climate_historic
            #Only the date field is needed; the latest date is picked
            #client-side so that string dates are compared chronologically
            latest_date = find_latest_date(
                doc.get("date")
                for doc in climate.find({}, {"date": 1, "_id": 0})
            )
        finally:
            client.close()

        if latest_date is None:
            raise SystemExit('No usable date found in climate_historic, '
                             'cannot choose a start date.')

        #array of three station ID's to choose from randomly
        station_ids = [948700, 948701, 948702]

        #open csv file containing climate data
        with open('climate_streaming.csv', newline='') as f:
            reader = csv.reader(f)
            #get the header (first row); None if the file is empty
            header = next(reader, None)
            #one document per non-blank row from row 2 onwards, each tagged
            #with a randomly chosen station
            climate_data = [
                build_climate_record(header, row, random.choice(station_ids))
                for row in reader
                if row
            ]

        #send the records in random order
        random.shuffle(climate_data)

        #send data while there are still records left
        current_day = latest_date
        for data in climate_data:
            #get date (every 10 seconds = 1 day)
            current_day = current_day + timedelta(days=1)
            current_date = current_day.strftime(DATE_FORMAT)
            #add current date field to climate data
            data["date"] = current_date

            #send document
            publish_message(producer01, topic, key, data)

            #wait 10 seconds
            sleep(10)

        #make sure everything handed to the producer has been sent
        producer01.flush()
    finally:
        producer01.close()

    
