# Simulating real-time data using Apache Kafka Producers.

In this section we implement multiple Apache Kafka producers to simulate real-time streaming of data, which is then processed by an Apache Spark Streaming client and inserted into MongoDB.

Description: A Python program that loads all the data from hotspot_TERRA_streaming.csv and feeds one randomly chosen record to the stream every 10 to 30 seconds. TERRA is another NASA satellite that reports the latitude, longitude, confidence and surface temperature of a location. Two additional fields, sender_id and created_at (the time the record was created), are appended to each record.

In [None]:
# importing required libraries
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import datetime as dt
import pandas as pd


# Load the TERRA hotspot readings (hotspot_TERRA_streaming.csv) with pandas and
# cast every column to the generic "object" dtype so that the row values can be
# converted to JSON when a record is published.
df = pd.read_csv("hotspot_TERRA_streaming.csv").astype("object")


def publish_message(producer_instance, topic_name, data, timeout=10):
    """Publish one record to a Kafka topic and report the outcome on stdout.

    Args:
        producer_instance: Producer object exposing ``send(topic, value=...)``
            that returns a future with a ``get(timeout=...)`` method (as
            kafka-python's ``KafkaProducer`` does).
        topic_name: Name of the topic to publish to.
        data: The record to send; it is passed to the producer unchanged.
        timeout: Maximum number of seconds to wait for the send to complete.

    Returns:
        None. Failures are reported by printing, never by raising, so that a
        long-running caller keeps going after a failed send.
    """
    try:
        # send() only queues the record and returns a future; waiting on the
        # future is what surfaces delivery errors. Without it the success
        # message below would be printed even for records that never arrive.
        future = producer_instance.send(topic_name, value=data)
        future.get(timeout=timeout)
    except Exception as ex:
        # Deliberately broad: this is the reporting boundary of a best-effort
        # simulator, and one failed record must not stop the stream.
        print('Exception in publishing message.')
        print(str(ex))
    else:
        print('Message published successfully. Data: ' + str(data))
        
def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``localhost:9092``.

    Values handed to the producer are serialized by dumping them to JSON and
    encoding the text as ASCII. The client API version is pinned to (0, 10).

    Returns:
        The connected ``KafkaProducer``, or ``None`` if creating it raised an
        exception (the error is printed to stdout in that case).
    """
    # Return from the try/except branches rather than from a ``finally``
    # clause: a ``return`` inside ``finally`` discards any in-flight
    # exception, including KeyboardInterrupt and SystemExit.
    try:
        return KafkaProducer(
            bootstrap_servers=['localhost:9092'],
            value_serializer=lambda x: dumps(x).encode('ascii'),
            api_version=(0, 10),
        )
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None
    
def _build_record(row, sender_id):
    """Turn one dataset row into the dict that is published to Kafka.

    Args:
        row: A pandas Series holding one row of the dataset.
        sender_id: Identifier of the producer that sends the record.

    Returns:
        A dict with the row's fields followed by ``created_at`` (the current
        local time formatted with ``%X``) and ``sender_id``.
    """
    # Series.append was removed in pandas 2.0; build the dict directly instead.
    record = row.to_dict()
    record["created_at"] = dt.datetime.now().strftime("%X")
    record["sender_id"] = sender_id
    return record


if __name__ == '__main__':

    topic = 'hotspot'  # to create partitioning of data based on this topic in Kafka cluster
                       # all three producers belong to the same topic 'hotspot'
    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # Without a producer every send would fail; stop instead of looping
        # forever printing the same error.
        raise SystemExit('Could not connect to Kafka; no records were published.')

    try:
        if df.empty:
            raise SystemExit('The dataset is empty; there is nothing to publish.')

        while True:
            # Pick a random row by position, so this also works if the
            # DataFrame index is not 0..len(df)-1.
            row = df.iloc[random.randrange(len(df))]
            # Appending sender_id and created_at:
            data = _build_record(row, "producer_3")
            publish_message(producer, topic, data)
            # To feed the data to the stream every 10 to 30 seconds:
            sleep(random.randint(10, 30))
    except KeyboardInterrupt:
        # Ctrl-C (or interrupting the notebook kernel) is the normal way to
        # stop the simulation.
        print('Stopping producer.')
    finally:
        # Release the producer's network resources on the way out.
        producer.close()

Publishing records..
Message published successfully. Data: {'confidence': 66, 'created_at': '19:48:29', 'surface_temperature_celcius': 53, 'longitude': 142.1573, 'latitude': -37.7069, 'sender_id': 'producer_3'}
Message published successfully. Data: {'confidence': 76, 'created_at': '19:48:51', 'surface_temperature_celcius': 50, 'longitude': 141.6103, 'latitude': -37.7126, 'sender_id': 'producer_3'}
Message published successfully. Data: {'confidence': 100, 'created_at': '19:49:08', 'surface_temperature_celcius': 95, 'longitude': 141.0013, 'latitude': -36.9034, 'sender_id': 'producer_3'}
Message published successfully. Data: {'confidence': 72, 'created_at': '19:49:23', 'surface_temperature_celcius': 56, 'longitude': 147.0408, 'latitude': -38.3525, 'sender_id': 'producer_3'}
Message published successfully. Data: {'confidence': 87, 'created_at': '19:49:50', 'surface_temperature_celcius': 63, 'longitude': 145.7664, 'latitude': -36.1036, 'sender_id': 'producer_3'}
Message published successful

Message published successfully. Data: {'confidence': 100, 'created_at': '20:04:02', 'surface_temperature_celcius': 84, 'longitude': 148.08100000000002, 'latitude': -37.399, 'sender_id': 'producer_3'}
Message published successfully. Data: {'confidence': 77, 'created_at': '20:04:18', 'surface_temperature_celcius': 50, 'longitude': 143.4714, 'latitude': -37.5304, 'sender_id': 'producer_3'}
