# 1. Producing the data
In this task, we will implement Apache Kafka producers to simulate real-time data streaming. Spark is not allowed in this part since it’s simulating a streaming data source.  

1.	Your program should send one batch of browsing behaviour data every 5 seconds. One batch consists of a random 500-1000 rows from the browsing behaviour dataset. To conserve memory, the CSV should not be loaded into memory all at once (i.e. read rows only as needed). Keep track of the start and end event_time. (You can assume the dataset is sorted by event_time.)  
2.	Add an integer column named ‘ts’ for each row, a Unix timestamp in seconds since the epoch. Spread your batch out evenly over the 5 seconds.  
a.	For example, if you send a batch of 600 records at 2023-09-01 00:00:00 (ISO format: YYYY-MM-DD HH:MM:SS) -> (ts = 1693526400):  
-	Record 1-120: ts = 1693526400   
-	Record 121-240: ts = 1693526401   
-	Record 241-360: ts = 1693526402  
-	….  
3.	Read the transactions between the start and end event_time in 1.1 every 5 seconds (the same frequency as browsing behaviour) and create a batch.  
4.	Send your two batches from 1.1 and 1.3 to Kafka topics with an appropriate name.  
Note 1: In 1.1, “random 500-1000” means the number of rows is random, and the data file is still read sequentially.  
Note 2: All the data except for the ‘ts’ column should be sent in the original String type without changing to any other type. This is because we are simulating a streaming access log and need to reduce the required processing at the source.


In [1]:
import csv
import itertools
import json
import random
import time
from datetime import datetime

import pandas as pd
from kafka3 import KafkaProducer

In [2]:
# Kafka Producer Configuration
# producer = KafkaProducer(bootstrap_servers='localhost:9092', 
#                          value_serializer=lambda v: str(v).encode('utf-8'))
hostip = "kafka"  # host name used to build the bootstrap server address


def connect_kafka_producer():
    """Create a KafkaProducer for the broker at ``hostip`` on port 9092.

    Returns:
        The producer instance, or ``None`` when it could not be created.
        In that case the error is printed instead of being raised.
    """
    # Return from the try/except branches rather than from a ``finally``
    # clause: a ``return`` inside ``finally`` silently discards any in-flight
    # exception, including KeyboardInterrupt and SystemExit.
    try:
        return KafkaProducer(bootstrap_servers=[f'{hostip}:9092'], api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None


In [None]:
def read_data_in_batches(file_path):
    """Lazily yield the CSV at *file_path* as consecutive batches of rows.

    The file is read sequentially and only the rows of the batch currently
    being assembled are held in memory. A new batch size between 500 and
    1000 rows (inclusive) is drawn for every batch; the last batch contains
    whatever rows remain and may therefore be smaller.

    Args:
        file_path: Path of a CSV file whose first row is the header.

    Yields:
        ``(header, rows)`` tuples. ``header`` is the list of column names and
        ``rows`` is a non-empty list of rows, each a list of strings exactly
        as produced by ``csv.reader``.
    """
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            # Empty file. A bare next() here would raise StopIteration inside
            # the generator, which Python turns into a RuntimeError.
            return

        while True:
            # Draw the size per batch so that batch sizes actually vary.
            batch_size = random.randint(500, 1000)
            rows = list(itertools.islice(reader, batch_size))
            if not rows:
                break  # end of file
            yield header, rows
# Function to read transactions in a time range
def read_transactions_in_timerange(trans_path, start_time, end_time):
    """Yield the transactions whose event time lies in [start_time, end_time].

    The file is scanned from the top on every call and matching rows are
    yielded lazily. The event time is taken from the first column of each row
    and parsed with the format ``'%Y-%m-%d %H:%M:%S.%f'``; all yielded values
    stay as the original strings.

    Args:
        trans_path: Path of the transactions CSV file (first row = header).
        start_time: Inclusive lower bound, a ``datetime``.
        end_time: Inclusive upper bound, a ``datetime``.

    Yields:
        One dict per matching row, mapping column name to the raw string.

    Raises:
        ValueError: If a row's first column does not match the time format.
    """
    with open(trans_path, 'r', newline='') as file:
        # Use csv.reader to handle the CSV format correctly
        reader = csv.reader(file)
        header = next(reader, None)
        if header is None:
            return  # empty file: nothing to yield

        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            event_time = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S.%f')
            if start_time <= event_time <= end_time:
                yield dict(zip(header, row))
                
# Function to produce batches of data to Kafka
def produce_batches(producer, browsing_path, browsing_topic_name, trans_path, trans_topic_name,
                    *, batch_interval=5):
    """Stream the browsing file to Kafka batch by batch, with matching transactions.

    For every batch returned by ``read_data_in_batches``:

    1. The event times of the first and last row (third column) give the
       batch's time range.
    2. Each browsing row is sent as a JSON object with an extra integer
       ``ts`` field. ``ts`` starts at the Unix time at which the batch is
       sent and the rows are spread evenly over 5 consecutive seconds.
    3. Every transaction whose event time lies in that range is sent as a
       JSON object with an extra integer ``trans_ts`` field (the Unix time
       at which it is sent).
    4. The producer is flushed and the function waits until
       ``batch_interval`` seconds have passed since the batch started.

    All original columns are sent as the unchanged strings read from the CSV.
    The function returns once the browsing file is exhausted.

    Args:
        producer: Object with ``send(topic, value=...)`` and ``flush()``.
        browsing_path: Path of the browsing behaviour CSV.
        browsing_topic_name: Topic that receives the browsing rows.
        trans_path: Path of the transactions CSV.
        trans_topic_name: Topic that receives the transactions.
        batch_interval: Seconds from the start of one batch to the start of
            the next. Use 0 to send without pausing.
    """
    spread_seconds = 5  # rows of one batch are spread over this many seconds
    time_format = '%Y-%m-%d %H:%M:%S.%f'

    for header, rows in read_data_in_batches(browsing_path):
        batch_started = time.monotonic()

        # Get the start and end event_time of this batch
        start_time = datetime.strptime(rows[0][2].strip('"'), time_format)
        end_time = datetime.strptime(rows[-1][2].strip('"'), time_format)

        # Take the clock once per batch so that a second boundary passing
        # while the rows are being sent cannot shift later rows.
        base_ts = int(datetime.now().timestamp())
        row_count = len(rows)

        for i, row in enumerate(rows):
            row_dict = dict(zip(header, row))
            # i * 5 // n always lies in 0..4, also when n is not a multiple
            # of 5, and never divides by zero for batches of fewer than 5 rows.
            row_dict['ts'] = base_ts + (i * spread_seconds) // row_count
            producer.send(browsing_topic_name, value=json.dumps(row_dict).encode('utf-8'))

        # Read and send transactions in the batch's time range
        for transaction in read_transactions_in_timerange(trans_path, start_time, end_time):
            transaction['trans_ts'] = int(datetime.now().timestamp())
            producer.send(trans_topic_name, value=json.dumps(transaction).encode('utf-8'))

        # send() only queues the records; flush so the batch is actually
        # handed to the broker before waiting.
        producer.flush()

        # Keep the cadence of one batch per interval, discounting the time
        # already spent reading and sending.
        remaining = batch_interval - (time.monotonic() - batch_started)
        if remaining > 0:
            time.sleep(remaining)
        
# Main execution
if __name__ == "__main__":
    producer = connect_kafka_producer()
    if producer is None:
        # connect_kafka_producer() prints the error and returns None on
        # failure; stop here instead of failing later on producer.send().
        raise SystemExit('Could not connect to Kafka; no data was sent.')

    browsing_path = 'new_browsing_behaviour.csv'  # Path to your browsing behavior CSV file
    trans_path = 'new_transactions.csv'  # Path to your transactions CSV file
    browsing_topic_name = 'browsing_behaviour_topic'  # Kafka topic for browsing behavior
    trans_topic_name = 'trans_topic'  # Kafka topic for transactions

    try:
        # Each pass streams the whole browsing file; after it is exhausted
        # the loop waits 5 seconds and starts again from the top of the file.
        while True:
            produce_batches(producer, browsing_path, browsing_topic_name, trans_path, trans_topic_name)
            time.sleep(5)
    except KeyboardInterrupt:
        print('Producer stopped by user.')
    finally:
        # Release the connection on any exit path.
        producer.close()