# In [None]:
import time
import datetime as dt
import random
import csv
from kafka3 import KafkaProducer
from json import dumps

# Kafka Configuration
# Host name of the Kafka broker; connect_kafka_producer() combines it with
# port 9092 to build the bootstrap server address.
hostip = "kafka"

def publish_message(producer_instance, topic_name, data):
    """Send one record to a Kafka topic and report the outcome on stdout.

    Args:
        producer_instance: Object exposing ``send(topic, value=...)``.
        topic_name: Name of the destination topic.
        data: Mapping of field name to value for a single record.

    Every field except ``'ts'`` is converted with ``str()`` before sending;
    ``'ts'`` is forwarded unchanged. Any exception raised while building,
    sending or reporting the payload is caught and printed, never propagated.

    NOTE(review): the success line is printed as soon as ``send()`` returns;
    confirm whether the broker has acknowledged the record at that point.
    """
    try:
        # Build the outgoing payload field by field; only 'ts' keeps its type.
        payload = {}
        for field, raw in data.items():
            payload[field] = raw if field == 'ts' else str(raw)

        producer_instance.send(topic_name, value=payload)
        print(f'Message published successfully to {topic_name}. Data: {payload}')
    except Exception as err:
        print(f'Exception in publishing message: {err!s}')

def connect_kafka_producer():
    """Build a KafkaProducer for the broker at ``hostip`` on port 9092.

    Outgoing values are JSON-encoded and then UTF-8 encoded to bytes.

    Returns:
        The producer instance, or ``None`` if construction raised; in that
        case the error is printed to stdout.
    """
    def _to_json_bytes(payload):
        # Serializer handed to the producer: dict -> JSON text -> UTF-8 bytes.
        return dumps(payload).encode('utf-8')

    try:
        return KafkaProducer(
            bootstrap_servers=['{}:9092'.format(hostip)],
            value_serializer=_to_json_bytes,
            api_version=(0, 10),
        )
    except Exception as err:
        print(f'Exception while connecting Kafka: {err!s}')
        return None

# Stamp every row of a browsing batch with a Unix timestamp inside a 5-second window.
def add_timestamps(batch, start_time):
    """Assign an integer ``'ts'`` to each row of ``batch``, in place.

    The batch is cut into five equal runs of ``len(batch) // 5`` rows; run
    ``i`` receives ``int(start_time.timestamp() + i)``. The ``len(batch) % 5``
    rows left at the end of the batch are stamped with offsets 0, 1, 2, ...
    from the same start, so they share seconds with the earliest runs.

    Args:
        batch: Indexable sequence of dict-like rows; mutated in place.
        start_time: ``datetime`` marking the first second of the window.

    Returns:
        The same ``batch`` object that was passed in.
    """
    per_second, leftover = divmod(len(batch), 5)
    base = dt.datetime.timestamp(start_time)

    # Full runs: second `offset` owns rows [offset * per_second, (offset + 1) * per_second).
    for offset in range(5):
        stamp = int(base + offset)
        first = offset * per_second
        for position in range(first, first + per_second):
            batch[position]['ts'] = stamp

    # Trailing rows that did not fit into a full run (at most four of them).
    tail_start = 5 * per_second
    for extra in range(leftover):
        batch[tail_start + extra]['ts'] = int(base + (extra % 5))

    return batch

# Helper: pull the next chunk of browsing rows without losing a short final chunk.
def _read_browsing_batch(reader, batch_size):
    """Return up to ``batch_size`` rows from ``reader``.

    The returned list is shorter than ``batch_size`` (possibly empty) only
    when the reader ran out of rows.
    """
    batch = []
    for _ in range(batch_size):
        row = next(reader, None)
        if row is None:
            break
        batch.append(row)
    return batch


# Helper: gather the transactions whose 'created_at' falls inside one browsing window.
def _collect_transactions(transaction_iterator, pending, min_event_time, max_event_time):
    """Scan transactions forward until one lies beyond ``max_event_time``.

    Args:
        transaction_iterator: Iterator yielding transaction rows (dicts).
        pending: Row read during a previous call but not yet handled, or None.
        min_event_time: Inclusive lower bound of the window.
        max_event_time: Inclusive upper bound of the window.

    Returns:
        ``(matched, carry_over, exhausted)`` where ``matched`` is the list of
        rows inside the window, ``carry_over`` is the first row found beyond
        the window (or None), and ``exhausted`` is True when the iterator ran
        out of rows. Rows earlier than ``min_event_time`` are skipped.
    """
    matched = []
    txn = pending if pending is not None else next(transaction_iterator, None)
    while txn is not None:
        created_at = txn['created_at']
        if created_at > max_event_time:
            # Belongs to a later window: hand it back instead of dropping it.
            return matched, txn, False
        if created_at >= min_event_time:
            matched.append(txn)
        txn = next(transaction_iterator, None)
    return matched, None, True


# Streaming function for browsing behavior and transactions
def stream_browsing_and_transactions(browsing_file, transactions_file, browsing_topic, transactions_topic):
    """Replay two CSV files into Kafka, batch by batch, roughly every 5 seconds.

    For each iteration a random batch of 500-1000 browsing rows is read,
    stamped with ``'ts'`` values via ``add_timestamps`` and published to
    ``browsing_topic``. Transactions whose ``'created_at'`` lies between the
    smallest and largest ``'event_time'`` of that batch are then published,
    unmodified, to ``transactions_topic``.

    Streaming stops when either file is exhausted. A short final browsing
    batch and the transactions collected before the transaction file ended
    are still published before stopping.

    Args:
        browsing_file: Path of the browsing-behaviour CSV (needs an
            ``event_time`` column).
        transactions_file: Path of the transactions CSV (needs a
            ``created_at`` column).
        browsing_topic: Kafka topic for browsing rows.
        transactions_topic: Kafka topic for transaction rows.

    Errors are printed rather than raised. The producer, if one was created,
    is always flushed and closed.

    NOTE(review): the window test compares the raw CSV strings, so it
    presumably relies on both columns using the same lexicographically
    sortable format and on the transactions file being ordered by
    ``created_at`` - confirm against the data.
    """
    # Bound before the try so the finally clause is safe even when open() fails.
    producer = None
    try:
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(browsing_file, mode='r', newline='') as browsing_f, \
                open(transactions_file, mode='r', newline='') as transactions_f:
            browsing_reader = csv.DictReader(browsing_f)
            transaction_iterator = iter(csv.DictReader(transactions_f))

            producer = connect_kafka_producer()
            if producer is None:
                # Without a producer every publish would fail; stop early
                # instead of sleeping through the whole file.
                print('Kafka producer unavailable; nothing was published.')
                return

            # First transaction read past the previous window, kept for the next one.
            pending_txn = None

            while True:
                # Random batch size between 500-1000
                batch_size = random.randint(500, 1000)
                browsing_batch = _read_browsing_batch(browsing_reader, batch_size)
                browsing_exhausted = len(browsing_batch) < batch_size

                if not browsing_batch:
                    print(f"End of browsing file reached for {browsing_topic}.")
                    break

                # Timezone-aware "now": a naive utcnow() value would be read as
                # local time by datetime.timestamp() and skew the Unix timestamps.
                start_time = dt.datetime.now(dt.timezone.utc)

                # Add timestamps to the browsing batch, spread across 5 seconds
                browsing_batch = add_timestamps(browsing_batch, start_time)

                # Event-time window covered by this browsing batch
                browsing_event_times = [row['event_time'] for row in browsing_batch]
                min_event_time = min(browsing_event_times)
                max_event_time = max(browsing_event_times)

                print(f"Processing browsing batch with event time range: {min_event_time} to {max_event_time}")

                # Publish browsing data to Kafka
                for record in browsing_batch:
                    publish_message(producer, browsing_topic, record)

                # Publish the transactions of the same window, without adding timestamps
                transaction_batch, pending_txn, transactions_exhausted = _collect_transactions(
                    transaction_iterator, pending_txn, min_event_time, max_event_time
                )
                for txn in transaction_batch:
                    publish_message(producer, transactions_topic, txn)

                if transactions_exhausted:
                    # NOTE(review): streaming stops here even if browsing rows
                    # remain, as before - confirm that this is intended.
                    print(f"End of transaction file reached for {transactions_topic}.")
                    break

                if browsing_exhausted:
                    print(f"End of browsing file reached for {browsing_topic}.")
                    break

                # Wait time for 5 seconds before publishing the next batch
                time.sleep(5)

    except Exception as e:
        print(f"Error processing files {browsing_file}, {transactions_file}: {e}")
    finally:
        if producer is not None:
            producer.flush()
            producer.close()

if __name__ == '__main__':
    # Input CSV files, resolved relative to the current working directory.
    browsing_file = "new_browsing_behaviour.csv"
    transactions_file = "new_transactions.csv"

    # Destination Kafka topics, one per input file.
    browsing_topic = 'browsing-behavior-topic'
    transactions_topic = 'transactions-topic'

    print('Publishing browsing behavior and transactions data to Kafka...')

    # Replay the browsing rows and publish the transactions that fall into
    # each browsing batch's event-time window.
    stream_browsing_and_transactions(
        browsing_file=browsing_file,
        transactions_file=transactions_file,
        browsing_topic=browsing_topic,
        transactions_topic=transactions_topic,
    )
