In [1]:
!pip install confluent_kafka

Collecting confluent_kafka
  Downloading confluent_kafka-2.6.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (2.3 kB)
Downloading confluent_kafka-2.6.0-cp310-cp310-manylinux_2_28_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: confluent_kafka
Successfully installed confluent_kafka-2.6.0


In [2]:
# Show the installed confluent_kafka package metadata (version, install location).
!pip show confluent_kafka

Name: confluent-kafka
Version: 2.6.0
Summary: Confluent's Python client for Apache Kafka
Home-page: https://github.com/confluentinc/confluent-kafka-python
Author: Confluent Inc
Author-email: support@confluent.io
License: 
Location: /opt/conda/lib/python3.10/site-packages
Requires: 
Required-by: 


In [None]:
import json
import os
import pandas as pd
from confluent_kafka import Producer

from kaggle_secrets import UserSecretsClient
# Kaggle secrets client; the Confluent connection details below are fetched from it
# instead of being hard-coded in the notebook.
user_secrets = UserSecretsClient()

CONFLUENT_BOOTSTRAP_SERVER = user_secrets.get_secret("CONFLUENT_BOOTSTRAP_SERVER")
CONFLUENT_API_KEY = user_secrets.get_secret("CONFLUENT_API_KEY")
CONFLUENT_API_SECRET = user_secrets.get_secret("CONFLUENT_API_SECRET")

# Set up the Kafka producer config: SASL_SSL with the PLAIN mechanism, using the
# API key as the SASL username and the API secret as the SASL password.

conf = {
    "bootstrap.servers":CONFLUENT_BOOTSTRAP_SERVER,
    "security.protocol":"SASL_SSL",
    "sasl.mechanisms":"PLAIN",
    "sasl.username":CONFLUENT_API_KEY,
    "sasl.password":CONFLUENT_API_SECRET,
    "client.id":"json-serial-producer"
}
# Single module-level producer shared by the streaming code below.
producer = Producer(conf)

# Topic name created on Confluent Cloud
topic = "raw_topic"

#Delivery report callback
def delivery_report(err,msg):
    """Print the outcome of a single message delivery.

    Args:
        err: Delivery error; any falsy value means the message was delivered.
        msg: The message the report refers to; only ``msg.key()`` is read,
            and only on success.
    """
    if not err:
        delivered_key = msg.key()
        print(f"Message delivered successfully! Key: {delivered_key}")
        return
    print(f"Message delivery failed: {err}")

#Read checkpoint        
def read_checkpoint(checkpoint_file):
    """Return the line index stored in ``checkpoint_file``.

    Args:
        checkpoint_file: Path of the text file holding a single integer.

    Returns:
        The stored integer, or 0 when the file does not exist or contains
        only whitespace (e.g. a checkpoint that was truncated before the new
        value was written).

    Raises:
        ValueError: If the file has non-blank content that is not an integer.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, 'r') as file:
        content = file.read().strip()
    # An empty checkpoint carries no position; int('') would raise and make
    # every later run fail, so treat it the same as a missing file.
    if not content:
        return 0
    return int(content)

#Write checkpoint
def write_checkpoint(checkpoint_file,index):
    """Store ``index`` in ``checkpoint_file`` and print a confirmation.

    The value is first written to a temporary sibling file and then moved
    over the real checkpoint with ``os.replace``. Opening the checkpoint
    itself in ``'w'`` mode would truncate it before the new value is written,
    so an interruption in between would lose the stored position.

    Args:
        checkpoint_file: Path of the checkpoint file to (over)write.
        index: Value to store; written as ``str(index)``.
    """
    tmp_file = f"{checkpoint_file}.tmp"
    with open(tmp_file, 'w') as file:
        file.write(str(index))
    # Swap the finished file into place in a single step.
    os.replace(tmp_file, checkpoint_file)
    print(f"Checkpoint updated to line: {index}")
#Handle date    
def handle_date(obj):
    """``json.dumps`` ``default`` hook that renders pandas timestamps as text.

    Args:
        obj: The object ``json.dumps`` could not serialize on its own.

    Returns:
        ``obj`` formatted as ``YYYY-MM-DD HH:MM:SS`` when it is a
        ``pd.Timestamp``.

    Raises:
        TypeError: For any other type.
    """
    if not isinstance(obj, pd.Timestamp):
        type_name = type(obj).__name__
        raise TypeError(f"Object of type {type_name} is not JSON serializable")
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return obj.strftime(timestamp_format)
    
#Stream JSON serially
def stream_json_serially(file_path,checkpoint_file='/kaggle/working/checkpoint.txt'):
    """Send a JSON-lines file to Kafka one message at a time, with resume support.

    Lines before the index stored in ``checkpoint_file`` are skipped. Each
    remaining line is parsed, produced to the module-level ``topic`` through
    the module-level ``producer``, and flushed before the next line is read.
    The checkpoint is advanced only when no delivery error was reported for
    that message. On a delivery error the function stops without advancing
    the checkpoint, so a later run retries the same line instead of skipping it.

    Lines that are not valid JSON are reported and skipped.

    Args:
        file_path: Path to a text file with one JSON object per line. Each
            object must have a ``review_id`` key, which becomes the message key.
        checkpoint_file: Path of the file holding the index of the next line
            to send.

    Raises:
        KeyError: If a parsed record has no ``review_id`` key.
    """
    last_sent_index = read_checkpoint(checkpoint_file)
    print("last_sent_index: ", last_sent_index)

    delivery_errors = []

    def _track_delivery(err, msg):
        # Remember failures so the loop can decide whether to checkpoint,
        # then defer to the regular report for the printed message.
        if err:
            delivery_errors.append(err)
        delivery_report(err, msg)

    # Explicit encoding so the read does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for idx, line in enumerate(file):
            if idx < last_sent_index:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Failed to decode JSON: {e}")
                continue

            producer.produce(
                topic,
                key=str(record['review_id']),
                value=json.dumps(record, default=handle_date).encode('utf-8'),
                callback=_track_delivery
            )

            # Wait for this message's delivery report before moving on.
            producer.flush()

            if delivery_errors:
                # Leaving the checkpoint untouched keeps this line eligible
                # for re-sending on the next run.
                print(f"Stopping at line {idx}: delivery failed, checkpoint not advanced")
                return

            write_checkpoint(checkpoint_file, idx + 1)
                
# Entry point: stream the Yelp review dataset file, using the default checkpoint file.
if __name__ == "__main__":
    stream_json_serially('/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json')

last_sent_index:  0


%6|1730393425.717|GETSUBSCRIPTIONS|json-serial-producer#producer-1| [thrd:main]: Telemetry client instance id changed from AAAAAAAAAAAAAAAAAAAAAA to j+vsjfC6TgGSe2kg+Zrw1w


Message delivered successfully! Key: b'KU_O5udG6zpxOg-VcAEodg'
Checkpoint updated to line: 1
Message delivered successfully! Key: b'BiTunyQ73aT9WBnpR9DZGw'
Checkpoint updated to line: 2
Message delivered successfully! Key: b'saUsX_uimxRlCVr67Z4Jig'
Checkpoint updated to line: 3
Message delivered successfully! Key: b'AqPFMleE6RsU23_auESxiA'
Checkpoint updated to line: 4
Message delivered successfully! Key: b'Sx8TMOWLNuJBWer-0pcmoA'
Checkpoint updated to line: 5
Message delivered successfully! Key: b'JrIxlS1TzJ-iCu79ul40cQ'
Checkpoint updated to line: 6
Message delivered successfully! Key: b'6AxgBCNX_PNTOxmbRSwcKQ'
Checkpoint updated to line: 7
Message delivered successfully! Key: b'_ZeMknuYdlQcUqng_Im3yg'
Checkpoint updated to line: 8
Message delivered successfully! Key: b'ZKvDG2sBvHVdF5oBNUOpAQ'
Checkpoint updated to line: 9
Message delivered successfully! Key: b'pUycOfUwM8vqX7KjRRhUEA'
Checkpoint updated to line: 10
Message delivered successfully! Key: b'rGQRf8UafX7OTlMNN19I8A'
Checkp