## Generate Synthetic Data
Generate  synthetic streaming credit card transaction data for feature engineering and publish it to a kafka topic. 

### Prerequisites
- Kafka topic to write and read streaming data
  - Configure Kafka topic information in **utils/config.py** prior to running this notebook. 

In [0]:
dbutils.library.restartPython()
spark.conf.set("spark.sql.streaming.stateStore.providerClass", "com.databricks.sql.streaming.state.RocksDBStateStoreProvider")
spark.conf.set("spark.databricks.streaming.statefulOperator.asyncCheckpoint.enabled", "false")
spark.conf.set("spark.sql.streaming.stateStore.rocksdb.changelogCheckpointing.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", "8")



In [0]:
# Import required libraries
import logging
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Import utility modules
from utils.data_generator import TransactionDataGenerator
from utils.config import Config

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

INFO:dbldatagen._version:Version : VersionInfo(major='0', minor='4', patch='0', release='post', build='1')


## Step 1: Generate Streaming Transaction Data

Create a streaming source that continuously generates synthetic transactions.

In [0]:
#Get Config
config = Config()

# Generate streaming transaction data
data_gen = TransactionDataGenerator(spark)
df_transactions = data_gen.generate_transaction_data(
    num_users=config.data_gen_config["num_users"],              # unique users
    num_merchants=config.data_gen_config["num_merchants"],      # unique merchants
    rows_per_second=config.data_gen_config["rows_per_second"]     # transactions per second
)

INFO:utils.data_generator:Creating synthetic transaction source with dbldatagen...
INFO:utils.data_generator:   Rows: 1000
INFO:utils.data_generator:   Users: 10000, Merchants: 1000
INFO:utils.data_generator:Synthetic source created successfully


## Step 2: Write Stream Data to Kafka Topic

In [0]:
#Get Kafka Config
kafka_credentials_secrets = config.kafka_config["kafka_credentials_secrets"]
scope = kafka_credentials_secrets["scope"]

# Retrieve secrets from Databricks secret scope. 
# Note: Daatabricks secrets should be stored prior to using them and it's not covered in this example
# You can find more information about Databricks secrets here: https://docs.databricks.com/aws/en/security/secrets/#secrets-overview 
KAFKA_USERNAME = dbutils.secrets.get(scope = scope, key = kafka_credentials_secrets["username"])
KAFKA_SECRET = dbutils.secrets.get(scope = scope, key = kafka_credentials_secrets["secret"])
KAFKA_SERVER = dbutils.secrets.get(scope = scope, key = kafka_credentials_secrets["server"])
KAFKA_TOPIC = config.kafka_config["kafka_topic"]

#Data Generator Configuration
CHECKPOINT_BASE_PATH = config.kafka_config["checkpoint_base_path"]
CHECKPOINT_LOCATION = f"{CHECKPOINT_BASE_PATH}/transaction-data-generator-checkpoint"
#Since we generate data using synthentic dbldatagen tool, we need to delete old checkpoint directory everytime we run the notebook to avoid any conflicts
dbutils.fs.rm(CHECKPOINT_LOCATION, True) 

#Convert row to JSON String
json_df = df_transactions.select(to_json(struct(*[col(c) for c in df_transactions.columns])).alias("value"), col("transaction_id").alias("key"))

kafkaWriter = (
    json_df
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", f"kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username='{KAFKA_USERNAME}' password='{KAFKA_SECRET}';")
    .option("kafka.ssl.endpoint.identification.algorithm", "https")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("topic", KAFKA_TOPIC)
    .option("failOnDataLoss", "true")
    .option("checkpointLocation", CHECKPOINT_LOCATION)
)

#Start Kafka Producer
kafkaQuery = kafkaWriter.start()

print(f"""
[STREAMING] Kafka producer started successfully
  Query ID: {kafkaQuery.id}
  Query Name: {kafkaQuery.name}
  Status: {kafkaQuery.status}
  Kafka Topic: {KAFKA_TOPIC}
  Checkpoint: {CHECKPOINT_LOCATION}
  Serialization: Protobuf (Confluent Schema Registry)
""")


[STREAMING] Kafka producer started successfully
  Query ID: fb7aaf64-7725-478d-9f7f-22be9dc24167
  Query Name: None
  Status: {'message': 'Initializing sources', 'isDataAvailable': False, 'isTriggerActive': False}
  Kafka Topic: fraud_feature_eng_example
  Checkpoint: /Volumes/main/fraud_feature_eng_demo/default/checkpoints//transaction-data-generator-checkpoint
  Serialization: Protobuf (Confluent Schema Registry)



In [0]:
# Stop Kafka Writer
# if kafkaQuery.isActive:
#     kafkaQuery.stop()
#     logger.info("Streaming query stopped")

# logger.info("\nPipeline complete!")