In [0]:
from confluent_kafka import Producer
import json
from  itertools import islice
import numpy as np
from pyspark.sql.functions import col, decode, split, element_at,udf
import logging
from pyspark.sql.types import StructType, StructField, StringType
from pyspark import SparkContext


## Read in HTML, JSON and A360 files

In [0]:
# Fetch the Event Hubs namespace connection string from the Databricks secret
# scope; it is used below as the SASL password of the Kafka producer config.
eh_kv_secret = dbutils.secrets.get(scope="ingest00-meta002-sbox", key="EventHubNamespace-ConnStr")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, element_at
from pyspark.sql.types import StructType, StructField, StringType
from pyspark import SparkContext



# Event Hub configurations
# Endpoint (host:port) used as the Kafka bootstrap server.
eventhubs_hostname = "sbox-dlrm-eventhub-ns.servicebus.windows.net:9093"
# Settings passed to confluent_kafka.Producer inside process_partition.
conf = {
    'bootstrap.servers': eventhubs_hostname,
    'security.protocol': 'SASL_SSL',
    'sasl.mechanism': 'PLAIN',
    'sasl.username': '$ConnectionString',
    # The password comes from the secret scope (eh_kv_secret) so that no
    # connection string or access key is written into the notebook source.
    'sasl.password': eh_kv_secret,
    'retries': 5,                     # Retry count requested for failed sends
    'enable.idempotence': True,        # Enable idempotent producer
}
# Broadcast so that tasks can read the settings via broadcast_conf.value.
broadcast_conf = sc.broadcast(conf)



In [0]:
# Load every .html / .json / .a360 file below the appeals mount as binary rows
json_mount = '/mnt/ingest00curatedsboxgold/ARIADM/ARM/APPEALS/'
binary_df = (
    spark.read.format('binaryFile')
    .option('pathGlobFilter', '*.{html,json,a360}')
    .option('recursiveFileLookup', 'true')
    .load(json_mount)
)

# Decode the payload as UTF-8 text and keep only the last path segment
# (the file name) alongside it.
html_df = binary_df.select(
    decode(col('content'), 'utf-8').alias('content_str'),
    element_at(split(col('path'), '/'), -1).alias('file_path'),
)

html_df.display()

## Send to EventHubs

In [0]:
# Spread the rows over a fixed number of partitions before publishing;
# process_partition is later applied once per partition.
num_spark_partitions = 8
optimized_html_df = html_df.repartition(numPartitions=num_spark_partitions)

In [0]:
def process_partition(partition, topic='evh-apl-pub-dev-uks-dlrm-01', max_buffer_retries=5):
    """Publish every row of one Spark partition through a Kafka producer.

    Meant to be passed to ``rdd.mapPartitions``: a single producer is built
    per partition from ``broadcast_conf.value`` and each row is sent with its
    ``file_path`` as the message key and its ``content_str`` as the value.

    Args:
        partition: Iterator of rows exposing ``file_path`` and ``content_str``.
        topic: Name of the destination topic.
        max_buffer_retries: How many times a message is re-attempted after the
            producer raises ``BufferError`` (local queue full) before it is
            recorded as a failure.

    Returns:
        list[tuple[str, str, str]]: ``(file_name, status, error_message)``
        tuples, successes first and failures after. Rows whose ``file_path``
        or ``content_str`` is ``None`` are logged and left out.
    """
    import logging
    from confluent_kafka import Producer

    # Initialize logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('KafkaProducer')

    failure_list = []
    success_list = []

    # One producer per partition, configured from the broadcast settings
    producer = Producer(**broadcast_conf.value)

    def delivery_report(err, msg):
        """Delivery callback: file the message under success or failure."""
        key_str = msg.key().decode('utf-8') if msg.key() is not None else "Unknown"
        if err is not None:
            err_msg = str(err)
            logger.error(f"Message delivery failed for key {key_str}: {err}")
            failure_list.append((key_str, "failure", err_msg))
        else:
            success_list.append((key_str, "success", ""))

    def _as_bytes(content):
        """Return ``content`` as bytes, or None when its type is unsupported."""
        if isinstance(content, str):
            return content.encode('utf-8')
        if isinstance(content, bytearray):
            return bytes(content)
        if isinstance(content, bytes):
            return content
        return None

    for row in partition:
        if row.file_path is None or row.content_str is None:
            logger.warning(f"Skipping row with missing file_path/content_str: {row}")
            continue

        value = _as_bytes(row.content_str)
        if value is None:
            logger.error(f"Unsupported type for content_str: {type(row.content_str)}")
            failure_list.append((row.file_path, "failure", "Unsupported type"))
            continue

        queued = False
        try:
            key = row.file_path.encode('utf-8')
            # A full local queue is transient: poll to let queued messages
            # drain, then re-send THIS message. Polling without re-sending
            # would silently drop it from both result lists.
            for _attempt in range(max_buffer_retries + 1):
                try:
                    producer.produce(
                        topic=topic,
                        key=key,
                        value=value,
                        callback=delivery_report
                    )
                    queued = True
                    break
                except BufferError:
                    logger.error("Producer buffer full. Polling for events.")
                    producer.poll(1)
        except Exception as e:
            logger.error(f"Unexpected error during production: {e}")
            failure_list.append((row.file_path, "failure", str(e)))
            continue

        if queued:
            # Serve pending delivery callbacks so the queue keeps draining.
            producer.poll(0)
        else:
            logger.error(f"Giving up on {row.file_path}: producer buffer still full.")
            failure_list.append((row.file_path, "failure", "Producer buffer full"))

    try:
        remaining = producer.flush()
        if remaining:
            logger.error(f"{remaining} message(s) still queued after flush.")
        else:
            logger.info("Producer flushed successfully.")
    except Exception as e:
        logger.error(f"Unexpected error during flush: {e}")

    # A list (not a generator) so the callbacks have all fired before returning
    return success_list + failure_list

# Schema of the per-message delivery report returned by process_partition
schema = StructType([
    StructField("file_name", StringType(), True),
    StructField("status", StringType(), True),
    StructField("error_message", StringType(), True)
])

# Apply the processing once per partition (lazy until an action runs)
result_rdd = optimized_html_df.rdd.mapPartitions(process_partition)

# process_partition publishes messages as a side effect, and Spark re-runs a
# lazy plan for every action. Cache the report and materialise it exactly once
# so that later actions (display, filters in following cells) read the cached
# rows instead of publishing every file again.
# NOTE: caching is best effort; an evicted partition would be recomputed (and
# re-sent). Write the report to a table if it must be durable.
result_df = spark.createDataFrame(result_rdd, schema).cache()
print(f"Delivery reports collected: {result_df.count()}")

result_df.display()  # Debugging step to verify output

In [0]:
# Show only the messages whose delivery was reported as failed
failed_files = result_df.where(col("status") == "failure")
display(failed_files)


In [0]:
# display(dbutils.fs.ls("/mnt/dropzoneariab/ARIAB/submission"))




In [0]:
from pyspark.sql.functions import from_unixtime, lit

# Epoch milliseconds converted to seconds for from_unixtime
timestamp = 1739971552000 / 1000
datetime_str = from_unixtime(lit(timestamp)).cast("timestamp")

# datetime_str is a Column, i.e. an unevaluated expression; select it from a
# one-row DataFrame so that display() shows the computed value.
display(spark.range(1).select(datetime_str.alias("datetime")))

In [0]:
from pyspark.sql.functions import col, from_unixtime

# List the submission drop zone and turn the epoch-millisecond modification
# time into a timestamp column.
files_df = (
    spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariab/ARIAB/submission/"))
    .withColumn(
        "modificationTime",
        from_unixtime(col("modificationTime") / 1000).cast("timestamp"),
    )
)

# Most recently modified files first
display(files_df.sort(col("modificationTime").desc()))

In [0]:
from pyspark.sql.functions import col, from_unixtime

# List the response drop zone and turn the epoch-millisecond modification
# time into a timestamp column.
files_df = (
    spark.createDataFrame(dbutils.fs.ls("/mnt/dropzoneariab/ARIAB/response/"))
    .withColumn(
        "modificationTime",
        from_unixtime(col("modificationTime") / 1000).cast("timestamp"),
    )
)

# Most recently modified files first
display(files_df.sort(col("modificationTime").desc()))

In [0]:
# Show the raw directory listing of the ARIAFTA submission drop zone
display(dbutils.fs.ls("/mnt/dropzoneariafta/ARIAFTA/submission/"))

In [0]:
# Load every .rsp file below the response drop zone as binary rows
t_json_mount = '/mnt/dropzoneariab/ARIAB/response/'
t_binary_df = (
    spark.read.format('binaryFile')
    .option('pathGlobFilter', '*.rsp')
    .option('recursiveFileLookup', 'true')
    .load(t_json_mount)
)

# Decode the payload as UTF-8 text and keep only the last path segment
# (the file name) alongside it.
t_html_df = t_binary_df.select(
    decode(col('content'), 'utf-8').alias('content_str'),
    element_at(split(col('path'), '/'), -1).alias('file_path'),
)

display(t_html_df)

In [0]:
# Number of .rsp response files that were read
t_html_df.count()

In [0]:
# Show the raw directory listing of the ARIAB submission drop zone
display(dbutils.fs.ls("/mnt/dropzoneariab/ARIAB/submission/"))

In [0]:
# Fetch the notebook context object via dbutils and print it for inspection
context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
print(context)

In [0]:
# Read the browser host name from the notebook context tags and derive the
# workspace name from its first DNS label.
context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
workspace_host = context.tags().get("browserHostName").get()  # Example: 'adb-3635282203417052.12.azuredatabricks.net'
print(f"Workspace Host: {workspace_host}")  # the full host, not the name
workspace_name = workspace_host.split(".")[0]  # Extracts adb-3635282203417052
print(f"Workspace Name: {workspace_name}")



In [0]:
%sql
-- coalesce returns its first non-NULL argument, so this yields the current
-- timestamp formatted as yyyy-MM-dd'T'HH:mm:ss followed by a literal 'Z'.
-- NOTE(review): the 'Z' is a quoted literal, not a zone conversion; confirm
-- the session time zone is UTC if the value is meant to be UTC.
select coalesce(NULL, date_format(current_timestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'"))


In [0]:
from pyspark.sql.functions import current_timestamp, col

# current_timestamp must be called: withColumn needs a Column expression,
# not the function object itself.
current_datetime_df = spark.createDataFrame([(1,)], ["id"]).withColumn("current_datetime", current_timestamp())
display(current_datetime_df)