In [0]:
# import time
# time.sleep(300) # Wait for 5 minutes

In [0]:
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType
from pyspark.sql.functions import col,from_json
import json


In [0]:
# Schema applied to the JSON acknowledgement payload when it is parsed with
# from_json; every field is nullable.
ack_schema = (
    StructType()
    .add("filename", StringType(), True)
    .add("http_response", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
    .add("http_message", StringType(), True)
)

In [0]:
## Configure the Azure Event Hubs source that the streaming reader connects to

## Eventhub details
# The connection string is read from the Databricks secret scope rather than
# being written into the notebook.
connection_string = dbutils.secrets.get("ingest00-meta002-sbox", "evh-joh-ack-dev-uks-dlrm-01-key")
consumer_group = "$Default"

# The reader options carry the connection string in encrypted form, produced
# by the JVM-side EventHubsUtils.encrypt helper.
encrypted_conn_str = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connection_string)

# Map partition ids "0".."9" to "-1" and serialise the mapping as JSON.
# NOTE(review): nothing reads this while the startingPositions entry below
# stays commented out; confirm the format the connector expects before
# enabling it.
starting_positions = dict.fromkeys(map(str, range(10)), "-1")
starting_positions_json = json.dumps(starting_positions)

ehConf = {}
ehConf["eventhubs.connectionString"] = encrypted_conn_str
ehConf["eventhubs.consumerGroup"] = consumer_group
# ehConf["eventhubs.startingPositions"] = starting_positions_json

# Streaming DataFrame over the Event Hub, built from the options above
eventhubdf = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

# eventhubdf.display()

In [0]:
# Service principal credentials, read from the Databricks secret scope
client_id = dbutils.secrets.get("ingest00-meta002-sbox", "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get("ingest00-meta002-sbox", "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get("ingest00-meta002-sbox", "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = "ingest00curatedsbox"      # holds the Delta table
checkpoint_storage = "ingest00xcuttingsbox"  # holds the streaming checkpoint

# Token endpoint of the tenant; identical for both storage accounts
token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

# Apply the same OAuth client-credentials settings to each storage account:
# curated first, then checkpoint.
for storage_account in (curated_storage, checkpoint_storage):
    dfs_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{dfs_host}", "OAuth")
    spark.conf.set(f"fs.azure.account.oauth.provider.type.{dfs_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{dfs_host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{dfs_host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{dfs_host}", token_endpoint)

In [0]:
# Delta table location: "silver" container of the curated storage account
data_path = (
    "abfss://silver@ingest00curatedsbox.dfs.core.windows.net"
    "/ARIADM/ARM/AUDIT/JOH/joh_ack_audit"
)

# Streaming checkpoint location: "db-ack-checkpoint" container of the
# xcuttings storage account
checkpoint_path = (
    "abfss://db-ack-checkpoint@ingest00xcuttingsbox.dfs.core.windows.net"
    "/ARMJOH/ACK/ack"
)


In [0]:
# 'body' is binary, so it is cast to string (assuming UTF-8) before parsing
ack_json_text = col("body").cast("string").alias("json_str")

# Parse the JSON text against ack_schema into a single struct column
ack_struct = from_json(col("json_str"), ack_schema).alias("json_obj")

# Flatten the struct so each schema field becomes a top-level column
parsed_df = (
    eventhubdf
    .select(ack_json_text)
    .select(ack_struct)
    .select("json_obj.*")
)


# parsed_df.display()

In [0]:
# ack_path = "/mnt/ingest00curatedsboxsilver/ARIADM/ARM/AUDIT/JOH/"

# Append the parsed acknowledgements to the Delta table at data_path, tracking
# progress in checkpoint_path.
#
# The availableNow trigger makes this a bounded run: the query processes the
# data available when it starts and then stops by itself, instead of running
# indefinitely with the default trigger. The handle is kept so the run can be
# awaited and inspected.
# NOTE(review): this assumes the notebook is meant to drain the hub and finish
# (later cells read the table and exit the notebook). Confirm that intent, and
# that the cluster's Spark version and the Event Hubs connector accept
# availableNow.
ack_query = (
    parsed_df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .trigger(availableNow=True)
    .start(data_path)
)

# Block until the bounded run has finished, so that code executed after this
# cell does not race the stream; a failure in the query is raised here.
ack_query.awaitTermination()

In [0]:
# dbutils.fs.ls(data_path)


In [0]:
# Batch-read the Delta table at data_path and render it in the notebook
df = spark.read.load(data_path, format="delta")
display(df)

In [0]:
# End the notebook run here, handing this message back as the notebook's exit
# value; cells below this point (the appendix) are not executed.
dbutils.notebook.exit("Notebook completed successfully")

## Appendix

In [0]:
# dbutils.fs.rm("/mnt/autoLoaderSchema/", True)

In [0]:
# from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# # Initialize Spark session (if needed)
# spark = SparkSession.builder.getOrCreate()

# # Define schema
# schema = StructType([
#     StructField("EmployeeID", IntegerType(), True),
#     StructField("Name", StringType(), True),
#     StructField("Department", StringType(), True),
#     StructField("Salary", IntegerType(), True)
# ])

# # Create dummy data
# data = [
#     (101, "Alice", "Engineering", 80000),
#     (102, "Bob", "HR", 50000),
#     (103, "Charlie", "Finance", 60000),
#     (104, "Diana", "Marketing", 70000)
# ]

# # Create DataFrame
# df = spark.createDataFrame(data, schema)

# # Show the DataFrame
# df.display()



In [0]:
# file_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/testtable"
# file_path

In [0]:
# test