In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lower, regexp_extract, concat, lit, split, from_json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pathlib import Path
import json
from pathlib import Path
import requests

In [0]:
# JDBC connection settings for the Keycloak PostgreSQL database.
# The user name and password are fetched from the Databricks secret scope
# so that no credential is written in the notebook itself.
JDBC_CONFIG = dict(
    hostname="psql-dn-keycloak-restore.postgres.database.azure.com",
    port=5432,
    database="keycloak",
    username=dbutils.secrets.get(scope='secret-storeview', key='username-keycloak-db'),
    password=dbutils.secrets.get(scope='secret-storeview', key='password-keycloak-db'),
    driver="org.postgresql.Driver",
)

# JDBC URL assembled from the host, port and database above; SSL is required.
jdbcUrl = "jdbc:postgresql://{hostname}:{port}/{database}?sslmode=require".format(**JDBC_CONFIG)

# Properties dict in the shape expected by Spark's JDBC reader/writer:
# each property name is mapped to the JDBC_CONFIG entry that supplies it.
connectionProperties = {
    prop_name: JDBC_CONFIG[config_key]
    for prop_name, config_key in (
        ("user", "username"),
        ("password", "password"),
        ("driver", "driver"),
    )
}

In [0]:
from pyspark.sql import SparkSession
from azure.storage.blob import BlobServiceClient

# Reuse the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.appName("JsonBlobReader").getOrCreate()

# Storage account connection.
# The connection string embeds a SAS token, so it must never be written in the
# notebook; it is read from the same Databricks secret scope that holds the
# database credentials.
# NOTE(review): the secret 'connection-string-adlsstoreview' has to be created
# in the 'secret-storeview' scope before this cell runs, and the SAS token that
# was previously hard-coded here should be revoked/rotated.
connection_string = dbutils.secrets.get(scope='secret-storeview', key='connection-string-adlsstoreview')
container_name = "realtimeprueba"
storage_account = "adlsstoreview"

# Name fragments inside the container (these are blob names, not /mnt paths).
# The first occurrence of `source_prefix` in a blob name is replaced by
# `target_prefix` to build the name of the converted copy.
source_prefix = "PT1H"
target_prefix = "PT1H_block"

# Azure Blob clients for the account and the container.
blob_service = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service.get_container_client(container_name)

# AppendBlob -> BlockBlob conversion: every append blob in the container is
# downloaded and re-uploaded as a block blob under the target name.
for blob in container_client.list_blobs():
    # The listing already reports each blob's type, so no extra
    # get_blob_properties() round trip per blob is needed.
    if blob.blob_type != "AppendBlob":
        continue

    print(f"Convirtiendo AppendBlob → BlockBlob: {blob.name}")
    blob_client = container_client.get_blob_client(blob.name)
    data = blob_client.download_blob().readall()

    # Force a .json extension so Spark can read the converted file.
    new_name = blob.name.replace(source_prefix, target_prefix, 1)
    if not new_name.endswith(".json"):
        new_name = new_name + ".json"

    container_client.upload_blob(
        name=new_name,
        data=data,
        blob_type="BlockBlob",
        overwrite=True
    )


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType


# Schema describing the top-level fields of the log records. Every field is a
# nullable string ("time" could be declared as TimestampType() instead if it
# should be converted).
# NOTE(review): json_schema is not passed to the reader below, so the
# DataFrame schema is inferred by Spark from the file.
json_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "time",
        "resultDescription",
        "resourceId",
        "level",
        "operationName",
        "containerId",
        "location",
        "category",
        "EventStampType",
        "EventPrimaryStampName",
        "EventStampName",
        "Host",
        "EventIpAddress",
    )
])

# Load the converted block-blob JSON file from the mounted container.
raw_log_path = "/mnt/adlsstoreview/realtimeprueba/keyclocklogs/y=2025/m=08/d=01/h=00/m=00/PT1H_block.json"
df = spark.read.json(raw_log_path)

# Preview the first rows only.
display(df.limit(10))

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import regexp_extract, from_json, col

# Schema of the JSON object embedded in the resultDescription text.
inner_schema = (
    StructType()
    .add("Activity", StringType(), True)
    .add("Email", StringType(), True)
    .add("Time", StringType(), True)
    .add("DurationInSecond", DoubleType(), True)
    .add("ItemId", StringType(), True)
)

# Keep only the JSON portion of resultDescription: the regex captures from the
# first "{" to the last "}" on the line (empty string when there is no match).
embedded_json = regexp_extract(col("resultDescription"), r"(\{.*\})", 1)
df_clean = df.withColumn("result_json", embedded_json)

# Columns lifted out of the parsed struct. The inner "Time" field is exposed
# as "InnerTime"; the other fields keep their own names.
extracted_columns = [
    col("parsed.Activity").alias("Activity"),
    col("parsed.Email").alias("Email"),
    col("parsed.Time").alias("InnerTime"),
    col("parsed.DurationInSecond").alias("DurationInSecond"),
    col("parsed.ItemId").alias("ItemId"),
]

# Parse the extracted JSON, expand its fields next to the original columns,
# then discard the two helper columns.
df_with_struct = df_clean.withColumn("parsed", from_json(col("result_json"), inner_schema))
df_parsed = df_with_struct.select("*", *extracted_columns).drop("parsed", "result_json")

display(df_parsed.limit(10))

In [0]:
# Persist the parsed logs as a Delta dataset and expose it as a table.

path_delta = "/mnt/calidad_datos/Bronze/Ingesta/delta/carga_json_keycloak"

# Each run replaces the data already stored at the Delta location.
delta_writer = df_parsed.write.format("delta").mode("overwrite")
delta_writer.save(path_delta)

# Register the location as bronze.carga_json_keycloak (no-op when the table
# already exists), then refresh the table.
create_table_sql = f"CREATE TABLE IF NOT EXISTS bronze.carga_json_keycloak USING DELTA LOCATION '{path_delta}/'"
spark.sql(create_table_sql)
spark.sql("REFRESH TABLE bronze.carga_json_keycloak")

In [0]:
# Read the bronze table back to check what was written.
select_all_sql = "select * from bronze.carga_json_keycloak"
consulta = spark.sql(select_all_sql)
display(consulta)