In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lower, regexp_extract, concat, lit, split, from_json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pathlib import Path
import json
from pathlib import Path
import requests

In [0]:
# JDBC connection settings for the "keycloak" PostgreSQL database.
# User name and password are fetched from the 'secret-storeview' secret scope,
# so no credential is written in the notebook itself.
JDBC_CONFIG = dict(
    hostname="psql-dn-keycloak-restore.postgres.database.azure.com",
    port=5432,
    database="keycloak",
    username=dbutils.secrets.get(scope='secret-storeview', key='username-keycloak-db'),
    password=dbutils.secrets.get(scope='secret-storeview', key='password-keycloak-db'),
    driver="org.postgresql.Driver",
)

# Connection URL; `sslmode=require` forces an encrypted connection.
jdbcUrl = "jdbc:postgresql://{hostname}:{port}/{database}?sslmode=require".format(**JDBC_CONFIG)

# Properties dict in the shape expected by Spark's JDBC reader/writer
# (property name -> value taken from JDBC_CONFIG).
connectionProperties = {
    prop_name: JDBC_CONFIG[config_key]
    for prop_name, config_key in (
        ("user", "username"),
        ("password", "password"),
        ("driver", "driver"),
    )
}


In [0]:
# Intentionally no read here.
# This cell used to run `spark.read.schema(schema).json(DATA_PATH)`, but both
# `schema` and `DATA_PATH` are only defined in the following cell, so a
# Restart & Run All failed here with NameError. The following cell performs
# exactly the same read right after defining them, so nothing is lost.


In [0]:
from pyspark.sql.functions import col, concat, from_json, lit, regexp_extract
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

# Glob over the files of one partition folder under the mounted storage.
DATA_PATH = "/mnt/adlsstoreview/realtimeprueba/keyclocklogs/y=2025/m=08/d=01/h=00/m=00/*json"
# Reference URL of one PT1H.json blob in the same container (a different day/hour folder):
# https://adlsstoreview.blob.core.windows.net/realtimeprueba/keyclocklogs/y=2025/m=08/d=12/h=02/m=00/PT1H.json

# Upper bound on the number of rows shown in the preview at the end of the cell.
MAX_RECORDS = 1000

# Expected schema: four nullable string columns plus a nullable timestamp.
# It is used twice below: to read the files and to parse the rebuilt JSON string.
schema = StructType(
    [StructField(name, StringType(), True) for name in ("Activity", "Email", "ItemId", "Item")]
    + [StructField("Time", TimestampType(), True)]
)

# NOTE(review): other cells of this notebook describe the raw log files with
# `time` / `resultDescription` columns and extract the payload from
# `resultDescription`. Confirm that the files under DATA_PATH really expose a
# top-level `Activity` column holding the `{...}` payload.

# Text found between the first "{" and the next "}" of the Activity column
# (empty string when there is no match).
activity_payload = regexp_extract(col("Activity"), "\\{(.*?)\\}", 1)

# Intermediate JSON string: the extracted payload with a "Time" attribute appended.
rebuilt_json = concat(lit("{"), activity_payload, lit(',"Time":"'), col("Time"), lit('"}'))

df = (
    spark.read.schema(schema).json(DATA_PATH)
    .filter(col("Activity").isNotNull())
    .withColumn("resultDescription_listado", rebuilt_json)
    # Parse the intermediate string with the same schema and flatten the struct.
    .select(from_json(col("resultDescription_listado"), schema).alias("parsed"))
    .select("parsed.*")
    # Keep only "View Report" events and drop exact duplicates.
    .filter(col("Activity") == "View Report")
    .distinct()
)

# Show only the first MAX_RECORDS rows.
df.limit(MAX_RECORDS).display()


In [0]:
# List the contents of the y=2025/m=08/d=01/h=16/m=00 partition folder on the mount.
partition_dir = "/mnt/adlsstoreview/realtimeprueba/keyclocklogs/y=2025/m=08/d=01/h=16/m=00"
dbutils.fs.ls(partition_dir)

In [0]:
from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Columns of the raw log records that are read as plain strings.
raw_string_columns = (
    "resultDescription",
    "resourceId",
    "level",
    "operationName",
    "containerId",
    "location",
    "category",
    "EventStampType",
    "EventPrimaryStampName",
    "EventStampName",
    "Host",
    "EventIpAddress",
)

# Raw-record schema: a nullable timestamp `time` followed by the nullable
# string columns above. Note that this rebinds the notebook-level name `schema`.
schema = StructType(
    [StructField("time", TimestampType(), True)]
    + [StructField(column, StringType(), True) for column in raw_string_columns]
)

# Folder of one hourly partition, addressed through the wasbs:// (Blob) endpoint.
path = "wasbs://realtimeprueba@adlsstoreview.blob.core.windows.net/keyclocklogs/y=2025/m=08/d=12/h=02/m=00/"

# recursiveFileLookup makes Spark pick up every file below `path`.
raw_reader = spark.read.option("recursiveFileLookup", "true").schema(schema)
df = raw_reader.json(path)
display(df)

In [0]:
# Read a single hourly log file through the abfss:// (ADLS Gen2) endpoint and
# let Spark infer its schema.
json_log_path = "abfss://realtimeprueba@adlsstoreview.dfs.core.windows.net/keyclocklogs/y=2025/m=08/d=12/h=02/m=00/PT1H.json"

# Account-key authentication for the storage account.
# NOTE(review): the value below is a placeholder and the read fails until it is
# replaced. Fetch the real key with dbutils.secrets.get(...) as the other cells
# do for their credentials; never paste the key into the notebook.
spark.conf.set("fs.azure.account.key.adlsstoreview.dfs.core.windows.net", "<YOUR_STORAGE_ACCOUNT_KEY>")

# Read with Spark's default line-delimited JSON mode. The other cells of this
# notebook treat these logs as one JSON object per line (default reader on the
# same folder, and a manual split on "\n" for the blob download). With
# multiLine=True Spark parses one record per file, so only a single record
# would be loaded from a line-delimited file.
df = spark.read.json(json_log_path)

# Show the inferred schema and the data.
df.printSchema()
df.display()

In [0]:
## VALIDATION TEST - ingesta_logs_blob
#
# Downloads a bounded sample of line-delimited JSON blobs from the container,
# extracts the `{...}` payload embedded in `resultDescription`, and keeps the
# distinct "View Report" events.

# Connection to Azure Blob Storage (connection string comes from the secret scope).
conn_string = dbutils.secrets.get(scope='secret-storeview', key='connectionstring-portalbi')
container_name = "insights-logs-appserviceconsolelogs"

# Load-control parameters: stop after MAX_BLOBS ".json" blobs or once
# MAX_RECORDS records have been parsed, whichever happens first.
MAX_BLOBS = 10
MAX_RECORDS = 1000
# Number of rows shown in the preview at the end of the cell.
PREVIEW_ROWS = 100

blob_service = BlobServiceClient.from_connection_string(conn_string)
container_client = blob_service.get_container_client(container_name)

data_list = []
blob_count = 0

for blob in container_client.list_blobs():
    if blob.name.endswith('.json'):
        blob_count += 1
        content = container_client.get_blob_client(blob).download_blob().content_as_text()
        # Each line of the blob is expected to be one JSON document.
        for entry in content.strip().split('\n'):
            if len(data_list) >= MAX_RECORDS:
                break
            try:
                data_list.append(json.loads(entry))
            except json.JSONDecodeError:
                # Best effort: skip lines that are not valid JSON. Only decode
                # errors are swallowed, so interrupts and real failures surface.
                continue
    if blob_count >= MAX_BLOBS or len(data_list) >= MAX_RECORDS:
        break

# Spark cannot infer a schema from an empty list; fail with an actionable message.
if not data_list:
    raise ValueError(
        f"No JSON records could be read from container '{container_name}' "
        f"({blob_count} .json blob(s) inspected)."
    )

# Schema of the payload embedded in `resultDescription`, plus the event time.
schema = StructType([
    StructField("Activity", StringType(), True),
    StructField("Email", StringType(), True),
    StructField("ItemId", StringType(), True),
    StructField("Item", StringType(), True),
    StructField("Time", TimestampType(), True)
])

df = spark.createDataFrame(data_list)
# Keep only records whose description mentions an Activity.
df = df.filter("resultDescription like '%Activity%'")
# Rebuild a JSON string: text between the first "{" and the next "}" of the
# description, with the record's `time` appended as a "Time" attribute.
df = df.withColumn("resultDescription_listado", concat(
    lit("{"),
    regexp_extract(col("resultDescription"), "\\{(.*?)\\}", 1),
    lit(',"Time":"'), col("time"), lit('"}')
))
# Parse the rebuilt string into typed columns and flatten the struct.
df = df.select(from_json(col("resultDescription_listado"), schema).alias("parsed")).select("parsed.*")
# Keep only "View Report" events and drop exact duplicates.
df = df.filter(col("Activity") == "View Report").distinct()

# Show a sample only (avoids rendering the whole result).
df.limit(PREVIEW_ROWS).display()




In [0]:
# Persist the current `df` as Delta files and expose them as a bronze table.
path_delta = "/mnt/bronze/delta/insights_logs_appserviceconsolelogs"

# NOTE(review): append mode adds the rows again on every run of this cell;
# confirm that duplicates in the bronze table are acceptable.
delta_writer = df.write.format("delta").mode("append")
delta_writer.save(path_delta)

# Register the location as an external table (no-op when it already exists),
# then refresh the table's cached metadata.
register_table_sql = f"CREATE TABLE IF NOT EXISTS bronze.insights_logs_appserviceconsolelogs USING DELTA LOCATION '{path_delta}/'"
spark.sql(register_table_sql)
spark.sql("REFRESH TABLE bronze.insights_logs_appserviceconsolelogs")

In [0]:
# Read the bronze table back in full to check what has been stored.
consulta = spark.sql(
    "select * from bronze.insights_logs_appserviceconsolelogs"
)
display(consulta)