In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lower, regexp_extract, concat, lit, split, from_json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pathlib import Path
import json
from pathlib import Path
import requests

In [0]:
from pyspark.sql import SparkSession
from azure.storage.blob import BlobServiceClient, BlobClient
import os

# Create the SparkSession if it does not exist yet (Databricks already provides `spark`).
spark = SparkSession.builder.appName("JsonBlobReader").getOrCreate()

# Storage Account connection string, read from Databricks Secrets.
connection_string = dbutils.secrets.get(scope='secret-storeview', key='connectionstring-portalbi')
container_name = "insights-logs-appserviceconsolelogs"

# Path prefixes inside the container.
source_prefix = "raw/json_files/"        # source path holding the AppendBlobs
target_prefix = "processed/json_files/"  # destination path for the BlockBlob copies

# Azure Blob clients.
blob_service = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service.get_container_client(container_name)

# Copy every AppendBlob under source_prefix to a BlockBlob under target_prefix.
for blob in container_client.list_blobs(name_starts_with=source_prefix):
    # The listing already reports each blob's type, so there is no need for an
    # extra get_blob_properties() round trip per blob.
    if blob.blob_type != "AppendBlob":
        continue

    print(f"Convirtiendo AppendBlob â†’ BlockBlob: {blob.name}")

    # Download the whole AppendBlob into memory as bytes.
    blob_client = container_client.get_blob_client(blob)
    data = blob_client.download_blob().readall()

    # Swap only the leading prefix so the sub-folder hierarchy is preserved.
    new_name = blob.name.replace(source_prefix, target_prefix, 1)

    # Re-upload the same bytes as a BlockBlob, replacing any previous copy.
    container_client.upload_blob(
        name=new_name,
        data=data,
        blob_type="BlockBlob",
        overwrite=True
    )

print("âœ… Todos los AppendBlob fueron convertidos a BlockBlob en el nuevo path")

# Read the converted JSON files with PySpark.
storage_account = "yourstorageaccount"  # placeholder: replace with the real account name
df = spark.read.json(
    f"wasbs://{container_name}@{storage_account}.blob.core.windows.net/{target_prefix}"
)

df.printSchema()
df.show(5, truncate=False)


In [0]:
# JDBC connection settings for the Keycloak PostgreSQL database.
# User name and password are pulled from the Databricks secret scope, not hardcoded.
JDBC_CONFIG = dict(
    hostname="psql-dn-keycloak-restore.postgres.database.azure.com",
    port=5432,
    database="keycloak",
    username=dbutils.secrets.get(scope='secret-storeview', key='username-keycloak-db'),
    password=dbutils.secrets.get(scope='secret-storeview', key='password-keycloak-db'),
    driver="org.postgresql.Driver",
)

# JDBC URL assembled from the settings above; SSL is required via the query string.
jdbcUrl = "jdbc:postgresql://{hostname}:{port}/{database}?sslmode=require".format(**JDBC_CONFIG)

# Credentials and driver class regrouped under the keys "user" / "password" / "driver".
# NOTE(review): presumably passed to a Spark JDBC read/write elsewhere; neither this
# dict nor jdbcUrl is used in the visible cells -- confirm.
connectionProperties = dict(
    user=JDBC_CONFIG["username"],
    password=JDBC_CONFIG["password"],
    driver=JDBC_CONFIG["driver"],
)



In [0]:
from azure.storage.blob import BlobClient
import json
 
# Settings. All four values are placeholders that must be filled in before running.
# NOTE(review): the account key is written inline here; other cells in this notebook
# read their credentials from dbutils.secrets -- consider doing the same.
account_name = "<storage_account>"
account_key = "<correct_base64_encoded_storage_key>"
container_name = "<container>"
blob_name = "<append_blob.json>"

# Client for the single target blob, authenticated with the account key.
account_url = f"https://{account_name}.blob.core.windows.net"
url = f"{account_url}/{container_name}/{blob_name}"  # full blob URL; not used again in this cell
blob = BlobClient(
    account_url=account_url,
    container_name=container_name,
    blob_name=blob_name,
    credential=account_key,
)

# Download the entire blob and decode its bytes as UTF-8 text.
data = (
    blob.download_blob()
    .readall()
    .decode("utf-8")
)

# Hand the text to Spark as a one-element RDD and preview the parsed result.
df = spark.read.json(spark.sparkContext.parallelize([data]))
df.show(5)

In [0]:
from azure.storage.blob import BlobServiceClient
import os
 
# Storage Account connection string, read from Databricks Secrets.
connection_string = dbutils.secrets.get(scope='secret-storeview', key='connectionstring-portalbi')
# NOTE(review): container_name is NOT assigned in this cell (an assignment to
# "insights-logs-appserviceconsolelogs" was left commented out), so the value comes
# from whatever an earlier cell left in the kernel. Confirm which container is intended.

# Source path (the AppendBlobs).
source_prefix = "raw/json_files/"
# Destination path (new BlockBlob copies).
target_prefix = "processed/json_files/"

blob_service = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service.get_container_client(container_name)

for blob in container_client.list_blobs(name_starts_with=source_prefix):
    # The listing already reports each blob's type, so there is no need for an
    # extra get_blob_properties() round trip per blob.
    if blob.blob_type != "AppendBlob":
        continue

    print(f"Convirtiendo: {blob.name}")

    # Download the whole AppendBlob into memory as bytes.
    blob_client = container_client.get_blob_client(blob)
    data = blob_client.download_blob().readall()

    # Destination name: swap only the leading prefix so sub-folders are preserved.
    new_name = blob.name.replace(source_prefix, target_prefix, 1)

    # Re-upload the same bytes as a BlockBlob, replacing any previous copy.
    container_client.upload_blob(
        name=new_name,
        data=data,
        blob_type="BlockBlob",
        overwrite=True
    )

print("âœ… Todos los AppendBlob fueron convertidos a BlockBlob en el nuevo path")

# Read the converted files with PySpark.
# TODO: "<storage_account>" below is a literal placeholder inside the path, not an
# interpolated variable; replace it with the real account name before running.
df = spark.read.json(f"wasbs://{container_name}@<storage_account>.blob.core.windows.net/{target_prefix}")
df.show()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pyspark.sql.functions import col, concat, lit, regexp_extract, from_json

# One hourly log file in ADLS Gen2 (abfss). The same object through the blob endpoint:
# https://adlsstoreview.blob.core.windows.net/realtimeprueba/keyclocklogs/y=2025/m=08/d=12/h=02/m=00/PT1H.json
DATA_PATH = "abfss://realtimeprueba@adlsstoreview.dfs.core.windows.net/keyclocklogs/y=2025/m=08/d=12/h=02/m=00/PT1H.json"

# Upper bound on the number of rows displayed at the end of the cell.
MAX_RECORDS = 1000

# Expected schema: four nullable string fields plus a nullable timestamp.
schema = StructType(
    [StructField(name, StringType(), True) for name in ("Activity", "Email", "ItemId", "Item")]
    + [StructField("Time", TimestampType(), True)]
)

# Intermediate JSON string: the first {...} group found in "Activity", re-wrapped in
# braces with the row's "Time" value appended as an extra "Time" key.
activity_json = concat(
    lit("{"),
    regexp_extract(col("Activity"), "\\{(.*?)\\}", 1),
    lit(',"Time":"'), col("Time"), lit('"}'),
)

# Earlier attempts globbed f"{DATA_PATH}/*.json"; DATA_PATH names a single file, so it
# is read directly.
# NOTE(review): the schema is applied at read time, which assumes the file's top-level
# fields are Activity/Email/ItemId/Item/Time -- confirm against the raw file.
df = (
    spark.read.schema(schema).json(DATA_PATH)
    # Keep only rows that actually carry an Activity value.
    .filter(col("Activity").isNotNull())
    .withColumn("resultDescription_listado", activity_json)
    # Parse the intermediate string back into columns using the same schema.
    .select(from_json(col("resultDescription_listado"), schema).alias("parsed"))
    .select("parsed.*")
    # Keep only "View Report" events and drop exact duplicates.
    .filter(col("Activity") == "View Report")
    .distinct()
)

# Show only the first MAX_RECORDS rows.
df.limit(MAX_RECORDS).display()


In [0]:
## VALIDATION TEST - ingesta_logs_blob

# Azure Blob Storage connection (connection string comes from Databricks Secrets).
conn_string = dbutils.secrets.get(scope='secret-storeview', key='connectionstring-portalbi')
container_name = "insights-logs-appserviceconsolelogs"

# Load-control parameters: stop after MAX_BLOBS ".json" blobs have been read or once
# MAX_RECORDS records have been collected, whichever happens first.
MAX_BLOBS = 10
MAX_RECORDS = 1000

blob_service = BlobServiceClient.from_connection_string(conn_string)
container_client = blob_service.get_container_client(container_name)

data_list = []
blob_count = 0

for blob in container_client.list_blobs():
    if blob.name.endswith('.json'):
        blob_count += 1
        # readall() + decode replaces the deprecated content_as_text() helper.
        content = container_client.get_blob_client(blob).download_blob().readall().decode("utf-8")
        # Each line of the blob is treated as one JSON document.
        for entry in content.strip().split('\n'):
            if len(data_list) >= MAX_RECORDS:
                break
            try:
                record = json.loads(entry)
            except json.JSONDecodeError:
                # Skip lines that are not valid JSON; any other error should surface.
                continue
            # Only JSON objects can become DataFrame rows; skip bare scalars/arrays.
            if isinstance(record, dict):
                data_list.append(record)
    if blob_count >= MAX_BLOBS or len(data_list) >= MAX_RECORDS:
        break

# Spark cannot infer a schema from an empty list, so fail early with a clear message.
if not data_list:
    raise ValueError(
        f"No JSON records found in container '{container_name}' "
        f"({blob_count} .json blob(s) read)"
    )

# Target schema for the parsed activity records.
schema = StructType([
    StructField("Activity", StringType(), True),
    StructField("Email", StringType(), True),
    StructField("ItemId", StringType(), True),
    StructField("Item", StringType(), True),
    StructField("Time", TimestampType(), True)
])

df = spark.createDataFrame(data_list)
# Keep only log lines whose resultDescription mentions "Activity".
df = df.filter("resultDescription like '%Activity%'")
# Intermediate JSON string: the first {...} group in resultDescription, re-wrapped in
# braces with the record's "time" value appended as a "Time" key.
df = df.withColumn("resultDescription_listado", concat(
    lit("{"),
    regexp_extract(col("resultDescription"), "\\{(.*?)\\}", 1),
    lit(',"Time":"'), col("time"), lit('"}')
))
# Parse the intermediate string into typed columns.
df = df.select(from_json(col("resultDescription_listado"), schema).alias("parsed")).select("parsed.*")
# Keep only "View Report" events and drop exact duplicates.
df = df.filter(col("Activity") == "View Report").distinct()

# Show a sample only (does not load everything).
df.limit(100).display()




In [0]:
# Persist `df` to Delta: append to the bronze path, then make sure a table is
# registered on that location and refresh it.
# NOTE(review): `df` is whatever DataFrame the most recently executed cell left in the
# kernel, and "append" mode adds the rows again on every re-run -- confirm both are intended.
path_delta = "/mnt/bronze/delta/insights_logs_appserviceconsolelogs"

(
    df.write
    .format("delta")
    .mode("append")
    .save(path_delta)
)

create_table_sql = f"CREATE TABLE IF NOT EXISTS bronze.insights_logs_appserviceconsolelogs USING DELTA LOCATION '{path_delta}/'"
spark.sql(create_table_sql)
spark.sql("REFRESH TABLE bronze.insights_logs_appserviceconsolelogs")

In [0]:
# Read every column and row back from the bronze table and render the result.
consulta = spark.sql("select * from bronze.insights_logs_appserviceconsolelogs")
display(consulta)