In [1]:
# ==========================================================
# üìò BRONZE ‚Üí SILVER (DELTA LAKE)
# ==========================================================
# Este notebook l√™ os dados brutos de posi√ß√£o dos √¥nibus no MinIO (camada Bronze),
# transforma-os em formato tabular e escreve incrementalmente em formato Delta
# na camada Silver.
# ==========================================================

In [2]:
import os
from datetime import datetime
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, explode, current_timestamp, to_timestamp, to_date, when, lit
from delta.tables import DeltaTable

MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS_KEY = os.getenv("MINIO_ROOT_USER",)
MINIO_SECRET_KEY = os.getenv("MINIO_ROOT_PASSWORD")

today = datetime.now().strftime("%Y/%m/%d")
BRONZE_PATH = f"s3a://bronze/posicao/{today}"
#BRONZE_PATH = "s3a://bronze/posicao/*/*/*/"    # USAR UMA VEZ PRA PROCESSAR TODOS OS JSONS
SILVER_PATH = "s3a://silver/posicao/"


print(f"Lendo Bronze de: {BRONZE_PATH}")
print(f"Escrevendo Silver em: {SILVER_PATH}")

Lendo Bronze de: s3a://bronze/posicao/2025/11/03
Escrevendo Silver em: s3a://silver/posicao/


In [3]:
# Inicializa√ß√£o do Spark com suporte ao Delta Lake e MinIO
builder = (
    SparkSession.builder.appName("BronzeToSilver_Delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{MINIO_ENDPOINT}")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark

In [4]:
# Definir schema expl√≠cito para evitar erro de infer√™ncia

schema = StructType([
    StructField("hr", StringType(), True),
    StructField("l", ArrayType(
        StructType([
            StructField("c", StringType(), True),   # c√≥digo vis√≠vel (ex: "6L10-10")
            StructField("cl", IntegerType(), True), # c√≥digo de linha usado pra outras chamadas de API
            StructField("sl", IntegerType(), True), # sentido (1 = ida, 2 = volta)
            StructField("lt0", StringType(), True), # terminal inicial
            StructField("lt1", StringType(), True), # terminal final
            StructField("qv", IntegerType(), True), # quantidade de ve√≠culos
            StructField("vs", ArrayType(
                StructType([
                    StructField("p", IntegerType(), True),  # c√≥digo do ve√≠culo
                    StructField("a", BooleanType(), True),  # acessibilidade
                    StructField("ta", StringType(), True),  # timestamp de atualiza√ß√£o
                    StructField("py", DoubleType(), True),  # latitude
                    StructField("px", DoubleType(), True),  # longitude
                ])
            ), True)
        ])
    ), True)
])

In [5]:
df_raw = spark.read.option("multiline", True).schema(schema).json(BRONZE_PATH)
df_raw.printSchema()

root
 |-- hr: string (nullable = true)
 |-- l: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- c: string (nullable = true)
 |    |    |-- cl: integer (nullable = true)
 |    |    |-- sl: integer (nullable = true)
 |    |    |-- lt0: string (nullable = true)
 |    |    |-- lt1: string (nullable = true)
 |    |    |-- qv: integer (nullable = true)
 |    |    |-- vs: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- p: integer (nullable = true)
 |    |    |    |    |-- a: boolean (nullable = true)
 |    |    |    |    |-- ta: string (nullable = true)
 |    |    |    |    |-- py: double (nullable = true)
 |    |    |    |    |-- px: double (nullable = true)



In [6]:
# Explode arrays e remover nulos 
df_exploded = (
    df_raw
    .withColumn("linha", explode(col("l")))
    .withColumn("veiculo", explode(col("linha.vs")))
    .filter(col("linha.cl").isNotNull())
)

# Classificando o tipo da linha - onibus e outros
df_exploded = df_exploded.withColumn(
    "tipo_linha",
    when((col("linha.cl") < 1000) | (col("linha.c").rlike("GUIN|TEST|TST")), lit("tecnica"))
    .otherwise(lit("regular"))
    .filter(col("tipo_linha") == "regular")
)

TypeError: 'Column' object is not callable

In [None]:
df_clean = (
    df_exploded.select(
        col("linha.c").alias("codigo_linha_texto"),
        col("linha.cl").alias("codigo_linha"),
        col("tipo_linha"),
        col("linha.sl").alias("sentido"),
        col("linha.lt0").alias("terminal_inicial"),
        col("linha.lt1").alias("terminal_final"),
        col("veiculo.p").alias("codigo_veiculo"),
        col("veiculo.a").alias("acessibilidade"),
        to_timestamp(col("veiculo.ta")).alias("ultima_atualizacao"),
        col("veiculo.py").alias("latitude"),
        col("veiculo.px").alias("longitude"),
        to_timestamp(col("hr")).alias("hora_referencia"),
    )
    .dropDuplicates(["codigo_veiculo", "hora_referencia"])
    .withColumn("data_ref", to_date(col("ultima_atualizacao"))) # data da √∫ltima atualiza√ß√£o do √¥nibus. para particionamento no delta lake
    .withColumn("ingest_timestamp", current_timestamp()) # timestamp do spark job
)
df_clean.show(10)

In [None]:
# Escrita ou merge no Delta Lake

if DeltaTable.isDeltaTable(spark, SILVER_PATH):
    print("‚öôÔ∏è Atualizando tabela Delta existente...")
    delta_table = DeltaTable.forPath(spark, SILVER_PATH)
    (
        delta_table.alias("t")
        .merge(
            df_clean.alias("s"),
            "t.codigo_veiculo = s.codigo_veiculo AND t.data_ref = s.data_ref"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    print("üÜï Criando tabela Delta inicial...")
    (
        df_clean
        .write
        .format("delta")
        .mode("append")
        .partitionBy("codigo_linha", "data_ref")
        .save(SILVER_PATH)
    )

In [None]:
# Verifica√ß√£o de resultado

silver_delta = DeltaTable.forPath(spark, SILVER_PATH)
df_result = silver_delta.toDF()

print("Total de registros na Silver:")
print(df_result.count())

df_result.show(5)