In [1]:
import os

from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, current_timestamp, lit, to_timestamp
from pyspark.sql.types import *

In [2]:
# MinIO connection settings, read from the environment.
# Each value is None when the corresponding variable is not set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ROOT_USER")
MINIO_SECRET_KEY = os.environ.get("MINIO_ROOT_PASSWORD")

# Bronze input glob (three wildcard directory levels below posicao/)
# and the silver output location.
BRONZE_PATH = "s3a://bronze/posicao/*/*/*/"
SILVER_PATH = "s3a://silver/posicao/"

In [3]:
# Fail fast when a MinIO setting is missing: otherwise the endpoint below would
# silently become "http://None" and the credentials would be passed as None.
_missing_settings = [
    env_name
    for env_name, value in (
        ("MINIO_ENDPOINT_DOCKER", MINIO_ENDPOINT),
        ("MINIO_ROOT_USER", MINIO_ACCESS_KEY),
        ("MINIO_ROOT_PASSWORD", MINIO_SECRET_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Only prepend a scheme when the configured endpoint does not already carry one,
# so a value such as "http://host:9000" is not turned into "http://http://host:9000".
s3a_endpoint = (
    MINIO_ENDPOINT
    if MINIO_ENDPOINT.startswith(("http://", "https://"))
    else f"http://{MINIO_ENDPOINT}"
)

# Session builder: Delta Lake SQL extension/catalog plus S3A settings for MinIO
# (endpoint, credentials, path-style access).
builder = (
    SparkSession.builder.appName("BronzeToSilver_Delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
)

# configure_spark_with_delta_pip comes from the `delta` package (imported in the
# notebook's import cell); it returns the builder used to create the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark

In [4]:
# One vehicle entry inside a line's "vs" array.
_vehicle_struct = (
    StructType()
    .add("p", IntegerType(), True)   # vehicle code
    .add("a", BooleanType(), True)   # accessibility
    .add("ta", StringType(), True)   # update timestamp
    .add("py", DoubleType(), True)   # latitude
    .add("px", DoubleType(), True)   # longitude
)

# One line entry inside the top-level "l" array.
_line_struct = (
    StructType()
    .add("c", StringType(), True)     # displayed line code (e.g. "6L10-10")
    .add("cl", IntegerType(), True)   # numeric line code
    .add("sl", IntegerType(), True)   # direction (1 = outbound, 2 = return)
    .add("lt0", StringType(), True)   # starting terminal
    .add("lt1", StringType(), True)   # end terminal
    .add("qv", IntegerType(), True)   # number of vehicles
    .add("vs", ArrayType(_vehicle_struct), True)
)

# Top-level document: "hr" string plus the array of line entries.
schema = (
    StructType()
    .add("hr", StringType(), True)
    .add("l", ArrayType(_line_struct), True)
)

In [5]:
# Read the bronze JSON files using the explicit `schema` declared in this notebook.
# Without .schema(...) Spark infers the schema from the files, which leaves the
# declared schema unused and makes column types depend on whatever data is present.
# NOTE(review): with the explicit schema the integer fields are read as IntegerType
# as declared; confirm this matches any existing silver table.
df_raw = (
    spark.read
    .schema(schema)
    .option("multiline", True)
    .json(BRONZE_PATH)
)

In [6]:
# First produce one row per element of the "l" array (column "linha") ...
df_lines = df_raw.withColumn("linha", explode(col("l")))

# ... then one row per element of that line's "vs" array (column "veiculo").
df_exploded = df_lines.withColumn("veiculo", explode(col("linha.vs")))

In [7]:
# (source field, silver column name) pairs, listed in output column order.
column_aliases = [
    ("linha.c", "codigo_linha"),
    ("linha.lt0", "terminal_inicial"),
    ("linha.lt1", "terminal_final"),
    ("linha.sl", "sentido"),
    ("veiculo.p", "codigo_veiculo"),
    ("veiculo.a", "acessibilidade"),
    ("veiculo.ta", "ultima_atualizacao"),
    ("veiculo.py", "latitude"),
    ("veiculo.px", "longitude"),
    ("hr", "hora_referencia"),
]

# Flatten the nested line/vehicle structs into plain columns.
df_flat = df_exploded.select(*[col(source).alias(target) for source, target in column_aliases])

# Parse the reference time string into a timestamp column.
# NOTE(review): in the recorded output the date part of hora_referencia equals the
# run date (see ingest_timestamp) rather than the date in ultima_atualizacao, which
# suggests "hr" is time-only and gets the run date filled in. Confirm, since this
# column is part of the dedup key below.
df_typed = df_flat.withColumn("hora_referencia", to_timestamp("hora_referencia"))

# Stamp each row with the processing time.
df_stamped = df_typed.withColumn("ingest_timestamp", current_timestamp())

# Keep a single row per (codigo_veiculo, hora_referencia) pair.
df_clean = df_stamped.dropDuplicates(["codigo_veiculo", "hora_referencia"])

df_clean.show(30, truncate=False)

+------------+------------------------+---------------------+-------+--------------+--------------+--------------------+-------------------+-------------------+-------------------+--------------------------+
|codigo_linha|terminal_inicial        |terminal_final       |sentido|codigo_veiculo|acessibilidade|ultima_atualizacao  |latitude           |longitude          |hora_referencia    |ingest_timestamp          |
+------------+------------------------+---------------------+-------+--------------+--------------+--------------------+-------------------+-------------------+-------------------+--------------------------+
|414P-10     |TERM. NORTE METRÔ CARRÃO|VL. INDUSTRIAL       |2      |3117          |false         |2025-10-21T00:34:06Z|-23.536710499999998|-46.565512625000004|2025-10-28 21:34:00|2025-10-28 00:39:38.808282|
|414P-10     |TERM. NORTE METRÔ CARRÃO|VL. INDUSTRIAL       |1      |3117          |false         |2025-10-21T01:39:35Z|-23.610801000000002|-46.5299455        |2025-10-

In [8]:
# Rows are matched on vehicle code + reference time.
merge_condition = "t.codigo_veiculo = s.codigo_veiculo AND t.hora_referencia = s.hora_referencia"

if not DeltaTable.isDeltaTable(spark, SILVER_PATH):
    # No Delta table at the silver path yet: write df_clean as the initial table.
    df_clean.write.format("delta").mode("overwrite").save(SILVER_PATH)
else:
    # Existing Delta table: upsert. Matched rows are updated with all source
    # columns, unmatched source rows are inserted.
    delta_table = DeltaTable.forPath(spark, SILVER_PATH)
    upsert = (
        delta_table.alias("t")
        .merge(df_clean.alias("s"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()

In [9]:
# Report completion, then stop the Spark session.
completion_message = "✅ Transformação Bronze → Silver concluída para 'posicao'."
print(completion_message)
spark.stop()

✅ Transformação Bronze → Silver concluída para 'posicao'.
