In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, current_timestamp, lit, to_timestamp
from pyspark.sql.types import *
import os

In [2]:
# MinIO connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ROOT_USER")
MINIO_SECRET_KEY = os.environ.get("MINIO_ROOT_PASSWORD")

# Bronze input glob (three wildcard directory levels below posicao/)
# and the silver output location.
BRONZE_PATH = "s3a://bronze/posicao/*/*/*/"
SILVER_PATH = "s3a://silver/posicao/"

In [3]:
# Fail fast when any MinIO setting is missing. Without this check an unset
# endpoint is interpolated into the URL as "http://None" and the problem only
# surfaces later, on the first S3A access.
_missing_minio_settings = [
    env_name
    for env_name, value in (
        ("MINIO_ENDPOINT_DOCKER", MINIO_ENDPOINT),
        ("MINIO_ROOT_USER", MINIO_ACCESS_KEY),
        ("MINIO_ROOT_PASSWORD", MINIO_SECRET_KEY),
    )
    if not value
]
if _missing_minio_settings:
    raise EnvironmentError(
        "Missing MinIO environment variables: " + ", ".join(_missing_minio_settings)
    )

# Spark session configured to reach MinIO through the S3A connector.
spark = (
    SparkSession.builder.appName("Transform_Posicao_Bronze_to_Silver")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{MINIO_ENDPOINT}")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    # Passed as the literal string so the value does not depend on how the
    # installed PySpark version stringifies a Python bool.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

In [4]:
# One vehicle entry inside a line's "vs" array.
_vehicle_struct = StructType([
    StructField("p", IntegerType(), True),   # vehicle code
    StructField("a", BooleanType(), True),   # accessibility
    StructField("ta", StringType(), True),   # update timestamp
    StructField("py", DoubleType(), True),   # latitude
    StructField("px", DoubleType(), True),   # longitude
])

# One bus line entry inside the top-level "l" array.
_line_struct = StructType([
    StructField("c", StringType(), True),    # visible code (e.g. "6L10-10")
    StructField("cl", IntegerType(), True),  # numeric code
    StructField("sl", IntegerType(), True),  # direction (1 = outbound, 2 = return)
    StructField("lt0", StringType(), True),  # starting terminal
    StructField("lt1", StringType(), True),  # final terminal
    StructField("qv", IntegerType(), True),  # number of vehicles
    StructField("vs", ArrayType(_vehicle_struct), True),
])

# Top-level document: a reference time string plus the array of lines.
schema = StructType([
    StructField("hr", StringType(), True),
    StructField("l", ArrayType(_line_struct), True),
])

NameError: name 'ArrayType' is not defined

In [None]:
# Load the bronze JSON documents using the explicit `schema` declared in the
# schema cell, instead of letting Spark infer one. Inference costs an extra
# pass over the files and can produce types that drift between runs.
# NOTE(review): with an explicit schema, values that do not match the declared
# types are read as null rather than raising - confirm the declared types
# against real bronze files.
df_raw = (
    spark.read
    .schema(schema)
    .option("multiline", True)  # option kept exactly as in the original read
    .json(BRONZE_PATH)
)

In [None]:
# Quick sanity check of the load: print the column tree, then display a single
# untruncated row.
df_raw.printSchema()
df_raw.show(n=1, truncate=False)

In [None]:
# Flatten the nested arrays in two steps: one row per line ("linha"), then one
# row per vehicle ("veiculo") within that line. explode() emits no row for a
# null or empty array, so lines without vehicles do not appear in the result.
df_exploded = (
    df_raw
    .withColumn("linha", explode("l"))
    .withColumn("veiculo", explode("linha.vs"))
)

In [None]:
# (source field, silver column name) pairs, listed in output column order.
_silver_column_map = [
    ("linha.c", "codigo_linha"),
    ("linha.lt0", "terminal_inicial"),
    ("linha.lt1", "terminal_final"),
    ("linha.sl", "sentido"),
    ("veiculo.p", "codigo_veiculo"),
    ("veiculo.a", "acessibilidade"),
    ("veiculo.ta", "ultima_atualizacao"),
    ("veiculo.py", "latitude"),
    ("veiculo.px", "longitude"),
    ("hr", "hora_referencia"),
]

df_clean = (
    df_exploded
    # Project the nested fields into flat, renamed columns.
    .select(*[col(source).alias(target) for source, target in _silver_column_map])
    # No explicit format is given, so parsing follows Spark's default rules.
    # NOTE(review): confirm the format of "hr" is one those rules accept.
    .withColumn("hora_referencia", to_timestamp(col("hora_referencia")))
    # Record when this row was processed by the job.
    .withColumn("ingest_timestamp", current_timestamp())
    # Keep a single row per (vehicle, reference time) pair.
    .dropDuplicates(["codigo_veiculo", "hora_referencia"])
)

# Preview the first 30 cleaned rows without truncating cell values.
df_clean.show(n=30, truncate=False)

In [None]:
# Write the cleaned rows to the silver layer as Parquet, one directory per line
# code.
# NOTE(review): BRONZE_PATH globs the whole bronze tree and the write mode is
# "append", so re-running the notebook over the same bronze data appears to
# duplicate rows in silver - confirm whether that is intended.
(
    df_clean
    # Repartition by the partition column rather than collapsing to a single
    # partition: every codigo_linha still lands in exactly one task (so one
    # file per output directory for this job), but the write runs in parallel
    # instead of through a single task.
    .repartition("codigo_linha")
    .write.mode("append")
    .format("parquet")
    .partitionBy("codigo_linha")
    .save(SILVER_PATH)
)

In [None]:
# Report completion, then shut down the Spark session.
_done_message = "✅ Transformação Bronze → Silver concluída para 'posicao'."
print(_done_message)
spark.stop()