In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, current_timestamp, lit, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
import os

In [15]:
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT_DOCKER", "minio:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ROOT_USER", "admin")
MINIO_SECRET_KEY = os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")

BRONZE_PATH = "s3a://bronze/posicao/*/*/*/"
SILVER_PATH = "s3a://silver/posicao/"

In [16]:
spark = (
    SparkSession.builder.appName("Transform_Posicao_Bronze_to_Silver")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{MINIO_ENDPOINT}")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .getOrCreate()
)

In [17]:
df_raw = spark.read.option("multiline", True).json(BRONZE_PATH)

In [18]:
df_raw.printSchema()
df_raw.show(1, truncate=False)

root
 |-- hr: string (nullable = true)
 |-- l: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- c: string (nullable = true)
 |    |    |-- cl: long (nullable = true)
 |    |    |-- lt0: string (nullable = true)
 |    |    |-- lt1: string (nullable = true)
 |    |    |-- qv: long (nullable = true)
 |    |    |-- sl: long (nullable = true)
 |    |    |-- vs: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- a: boolean (nullable = true)
 |    |    |    |    |-- is: string (nullable = true)
 |    |    |    |    |-- p: long (nullable = true)
 |    |    |    |    |-- px: double (nullable = true)
 |    |    |    |    |-- py: double (nullable = true)
 |    |    |    |    |-- sv: string (nullable = true)
 |    |    |    |    |-- ta: string (nullable = true)



IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [19]:
df_exploded = (
    df_raw
    .withColumn("linha", explode(col("l")))
    .withColumn("veiculo", explode(col("linha.vs")))
)

In [25]:
df_clean = (
    df_exploded
    .select(
        col("linha.c").alias("codigo_linha"),
        col("linha.lt0").alias("terminal_inicial"),
        col("linha.lt1").alias("terminal_final"),
        col("linha.sl").alias("sentido"),
        col("veiculo.p").alias("codigo_veiculo"),
        col("veiculo.a").alias("acessibilidade"),
        col("veiculo.ta").alias("ultima_atualizacao"),
        col("veiculo.py").alias("latitude"),
        col("veiculo.px").alias("longitude"),
        col("hr").alias("hora_referencia")
    )
    .withColumn("hora_referencia", to_timestamp("hora_referencia"))
    .withColumn("ingest_timestamp", current_timestamp())
    .dropDuplicates(["codigo_veiculo", "hora_referencia"])
)
df_clean.show(30, truncate=False)

+------------+------------------+-----------------+-------+--------------+--------------+--------------------+-------------------+-------------------+-------------------+--------------------------+
|codigo_linha|terminal_inicial  |terminal_final   |sentido|codigo_veiculo|acessibilidade|ultima_atualizacao  |latitude           |longitude          |hora_referencia    |ingest_timestamp          |
+------------+------------------+-----------------+-------+--------------+--------------+--------------------+-------------------+-------------------+-------------------+--------------------------+
|4027-41     |SÃO MATEUS        |MORRO DO CRUZEIRO|2      |3117          |false         |2025-10-20T15:36:29Z|-23.637357         |-46.4383655        |2025-10-27 12:36:00|2025-10-27 19:51:21.160759|
|GUIN-10     |GUINCHO           |GUINCHO          |1      |7302          |false         |2025-10-20T18:53:54Z|-23.692902500000002|-46.778492         |2025-10-27 15:54:00|2025-10-27 19:51:21.160759|
|GUIN-10  

In [22]:
(
    df_clean
    .repartition(1)  # opcional: reduzir número de arquivos pequenos
    .write.mode("append")
    .format("parquet")
    .partitionBy("codigo_linha")
    .save(SILVER_PATH)
)

In [None]:
print("✅ Transformação Bronze → Silver concluída para 'posicao'.")
spark.stop()