In [1]:
pip install delta-spark==3.3.2 dotenv

Collecting delta-spark==3.3.2
  Downloading delta_spark-3.3.2-py3-none-any.whl.metadata (2.2 kB)
Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting pyspark<3.6.0,>=3.5.3 (from delta-spark==3.3.2)
  Downloading pyspark-3.5.7.tar.gz (317.4 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m317.4/317.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.3->delta-spark==3.3.2)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading delta_spark-3.3.2-py3-none-any.whl (22 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from datetime import datetime

In [3]:
from dotenv import load_dotenv
import os

# Load the workspace .env file into the process environment, then read the
# MinIO connection settings from it.
load_dotenv('/opt/workspace/.env')
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET = os.getenv("MINIO_ROOT_PASSWORD")

# os.getenv returns None for an unset variable. Stop here with a clear message
# instead of handing None to the Spark S3A configuration. Only the variable
# names are reported, never their values.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("MINIO_ENDPOINT_DOCKER", MINIO_ENDPOINT),
        ("MINIO_ROOT_USER", MINIO_ACCESS),
        ("MINIO_ROOT_PASSWORD", MINIO_SECRET),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + " (expected in /opt/workspace/.env or the process environment)"
    )

In [4]:
# Session settings, kept in one mapping so they can be read at a glance.
spark_settings = {
    # S3A connector pointed at the MinIO endpoint: path-style URLs, SSL disabled.
    "spark.hadoop.fs.s3a.endpoint": MINIO_ENDPOINT,
    "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS,
    "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    # Delta Lake SQL extension and catalog.
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("SilverToGold_Delta")
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

# Let delta-spark adjust the builder, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
from datetime import datetime

# Folder name of the current day's partition, in the form "data_ref=YYYY-MM-DD".
today = "data_ref=" + datetime.today().strftime('%Y-%m-%d')
SILVER_PATH = "s3a://silver/"

# GTFS datasets are read from fixed folders under the silver bucket.
routes = "gtfs/routes/"
trips = "gtfs/trips/"
stops = "gtfs/stops/"
stop_times = "gtfs/stop_times/"
shapes = "gtfs/shapes/"

# API datasets are read from the folder of the current day's partition.
posicao = f"posicao/{today}"
linhas = f"linhas/{today}"
paradas = f"paradas/{today}"

gtfs = [routes, trips, stops, stop_times, shapes]
api = [posicao, linhas, paradas]

# Preview every silver dataset: print its location, then show 10 rows with
# truncate=False so long column values are displayed in full.
for data in gtfs + api:
    # The folder emoji in this message had been mis-encoded; it is restored here.
    print(f"\n📂 Lendo: {SILVER_PATH}{data}")
    df = spark.read.format("delta").load(f"{SILVER_PATH}{data}")
    df.show(10, False)
    


üìÇ Lendo: s3a://silver/gtfs/routes/
+--------+-----------+----------------+-----------------------------------------+------------------+--------------------+
|letreiro|route_color|route_text_color|nome_publico                             |terminal_inicial  |terminal_final      |
+--------+-----------+----------------+-----------------------------------------+------------------+--------------------+
|1012-10 |509E2F     |FFFFFF          |term. jd. britania - jd. monte belo      |term. jd. britania|jd. monte belo      |
|1012-21 |509E2F     |FFFFFF          |term. jd. britania - jd. rosinha         |term. jd. britania|jd. rosinha         |
|1014-10 |509E2F     |FFFFFF          |cptm perus - morro doce                  |cptm perus        |morro doce          |
|1015-10 |509E2F     |FFFFFF          |term. jd. britania - chac. maria trindade|term. jd. britania|chac. maria trindade|
|1016-10 |002F6C     |FFFFFF          |cem. do horto - shop. center norte       |cem. do horto     |shop. c

In [7]:
# Read the "posicao" Delta data for the current day's partition folder.
posicao_master_path = f"s3a://silver/posicao/{today}"
df_posicao_master = spark.read.format("delta").load(posicao_master_path)

+--------+------------+-------+----------------+-----------------+--------------+--------------+-------------------+----------+----------+-------------------+----------+--------------------+
|letreiro|codigo_linha|sentido|terminal_inicial|   terminal_final|codigo_veiculo|acessibilidade| ultima_atualizacao|  latitude| longitude|    hora_referencia|  data_ref|    ingest_timestamp|
+--------+------------+-------+----------------+-----------------+--------------+--------------+-------------------+----------+----------+-------------------+----------+--------------------+
| 5031-10|        1746|      1|    term. sacoma|       vl. arapua|         55378|          true|2025-11-11 15:57:41|-23.624632|-46.590992|2025-11-11 12:58:00|2025-11-11|2025-11-11 15:58:...|
| 5031-10|        1746|      1|    term. sacoma|       vl. arapua|         55020|          true|2025-11-11 15:57:18|-23.628393|-46.590581|2025-11-11 12:58:00|2025-11-11|2025-11-11 15:58:...|
| 5031-10|        1746|      1|    term. saco

861558

In [10]:
# Preview the first 30 rows.
df_posicao_master.show(30)

# A bare expression is only echoed when it is the last line of a cell, so the
# row count used to be computed (a full Spark job) and then discarded.
# Print it explicitly.
print(df_posicao_master.count())

# Distinct (codigo_veiculo, ultima_atualizacao) pairs; show() without arguments
# prints the first 20 rows.
df_posicao_master.select("codigo_veiculo", "ultima_atualizacao").distinct().show()

+--------+------------+-------+----------------+-----------------+--------------+--------------+-------------------+----------+----------+-------------------+----------+--------------------+
|letreiro|codigo_linha|sentido|terminal_inicial|   terminal_final|codigo_veiculo|acessibilidade| ultima_atualizacao|  latitude| longitude|    hora_referencia|  data_ref|    ingest_timestamp|
+--------+------------+-------+----------------+-----------------+--------------+--------------+-------------------+----------+----------+-------------------+----------+--------------------+
| 5031-10|        1746|      1|    term. sacoma|       vl. arapua|         55378|          true|2025-11-11 15:57:41|-23.624632|-46.590992|2025-11-11 12:58:00|2025-11-11|2025-11-11 15:58:...|
| 5031-10|        1746|      1|    term. sacoma|       vl. arapua|         55020|          true|2025-11-11 15:57:18|-23.628393|-46.590581|2025-11-11 12:58:00|2025-11-11|2025-11-11 15:58:...|
| 5031-10|        1746|      1|    term. saco