In [1]:
pip install delta-spark==3.3.2 dotenv

Collecting pyspark<3.6.0,>=3.5.3 (from delta-spark==3.3.2)
  Using cached pyspark-3.5.7-py2.py3-none-any.whl
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.0
    Can't uninstall 'pyspark'. No files were found to uninstall.
Successfully installed pyspark-3.5.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from datetime import datetime

In [3]:
import os

from dotenv import load_dotenv

# Load MinIO connection settings from the workspace .env file into the process environment.
# load_dotenv() does not raise when the file is absent, so the values are validated explicitly
# below instead of relying on the file having been found.
load_dotenv('/opt/workspace/.env')
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET = os.getenv("MINIO_ROOT_PASSWORD")

# Fail fast: a missing variable would otherwise reach the Spark builder as None and only
# surface later as a hard-to-read S3A connection/authentication error.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("MINIO_ENDPOINT_DOCKER", MINIO_ENDPOINT),
        ("MINIO_ROOT_USER", MINIO_ACCESS),
        ("MINIO_ROOT_PASSWORD", MINIO_SECRET),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + " (expected in /opt/workspace/.env or the process environment)"
    )

In [4]:
# Hadoop S3A settings pointing Spark at the MinIO endpoint: credentials come from the
# environment, path-style addressing is enabled and SSL is switched off.
_s3a_options = {
    "spark.hadoop.fs.s3a.endpoint": MINIO_ENDPOINT,
    "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS,
    "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
}

# Delta Lake SQL extension and catalog implementation.
_delta_options = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Apply every option to a single builder, in the same order as listed above.
builder = SparkSession.builder.appName("SilverToGold_Delta")
for _option_key, _option_value in {**_s3a_options, **_delta_options}.items():
    builder = builder.config(_option_key, _option_value)

# Let the delta-spark helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
# Silver-layer inspection: print the schema and the first 10 rows of every Silver Delta path.
# `datetime` comes from the notebook's import cell; it is not re-imported here.

# Partition folder for the current run date, e.g. "data_ref=2025-11-12".
# NOTE(review): datetime.today() is the kernel's local date - confirm it matches the
# timezone the upstream job uses when it writes the data_ref folders.
today = "data_ref=" + datetime.today().strftime('%Y-%m-%d')
SILVER_PATH = "s3a://silver/"

# GTFS paths are read from their root folder (no date suffix).
routes = "gtfs/routes/"
trips = "gtfs/trips/"
stops = "gtfs/stops/"
stop_times = "gtfs/stop_times/"
shapes = "gtfs/shapes/"

# API paths are read from the folder of the current data_ref.
posicao = f"posicao/{today}"
linhas = f"linhas/{today}"
paradas = f"paradas/{today}"

gtfs = [routes, trips, stops, stop_times, shapes]
api = [posicao, linhas, paradas]

for data in gtfs + api:
    table_path = f"{SILVER_PATH}{data}"  # build the full path once per table
    print(f"\n📂 Lendo: {table_path}")
    df = spark.read.format("delta").load(table_path)
    df.printSchema()
    df.show(10, truncate=False)  # untruncated columns, first 10 rows only



📂 Lendo: s3a://silver/gtfs/routes/
root
 |-- letreiro: string (nullable = true)
 |-- route_color: string (nullable = true)
 |-- route_text_color: string (nullable = true)
 |-- nome_publico: string (nullable = true)
 |-- terminal_inicial: string (nullable = true)
 |-- terminal_final: string (nullable = true)

+--------+-----------+----------------+-----------------------------------------+------------------+--------------------+
|letreiro|route_color|route_text_color|nome_publico                             |terminal_inicial  |terminal_final      |
+--------+-----------+----------------+-----------------------------------------+------------------+--------------------+
|1012-10 |509E2F     |FFFFFF          |term. jd. britania - jd. monte belo      |term. jd. britania|jd. monte belo      |
|1012-21 |509E2F     |FFFFFF          |term. jd. britania - jd. rosinha         |term. jd. britania|jd. rosinha         |
|1014-10 |509E2F     |FFFFFF          |cptm perus - morro doce              

In [6]:
# Gold-layer inspection: print the schema and the first 10 rows of every Gold Delta path.
GOLD_PATH = "s3a://gold/"

# Dimension paths are read from their root folder; the fact path is read from the folder
# of the current data_ref (`today` is defined in the Silver inspection cell).
dim_linha = "dim_linha/"
dim_parada = "dim_parada/"
fato_posicao = f"fato_posicao/{today}"

gold = [dim_linha, dim_parada, fato_posicao]

for data in gold:
    table_path = f"{GOLD_PATH}{data}"  # build the full path once per table
    print(f"\n📂 Lendo: {table_path}")
    df = spark.read.format("delta").load(table_path)
    df.printSchema()
    df.show(10, truncate=False)  # untruncated columns, first 10 rows only


📂 Lendo: s3a://gold/dim_linha/
root
 |-- codigo_linha: long (nullable = true)
 |-- letreiro: string (nullable = true)
 |-- modo_circular: boolean (nullable = true)
 |-- regiao: string (nullable = true)
 |-- terminal_origem: string (nullable = true)
 |-- terminal_destino: string (nullable = true)
 |-- transform_gold_timestamp: timestamp (nullable = true)

+------------+--------+-------------+-----------------+--------------------+------------------+--------------------------+
|codigo_linha|letreiro|modo_circular|regiao           |terminal_origem     |terminal_destino  |transform_gold_timestamp  |
+------------+--------+-------------+-----------------+--------------------+------------------+--------------------------+
|2486        |1012-10 |true         |Área 1 - Noroeste|Jd. Monte Belo      |Term. Jd. Britania|2025-11-12 00:55:27.345849|
|2570        |1012-21 |true         |Área 1 - Noroeste|Jd. Rosinha         |Term. Jd. Britania|2025-11-12 00:55:27.345849|
|2609        |1014-10 

In [7]:
# Stop the Spark session created earlier in this notebook now that the inspection is done.
# Any cell run after this point needs the session to be built again first.
spark.stop()