In [1]:
pip install delta-spark==3.3.2 dotenv

Collecting delta-spark==3.3.2
  Using cached delta_spark-3.3.2-py3-none-any.whl.metadata (2.2 kB)
Collecting dotenv
  Using cached dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting pyspark<3.6.0,>=3.5.3 (from delta-spark==3.3.2)
  Downloading pyspark-3.5.7.tar.gz (317.4 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m317.4/317.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting python-dotenv (from dotenv)
  Using cached python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.3->delta-spark==3.3.2)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached delta_spark-3.3.2-py3-none-any.whl (22 kB)
Using cached dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Using cached python

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from datetime import datetime

In [3]:
from dotenv import load_dotenv
import os
load_dotenv('/opt/workspace/.env')
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET = os.getenv("MINIO_ROOT_PASSWORD")

In [4]:
builder = (
    SparkSession.builder.appName("SilverToGold_Delta")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Delta Lake
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
#=============================================
#=============================================
# DIM_LINHA             ROUTES + LINHAS
#=============================================
#=============================================

In [6]:
today = "data_ref=" + datetime.today().strftime('%Y-%m-%d')
print(today)
SILVER_LINHAS_PATH = f"s3a://silver/linhas/{today}"
SILVER_ROUTES_PATH = "s3a://silver/gtfs/routes/"
GOLD_DIM_LINHA_PATH = "s3a://gold/dim_linha/"

print(f"üìÇ Lendo Silver Linhas: {SILVER_LINHAS_PATH}")
print(f"üìÇ Lendo Silver GTFS Routes: {SILVER_ROUTES_PATH}")

data_ref=2025-11-11
üìÇ Lendo Silver Linhas: s3a://silver/linhas/data_ref=2025-11-11
üìÇ Lendo Silver GTFS Routes: s3a://silver/gtfs/routes/


In [7]:
# Leitura
df_api = spark.read.format("delta").load(SILVER_LINHAS_PATH)
df_gtfs = spark.read.format("delta").load(SILVER_ROUTES_PATH)
df_api = df_api.alias("api")
df_gtfs = df_gtfs.alias("gtfs")

df_api.show(5)
df_gtfs.show(5)

+------------+-------------+--------+-------------+------------------+-------------------+----------+--------------------+
|codigo_linha|modo_circular|letreiro|sentido_linha|terminal_principal|terminal_secundario|  data_ref|    ingest_timestamp|
+------------+-------------+--------+-------------+------------------+-------------------+----------+--------------------+
|       35254|         true| 1012-10|            2|    jd. monte belo| term. jd. britania|2025-11-11|2025-11-11 02:46:...|
|        2486|         true| 1012-10|            1|    jd. monte belo| term. jd. britania|2025-11-11|2025-11-11 02:46:...|
|        2570|         true| 1012-21|            1|       jd. rosinha| term. jd. britania|2025-11-11|2025-11-11 02:46:...|
|       35338|         true| 1012-21|            2|       jd. rosinha| term. jd. britania|2025-11-11|2025-11-11 02:46:...|
|        2609|         true| 1014-10|            1|        morro doce|         cptm perus|2025-11-11|2025-11-11 02:46:...|
+------------+--

In [8]:
dim_linha = (
    df_api
    .filter(F.col("sentido_linha") == 1)
    .join(
        df_gtfs,
        F.col("api.letreiro") == F.col("gtfs.letreiro"),
        "left"
    )
    .select(
        F.col("api.codigo_linha"),
        F.col("api.letreiro"),
        F.col("api.modo_circular"),
        F.col("api.terminal_principal"),
        F.col("api.terminal_secundario"),
        F.col("gtfs.route_color"),
        F.current_timestamp().alias("transform_gold_timestamp")
    )
)    

dim_linha.show(10)

dim_linha.select("route_color").distinct().show()

+------------+--------+-------------+--------------------+-------------------+-----------+------------------------+
|codigo_linha|letreiro|modo_circular|  terminal_principal|terminal_secundario|route_color|transform_gold_timestamp|
+------------+--------+-------------+--------------------+-------------------+-----------+------------------------+
|        2486| 1012-10|         true|      jd. monte belo| term. jd. britania|     509E2F|    2025-11-11 18:22:...|
|        2570| 1012-21|         true|         jd. rosinha| term. jd. britania|     509E2F|    2025-11-11 18:22:...|
|        2609| 1014-10|         true|          morro doce|         cptm perus|     509E2F|    2025-11-11 18:22:...|
|        2230| 1015-10|         true|chac. maria trindade| term. jd. britania|     509E2F|    2025-11-11 18:22:...|
|         831| 1016-10|        false|  shop. center norte|      cem. do horto|     002F6C|    2025-11-11 18:22:...|
|        2141| 1017-10|        false|   conexao vl. iorio|              

In [9]:
dim_linha = (
    dim_linha
    .withColumn(
        "regiao",
        F.when(F.col("route_color") == "509E2F", "√Årea 1 - Noroeste")
         .when(F.col("route_color") == "002F6C", "√Årea 2 - Norte")
         .when(F.col("route_color") == "FFD100", "√Årea 3 - Nordeste")
         .when(F.col("route_color") == "DA291C", "√Årea 4 - Leste")
         .when(F.col("route_color") == "006341", "√Årea 5 - Sudeste")
         .when(F.col("route_color") == "0082BA", "√Årea 6 - Sul")
         .when(F.col("route_color") == "782F40", "√Årea 7 - Sudoeste")
         .when(F.col("route_color") == "FF671F", "√Årea 8 - Oeste")
         .otherwise("Outros")
    )
    .drop("route_color")
)
dim_linha.show(20)

+------------+--------+-------------+--------------------+-------------------+------------------------+-----------------+
|codigo_linha|letreiro|modo_circular|  terminal_principal|terminal_secundario|transform_gold_timestamp|           regiao|
+------------+--------+-------------+--------------------+-------------------+------------------------+-----------------+
|        2486| 1012-10|         true|      jd. monte belo| term. jd. britania|    2025-11-11 18:22:...|√Årea 1 - Noroeste|
|        2570| 1012-21|         true|         jd. rosinha| term. jd. britania|    2025-11-11 18:22:...|√Årea 1 - Noroeste|
|        2609| 1014-10|         true|          morro doce|         cptm perus|    2025-11-11 18:22:...|√Årea 1 - Noroeste|
|        2230| 1015-10|         true|chac. maria trindade| term. jd. britania|    2025-11-11 18:22:...|√Årea 1 - Noroeste|
|         831| 1016-10|        false|  shop. center norte|      cem. do horto|    2025-11-11 18:22:...|   √Årea 2 - Norte|
|        2141| 1017

In [10]:
# Verificar se as regi√µes e o letreiro/terminal correspondem com dados reais
from pyspark.sql import functions as F, Window

# Janela particionada por regi√£o, numerando cada linha dentro de cada grupo
window = Window.partitionBy("regiao").orderBy(F.rand())

# Adiciona n√∫mero da linha por grupo
df_sample = (
    dim_linha
    .withColumn("row_number", F.row_number().over(window))
    .filter(F.col("row_number") <= 3)   # pega 3 de cada regi√£o
    .drop("row_number")
)

df_sample.show(50, truncate=False)

+------------+--------+-------------+---------------------+-------------------+--------------------------+-----------------+
|codigo_linha|letreiro|modo_circular|terminal_principal   |terminal_secundario|transform_gold_timestamp  |regiao           |
+------------+--------+-------------+---------------------+-------------------+--------------------------+-----------------+
|2450        |189L-10 |false        |lapa                 |conexao vl. iorio  |2025-11-11 18:22:09.405326|√Årea 1 - Noroeste|
|432         |8594-10 |false        |pca. ramos de azevedo|cid. d'abril       |2025-11-11 18:22:09.405326|√Årea 1 - Noroeste|
|412         |8050-10 |false        |lapa                 |pq. morro doce     |2025-11-11 18:22:09.405326|√Årea 1 - Noroeste|
|658         |971A-10 |false        |shop. d              |jd. primavera      |2025-11-11 18:22:09.405326|√Årea 2 - Norte   |
|2314        |N204-11 |false        |term. pq. d. pedro ii|metro tucuruvi     |2025-11-11 18:22:09.405326|√Årea 2 - Norte

In [11]:
# (
#     dim_linha
#         .write.format("delta")
#         .mode("overwrite")
#         .save(GOLD_DIM_LINHA_PATH)
# )
# print(f"dim_linha salvo em {GOLD_DIM_LINHA_PATH}")

In [12]:
#=============================================
#=============================================
#=============================================
# DIM_PARADAS
#=============================================
#=============================================
#=============================================

In [13]:
SILVER_PARADAS_PATH = f"s3a://silver/paradas/{today}"
SILVER_STOPS_PATH = "s3a://silver/gtfs/stops/"
GOLD_DIM_LINHA_PATH = "s3a://gold/dim_linha/"
GOLD_DIM_PARADA_PATH = "s3a://gold/dim_parada/"

print(f"üìÇ Lendo Silver Linhas: {SILVER_LINHAS_PATH}")
print(f"üìÇ Lendo Silver GTFS Routes: {SILVER_ROUTES_PATH}")
print(f"üìÇ Lendo Gold dim_linhas: {GOLD_DIM_LINHA_PATH}")

üìÇ Lendo Silver Linhas: s3a://silver/linhas/data_ref=2025-11-11
üìÇ Lendo Silver GTFS Routes: s3a://silver/gtfs/routes/
üìÇ Lendo Gold dim_linhas: s3a://gold/dim_linha/


In [14]:
df_api = spark.read.format("delta").load(SILVER_PARADAS_PATH)
df_gtfs = spark.read.format("delta").load(SILVER_STOPS_PATH)
dim_linha = spark.read.format("delta").load(GOLD_DIM_LINHA_PATH)

df_api.show(3)
df_gtfs.show(3)
dim_linha.show(3)

+------------+-------------+-----------+--------------------+----------+----------+----------+--------------------+
|codigo_linha|codigo_parada|nome_parada|            endereco|  latitude| longitude|  data_ref|    ingest_timestamp|
+------------+-------------+-----------+--------------------+----------+----------+----------+--------------------+
|        2497|    640000381|guerino c/b|r coracao de bugr...| -23.48455|-46.723093|2025-11-11|2025-11-11 02:47:...|
|        2497|    640000382|guerino b/c|av miguel de cast...|-23.484536|-46.722711|2025-11-11|2025-11-11 02:47:...|
|        2559|    640000360|  pinel b/c|r mal. mendes de ...|-23.485948|-46.724827|2025-11-11|2025-11-11 02:47:...|
+------------+-------------+-----------+--------------------+----------+----------+----------+--------------------+
only showing top 3 rows

+---------+-------------+----------+----------+
|id_parada|  nome_parada|  latitude| longitude|
+---------+-------------+----------+----------+
|    18848|     cli