In [1]:
pip install delta-spark==3.3.2 dotenv

Collecting pyspark<3.6.0,>=3.5.3 (from delta-spark==3.3.2)
  Using cached pyspark-3.5.7-py2.py3-none-any.whl
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.0
    Can't uninstall 'pyspark'. No files were found to uninstall.
Successfully installed pyspark-3.5.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession, functions as F, types as T
from delta import configure_spark_with_delta_pip

In [3]:
from dotenv import load_dotenv
import os
load_dotenv('/opt/workspace/.env')
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT_DOCKER")
MINIO_ACCESS = os.getenv("MINIO_ROOT_USER")
MINIO_SECRET = os.getenv("MINIO_ROOT_PASSWORD")
                           
BRONZE_PATH = f"s3a://bronze/gtfs/"
SILVER_PATH = f"s3a://silver/gtfs/"
print(f"Buscando arquivos em: {BRONZE_PATH}")

Buscando arquivos em: s3a://bronze/gtfs/


In [4]:
builder = (
    SparkSession.builder.appName("BronzeToSilver_Delta")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Delta Lake
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
import unicodedata

def strip_accents_py(txt):
    if txt is None:
        return None
    txt_norm = unicodedata.normalize('NFD', txt)
    return ''.join(c for c in txt_norm if unicodedata.category(c) != 'Mn')

strip_accents_udf = F.udf(strip_accents_py, T.StringType())

In [6]:
#-----------------------
# 1. ROUTES
#-----------------------

In [7]:
# 1. Criar schema e aplicar no arquivo
schema_routes = T.StructType([
    T.StructField("route_id",          T.StringType(), True),
    T.StructField("agency_id",         T.IntegerType(), True),
    T.StructField("route_short_name",  T.StringType(), True),
    T.StructField("route_long_name",   T.StringType(), True),
    T.StructField("route_type",        T.IntegerType(), True),
    T.StructField("route_color",       T.StringType(), True),
    T.StructField("route_text_color",  T.StringType(), True),
])

routes = spark.read.option("header", True).schema(schema_routes).csv(f"{BRONZE_PATH}routes.txt")
routes.printSchema()
routes.show(10)

root
 |-- route_id: string (nullable = true)
 |-- agency_id: integer (nullable = true)
 |-- route_short_name: string (nullable = true)
 |-- route_long_name: string (nullable = true)
 |-- route_type: integer (nullable = true)
 |-- route_color: string (nullable = true)
 |-- route_text_color: string (nullable = true)

+--------+---------+----------------+--------------------+----------+-----------+----------------+
|route_id|agency_id|route_short_name|     route_long_name|route_type|route_color|route_text_color|
+--------+---------+----------------+--------------------+----------+-----------+----------------+
| 1012-10|        1|         1012-10|Term. Jd. Britani...|         3|     509E2F|          FFFFFF|
| 1012-21|        1|         1012-21|Term. Jd. Britâni...|         3|     509E2F|          FFFFFF|
| 1014-10|        1|         1014-10|Cptm Perus - Morr...|         3|     509E2F|          FFFFFF|
| 1015-10|        1|         1015-10|Term. Jd. Britâni...|         3|     509E2F|        

In [8]:
# 2. Verificações do arquivo
routes_agency = routes.select("agency_id").distinct().count()
routes_type = routes.select("route_type").distinct().count()
#Divergências entre route_id e route_short_name
route_diff = routes.filter(F.col("route_id") != F.col("route_short_name")).count()
print(f"Unique agency_ids: {routes_agency}, unique route_types: {routes_type}, divergent route id and shortnames: {route_diff}")

Unique agency_ids: 1, unique route_types: 3, divergent route id and shortnames: 0


In [9]:
# 3. Remover tipo de rota que não é ônibus, renomear colunas e normalizar route_long_name
routes_cleaned = (
    routes
    .filter(F.col("route_type") == 3) # mantém somente ônibus
    .drop("agency_id", "route_short_name", "route_type")  # remove colunas inúteis
    .withColumn("route_long_name", F.lower(strip_accents_udf(F.col("route_long_name"))))
    .withColumnRenamed("route_id", "letreiro")
    .withColumnRenamed("route_long_name", "nome_publico")
)

routes_cleaned.show(10, truncate=False)

+--------+-----------------------------------------+-----------+----------------+
|letreiro|nome_publico                             |route_color|route_text_color|
+--------+-----------------------------------------+-----------+----------------+
|1012-10 |term. jd. britania - jd. monte belo      |509E2F     |FFFFFF          |
|1012-21 |term. jd. britania - jd. rosinha         |509E2F     |FFFFFF          |
|1014-10 |cptm perus - morro doce                  |509E2F     |FFFFFF          |
|1015-10 |term. jd. britania - chac. maria trindade|509E2F     |FFFFFF          |
|1016-10 |cem. do horto - shop. center norte       |002F6C     |FFFFFF          |
|1017-10 |perus - conexao vl. iorio                |509E2F     |FFFFFF          |
|1018-10 |vl. rosa - metro santana                 |002F6C     |FFFFFF          |
|1019-10 |sol nascente - term. pirituba            |509E2F     |FFFFFF          |
|1020-10 |perus - conexao vl. iorio                |509E2F     |FFFFFF          |
|1021-10 |cohab 

In [10]:
# 4. Fazer split do nome longo para terminal inicial e final
routes_split = (
    routes_cleaned
    .withColumn("nome_publico_split", F.split(F.col("nome_publico"), "-"))
    .withColumn("terminal_inicial", F.trim(F.col("nome_publico_split").getItem(0)))
    .withColumn("terminal_final", F.trim(F.col("nome_publico_split").getItem(1)))       
    .drop("nome_publico_split")
)
routes_split.show(10, truncate=False)                                           

+--------+-----------------------------------------+-----------+----------------+------------------+--------------------+
|letreiro|nome_publico                             |route_color|route_text_color|terminal_inicial  |terminal_final      |
+--------+-----------------------------------------+-----------+----------------+------------------+--------------------+
|1012-10 |term. jd. britania - jd. monte belo      |509E2F     |FFFFFF          |term. jd. britania|jd. monte belo      |
|1012-21 |term. jd. britania - jd. rosinha         |509E2F     |FFFFFF          |term. jd. britania|jd. rosinha         |
|1014-10 |cptm perus - morro doce                  |509E2F     |FFFFFF          |cptm perus        |morro doce          |
|1015-10 |term. jd. britania - chac. maria trindade|509E2F     |FFFFFF          |term. jd. britania|chac. maria trindade|
|1016-10 |cem. do horto - shop. center norte       |002F6C     |FFFFFF          |cem. do horto     |shop. center norte  |
|1017-10 |perus - conexa

In [11]:
# 5. Reordenar colunas e salvar em silver
routes_silver = (
    routes_split
    .select("letreiro", "nome_publico", "terminal_inicial", "terminal_final", "route_color", "route_text_color")
)
print(routes_silver.count())
routes_silver.dropDuplicates(["letreiro"])
print(routes_silver.select("letreiro").distinct().count())
(
    routes_silver
    .repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}routes")
)
print(f"✅ {SILVER_PATH}routes salvo com sucesso.")

1332
1332
✅ s3a://silver/gtfs/routes salvo com sucesso.


In [12]:
#-----------------------
# 2. STOPS
#-----------------------

In [13]:
# 1. Criar schema e aplicar
schema_stops = T.StructType([
    T.StructField("stop_id",    T.StringType(), True),
    T.StructField("stop_name",  T.StringType(), True),
    T.StructField("stop_desc",  T.StringType(), True),
    T.StructField("stop_lat",   T.DoubleType(), True),
    T.StructField("stop_lon",   T.DoubleType(), True),
])

stops = spark.read.option("header", True).schema(schema_stops).csv("s3a://bronze/gtfs/stops.txt")
stops.printSchema()
stops.show(10)

root
 |-- stop_id: string (nullable = true)
 |-- stop_name: string (nullable = true)
 |-- stop_desc: string (nullable = true)
 |-- stop_lat: double (nullable = true)
 |-- stop_lon: double (nullable = true)

+-------+---------------+---------+----------+----------+
|stop_id|      stop_name|stop_desc|  stop_lat|  stop_lon|
+-------+---------------+---------+----------+----------+
|  18848|       Clínicas|     NULL|-23.554022|-46.671108|
|  18849|  Vila Madalena|     NULL|-23.546498|-46.691141|
|  18850|     Consolação|     NULL|-23.558094|-46.660205|
|  18851|      Conceição|     NULL|-23.635039|-46.641239|
|  18852|      Jabaquara|     NULL|-23.646033|-46.641028|
|  18853|      São Judas|     NULL|-23.625882|-46.640936|
|  18854|          Saúde|     NULL|-23.618245|-46.639139|
|  18855|Praça Da Árvore|     NULL|-23.610583|-46.637918|
|  18856|     Santa Cruz|     NULL|-23.598541|-46.636638|
|  18857|   Vila Mariana|     NULL|-23.589359|-46.634677|
+-------+---------------+---------+----

In [14]:
# 2. Verificação inicial
stops.groupBy("stop_name").count().filter(F.col("count") > 1).orderBy(F.desc("count")).show(10, truncate=False)
stops.count()

+-------------------------------+-----+
|stop_name                      |count|
+-------------------------------+-----+
|Terminal Pirituba              |30   |
|Terminal Campo Limpo           |28   |
|Terminal Grajaú                |26   |
|Terminal Varginha              |24   |
|Rod. Raposo Tavares            |22   |
|Terminal Sto. Amaro, 2         |20   |
|Terminal Vila Nova Cachoeirinha|19   |
|Rod. Anhanguera                |19   |
|Terminal Bandeira              |17   |
|Terminal Princesa Isabel       |17   |
+-------------------------------+-----+
only showing top 10 rows



22050

In [15]:
# 3. Tratamento/normalização
stops_cleaned = (
    stops
    .withColumn("stop_name", F.lower(strip_accents_udf(F.col("stop_name"))))
    .withColumnRenamed("stop_lat", "latitude")
    .withColumnRenamed("stop_lon", "longitude")
    .drop("stop_desc")
)

stops_cleaned.show(10, truncate=False)

+-------+---------------+----------+----------+
|stop_id|stop_name      |latitude  |longitude |
+-------+---------------+----------+----------+
|18848  |clinicas       |-23.554022|-46.671108|
|18849  |vila madalena  |-23.546498|-46.691141|
|18850  |consolacao     |-23.558094|-46.660205|
|18851  |conceicao      |-23.635039|-46.641239|
|18852  |jabaquara      |-23.646033|-46.641028|
|18853  |sao judas      |-23.625882|-46.640936|
|18854  |saude          |-23.618245|-46.639139|
|18855  |praca da arvore|-23.610583|-46.637918|
|18856  |santa cruz     |-23.598541|-46.636638|
|18857  |vila mariana   |-23.589359|-46.634677|
+-------+---------------+----------+----------+
only showing top 10 rows



In [16]:
# 4. Verificação pós-tratamento
# Nulos
print("Sem nome:", stops_cleaned.filter(F.col("stop_name").isNull()).count())
print("Sem coordenadas:", stops_cleaned.filter(F.col("latitude").isNull() | F.col("longitude").isNull()).count())

# Duplicados
dups = stops_cleaned.groupBy("stop_id").count().filter(F.col("count") > 1)
print("Duplicados:", dups.count())

Sem nome: 0
Sem coordenadas: 0
Duplicados: 0


In [17]:
# 5. Salvar em silver
(
    stops_cleaned
    .repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}stops")
)

print(f"✅ {SILVER_PATH}stops salvo com sucesso.")

✅ s3a://silver/gtfs/stops salvo com sucesso.


In [18]:
##-----------------------
# 3. TRIPS
#-----------------------

In [19]:
# 1. Criar schema e aplicar
schema_trips = T.StructType([
    T.StructField("route_id",    T.StringType(), True),
    T.StructField("service_id",  T.StringType(), True),
    T.StructField("trip_id",  T.StringType(), True),
    T.StructField("trip_headsign",   T.StringType(), True),
    T.StructField("direction_id",   T.IntegerType(), True),
    T.StructField("shape_id", T.IntegerType(), True),
])

trips = spark.read.option("header", True).schema(schema_trips).csv("s3a://bronze/gtfs/trips.txt")
trips.printSchema()
trips.show(10)

root
 |-- route_id: string (nullable = true)
 |-- service_id: string (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- trip_headsign: string (nullable = true)
 |-- direction_id: integer (nullable = true)
 |-- shape_id: integer (nullable = true)

+--------+----------+---------+--------------------+------------+--------+
|route_id|service_id|  trip_id|       trip_headsign|direction_id|shape_id|
+--------+----------+---------+--------------------+------------+--------+
| 1012-10|       USD|1012-10-0|      Jd. Monte Belo|           0|   84609|
| 1012-21|       U__|1012-21-0|         Jd. Rosinha|           0|   81195|
| 1014-10|       U__|1014-10-0|          Morro Doce|           0|   84061|
| 1015-10|       USD|1015-10-0|Chác. Maria Trindade|           0|   81148|
| 1016-10|       USD|1016-10-0|  Shop. Center Norte|           0|   84176|
| 1016-10|       USD|1016-10-1|       Cem. Do Horto|           1|   84540|
| 1017-10|       USD|1017-10-0|   Conexão Vl. Iório|           0|  

In [20]:
#2. Normalizar e renomear
trips_cleaned = (
    trips
    .withColumn("trip_headsign", F.lower(strip_accents_udf(F.col("trip_headsign"))))
    .withColumnRenamed("route_id", "letreiro")
    .withColumnRenamed("service_id", "dias_funcionamento")
    .withColumnRenamed("trip_headsign", "destino")
    .withColumnRenamed("direction_id", "sentido")
    # seleciona apenas campos úteis
    .select("trip_id", "letreiro", "sentido", "destino", "shape_id", "dias_funcionamento")
    .dropDuplicates(["trip_id"])
)
trips_cleaned.show(10, False)

+---------+--------+-------+--------------------+--------+------------------+
|trip_id  |letreiro|sentido|destino             |shape_id|dias_funcionamento|
+---------+--------+-------+--------------------+--------+------------------+
|1012-10-0|1012-10 |0      |jd. monte belo      |84609   |USD               |
|1012-21-0|1012-21 |0      |jd. rosinha         |81195   |U__               |
|1014-10-0|1014-10 |0      |morro doce          |84061   |U__               |
|1015-10-0|1015-10 |0      |chac. maria trindade|81148   |USD               |
|1016-10-0|1016-10 |0      |shop. center norte  |84176   |USD               |
|1016-10-1|1016-10 |1      |cem. do horto       |84540   |USD               |
|1017-10-0|1017-10 |0      |conexao vl. iorio   |72355   |USD               |
|1017-10-1|1017-10 |1      |perus               |72356   |USD               |
|1018-10-0|1018-10 |0      |metro santana       |71029   |USD               |
|1018-10-1|1018-10 |1      |vl. rosa            |71030   |USD   

In [21]:
# 3. Verificação pós-tratamento
print("Total original:", trips.count())
print("Após limpeza/dedup:", trips_cleaned.count())

# checar destinos nulos
trips_cleaned.filter(F.col("destino").isNull()).show(5, truncate=False)

Total original: 2252
Após limpeza/dedup: 2252
+-------+--------+-------+-------+--------+------------------+
|trip_id|letreiro|sentido|destino|shape_id|dias_funcionamento|
+-------+--------+-------+-------+--------+------------------+
+-------+--------+-------+-------+--------+------------------+



In [22]:
# 4. Salvar em silver
(
    trips_cleaned
    .repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}trips")
)

print(f"✅ {SILVER_PATH}trips salvo com sucesso.")

✅ s3a://silver/gtfs/trips salvo com sucesso.


In [23]:
#-----------------------
# 4. STOP_TIMES
#-----------------------

In [24]:
# 1. Schema e aplicar

schema_stop_times = T.StructType([
    T.StructField("trip_id",         T.StringType(), True),
    T.StructField("arrival_time",    T.StringType(), True),
    T.StructField("departure_time",  T.StringType(), True),
    T.StructField("stop_id",         T.IntegerType(), True),
    T.StructField("stop_sequence",   T.IntegerType(), True),
])

stop_times = spark.read.option("header", True).schema(schema_stop_times).csv(f"{BRONZE_PATH}stop_times.txt")
stop_times.printSchema()
stop_times.show(10, False)

root
 |-- trip_id: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stop_id: integer (nullable = true)
 |-- stop_sequence: integer (nullable = true)

+---------+------------+--------------+--------+-------------+
|trip_id  |arrival_time|departure_time|stop_id |stop_sequence|
+---------+------------+--------------+--------+-------------+
|1012-10-0|07:00:00    |07:00:00      |301790  |1            |
|1012-10-0|07:01:18    |07:01:18      |301764  |2            |
|1012-10-0|07:02:36    |07:02:36      |301724  |3            |
|1012-10-0|07:03:54    |07:03:54      |301730  |4            |
|1012-10-0|07:05:12    |07:05:12      |30003042|5            |
|1012-10-0|07:06:30    |07:06:30      |30003045|6            |
|1012-10-0|07:07:48    |07:07:48      |30003044|7            |
|1012-10-0|07:09:06    |07:09:06      |30003043|8            |
|1012-10-0|07:10:24    |07:10:24      |30003013|9            |
|1012-10-0|07:11:42    

In [25]:
# 2. Verificação dos dados

uniquetrips = stop_times.select("trip_id").distinct().count()
uniquestops = stop_times.select("stop_id").distinct().count()
print(f"{uniquetrips} viagens únicas e {uniquestops} paradas únicas.")

2252 viagens únicas e 22050 paradas únicas.


In [26]:
# 3. Renomear colunas

stop_times_cleaned = (
    stop_times
    .withColumnRenamed("arrival_time", "hora_chegada")
    .withColumnRenamed("departure_time", "hora_partida")
    .withColumnRenamed("stop_sequence", "sequencia_parada")
)

In [27]:
# 4. Verificações pós-tratamento

print("Total de registros:", stop_times_cleaned.count())
stop_times_cleaned.select("trip_id").distinct().count()

# Sequência mínima e máxima
stop_times_cleaned.agg(
    F.min("sequencia_parada").alias("seq_min"),
    F.max("sequencia_parada").alias("seq_max")
).show()

# Nulls
stop_times_cleaned.filter(F.col("stop_id").isNull()).show(5)

stop_times_cleaned.show(5, False)

Total de registros: 98306
+-------+-------+
|seq_min|seq_max|
+-------+-------+
|      1|    146|
+-------+-------+

+-------+------------+------------+-------+----------------+
|trip_id|hora_chegada|hora_partida|stop_id|sequencia_parada|
+-------+------------+------------+-------+----------------+
+-------+------------+------------+-------+----------------+

+---------+------------+------------+--------+----------------+
|trip_id  |hora_chegada|hora_partida|stop_id |sequencia_parada|
+---------+------------+------------+--------+----------------+
|1012-10-0|07:00:00    |07:00:00    |301790  |1               |
|1012-10-0|07:01:18    |07:01:18    |301764  |2               |
|1012-10-0|07:02:36    |07:02:36    |301724  |3               |
|1012-10-0|07:03:54    |07:03:54    |301730  |4               |
|1012-10-0|07:05:12    |07:05:12    |30003042|5               |
+---------+------------+------------+--------+----------------+
only showing top 5 rows



In [28]:
# 5. Salvar em silver

(
    stop_times_cleaned
    .repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}stop_times")
)

print(f"✅ {SILVER_PATH}stop_times salvo com sucesso.")

✅ s3a://silver/gtfs/stop_times salvo com sucesso.


In [29]:
#-----------------------
# 5. SHAPES
#-----------------------

In [30]:
schema_shapes = T.StructType([
    T.StructField("shape_id",             T.IntegerType(), True),
    T.StructField("shape_pt_lat",         T.DoubleType(), True),
    T.StructField("shape_pt_lon",         T.DoubleType(), True),
    T.StructField("shape_pt_sequence",    T.IntegerType(), True),
    T.StructField("shape_dist_traveled",  T.FloatType(), True),
])

shapes = spark.read.option("header", True).csv(f"{BRONZE_PATH}shapes.txt")
shapes.show(10, False)

+--------+------------+------------+-----------------+-------------------+
|shape_id|shape_pt_lat|shape_pt_lon|shape_pt_sequence|shape_dist_traveled|
+--------+------------+------------+-----------------+-------------------+
|84609   |-23.432024  |-46.787121  |1                |0                  |
|84609   |-23.431827  |-46.787153  |2                |22.061337          |
|84609   |-23.431764  |-46.787163  |3                |29.113285          |
|84609   |-23.431614  |-46.787187  |4                |45.906006          |
|84609   |-23.431590  |-46.787191  |5                |48.596806          |
|84609   |-23.431554  |-46.787196  |6                |52.615639          |
|84609   |-23.431512  |-46.787202  |7                |57.306896          |
|84609   |-23.431438  |-46.787210  |8                |65.543129          |
|84609   |-23.431388  |-46.787215  |9                |71.10347           |
|84609   |-23.431378  |-46.787218  |10               |72.248848          |
+--------+------------+--

In [31]:
shapes_cleaned = (
    shapes
    .withColumnRenamed("shape_pt_lat", "latitude")
    .withColumnRenamed("shape_pt_lon", "longitude")
    .withColumnRenamed("shape_pt_sequence", "sequencia_ponto")
    .withColumnRenamed("shape_dist_traveled", "distancia_acumulada")
    # remove pontos nulos
    .filter(F.col("latitude").isNotNull() & F.col("longitude").isNotNull())
    .withColumn("distancia_acumulada", F.round(F.col("distancia_acumulada"), 2))
)

shapes_cleaned.show(10, False)

+--------+----------+----------+---------------+-------------------+
|shape_id|latitude  |longitude |sequencia_ponto|distancia_acumulada|
+--------+----------+----------+---------------+-------------------+
|84609   |-23.432024|-46.787121|1              |0.0                |
|84609   |-23.431827|-46.787153|2              |22.06              |
|84609   |-23.431764|-46.787163|3              |29.11              |
|84609   |-23.431614|-46.787187|4              |45.91              |
|84609   |-23.431590|-46.787191|5              |48.6               |
|84609   |-23.431554|-46.787196|6              |52.62              |
|84609   |-23.431512|-46.787202|7              |57.31              |
|84609   |-23.431438|-46.787210|8              |65.54              |
|84609   |-23.431388|-46.787215|9              |71.1               |
|84609   |-23.431378|-46.787218|10             |72.25              |
+--------+----------+----------+---------------+-------------------+
only showing top 10 rows



In [32]:
# 3. Verificações pós-tratamento

print("Total de registros:", shapes.count())
print("Após limpeza:", shapes_cleaned.count())

# Quantos shapes únicos existem
print("Shapes únicos:", shapes_cleaned.select("shape_id").distinct().count())

# Amostra de sequências por shape
shapes_cleaned.groupBy("shape_id").agg(
    F.min("sequencia_ponto").alias("seq_min"),
    F.max("sequencia_ponto").alias("seq_max"),
    F.count("*").alias("qtd_pontos")
).orderBy(F.desc("qtd_pontos")).show(5)

Total de registros: 1134633
Após limpeza: 1134633
Shapes únicos: 2252
+--------+-------+-------+----------+
|shape_id|seq_min|seq_max|qtd_pontos|
+--------+-------+-------+----------+
|   71508|      1|    999|      1824|
|   84700|      1|    999|      1713|
|   80173|      1|    999|      1625|
|   80483|      1|    999|      1545|
|   73012|      1|    999|      1469|
+--------+-------+-------+----------+
only showing top 5 rows



In [33]:
# 4. Escrita na silver
(
    shapes_cleaned
    .repartition(1)
    .write.format("delta")
    .mode("overwrite")
    .save(f"{SILVER_PATH}shapes")
)

print(f"✅ {SILVER_PATH}shapes salvo com sucesso.")

✅ s3a://silver/gtfs/shapes salvo com sucesso.


In [34]:
spark.stop()