In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import lit
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

spark_conf = SparkConf()
spark_conf.set("spark.executor.memory", "4g")
spark_conf.set("spark.driver.memory", "2g")
spark_conf.set("spark.network.timeout", "600s")
spark_conf.set("spark.executor.instances", "4")
spark_conf.set("spark.executor.cores", "4")
spark_conf.set("spark.default.parallelism", "6")
spark_conf.set("spark.sql.shuffle.partitions", "6")
spark_conf.set("spark.sql.parquet.enableVectorizedReader", "true")
spark_conf.set("spark.jars", "/tmp/postgresql-42.2.24.jar")
spark_conf.set("spark.sql.statistics.histogram.enabled", "true")
spark_conf.set("spark.sql.sources.partitionOverwriteMode", "DYNAMIC")

spark = SparkSession.builder.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").config(conf=spark_conf).getOrCreate()

# Cache dos DataFrames intermediários
spark.conf.set("spark.sql.legacy.cache.enabled", "true")

def process_month(year, month):
    year_str = str(year)
    month_str = str(month).zfill(2)  # Adiciona zero à esquerda se necessário
        
    print(f"Executing year: {year_str}-{month_str}")
    
    yellow_df = spark.read.parquet(f'data/raw/yellow/{year_str}/{month_str}/*')
    green_df = spark.read.parquet(f'data/raw/green/{year_str}/{month_str}/*')
        
    # Renomear colunas do DataFrame yellow
    yellow_df = yellow_df.withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
                         .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')
        
    yellow_df = yellow_df.withColumn('ehail_fee', lit(0))
    yellow_df = yellow_df.withColumn('trip_type', lit(0))

    yellow_df = yellow_df.withColumn('source',lit('yellow'))
        
    # Renomear colunas do DataFrame green
    green_df = green_df.withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime') \
                       .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')
        
    green_df = green_df.withColumn('airport_fee', lit(0))

    green_df = green_df.withColumn('source',lit('green'))
        
    # Merge yellow e green dataset
    final_df = yellow_df.unionByName(green_df).drop_duplicates()
        
    final_df = final_df.where("(trip_distance <= 30 and trip_distance > 0) AND (passenger_count > 0 and passenger_count < 5) AND (pickup_datetime < dropoff_datetime)")

    final_df = final_df.withColumn("year", lit(year)).withColumn("month", lit(month))
    
    mode = "overwrite"
    
    # Ação de gravação em parquet
    final_df = final_df.repartition(5)
    final_df.write.mode(mode).option("compression", "snappy").partitionBy("year", "month").parquet("data/curated/trip_data")
    
    print(f"Finished year: {year_str}-{month_str}")
    yellow_df.unpersist()
    green_df.unpersist()

executor = ThreadPoolExecutor(max_workers=12) 
futures = []

for year in [2018, 2019, 2020, 2021, 2022]:
    for month in range(1, 13):
        future = executor.submit(process_month, year, month)
        futures.append(future)

# Aguardar a conclusão de todas as tarefas
for future in futures:
    future.result()

Executing year: 2018-01
Executing year: 2018-02
Executing year: 2018-03
Executing year: 2018-04
Executing year: 2018-05
Executing year: 2018-06
Executing year: 2018-07
Executing year: 2018-08
Executing year: 2018-09
Executing year: 2018-10
Executing year: 2018-11
Executing year: 2018-12
Finished year: 2018-12
Executing year: 2019-01
Finished year: 2018-10
Executing year: 2019-02
Finished year: 2018-06
Executing year: 2019-03
Finished year: 2018-03
Executing year: 2019-04
Finished year: 2018-09
Executing year: 2019-05
Finished year: 2018-11
Executing year: 2019-06
Finished year: 2018-01
Executing year: 2019-07
Finished year: 2018-04
Executing year: 2019-08
Finished year: 2018-05
Executing year: 2019-09
Finished year: 2018-02
Executing year: 2019-10
Finished year: 2018-07
Executing year: 2019-11
Finished year: 2018-08
Executing year: 2019-12
Finished year: 2019-01
Executing year: 2020-01
Finished year: 2019-09
Executing year: 2020-02
Finished year: 2019-05
Executing year: 2020-03
Finishe

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERRO

ConnectionRefusedError: [Errno 111] Connection refused