In [0]:
# Load the 2023-01 yellow_tripdata Parquet file from the landing bucket and render it.
df = spark.read.format("parquet").load("s3a://landing-layer-ifood/yellow_tripdata_2023-01.parquet")
df.display()

In [0]:
# Load the 2023-02 yellow_tripdata Parquet file from the landing bucket and render it.
df = spark.read.format("parquet").load("s3a://landing-layer-ifood/yellow_tripdata_2023-02.parquet")
df.display()

In [0]:
# Load the 2023-03 yellow_tripdata Parquet file from the landing bucket and render it.
df = spark.read.format("parquet").load("s3a://landing-layer-ifood/yellow_tripdata_2023-03.parquet")
df.display()

In [0]:
# Load the 2023-04 yellow_tripdata Parquet file from the landing bucket and render it.
df = spark.read.format("parquet").load("s3a://landing-layer-ifood/yellow_tripdata_2023-04.parquet")
df.display()

In [0]:
# Load the 2023-05 yellow_tripdata Parquet file from the landing bucket and render it.
df = spark.read.format("parquet").load("s3a://landing-layer-ifood/yellow_tripdata_2023-05.parquet")
df.display()

In [0]:
import os
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, input_file_name
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, LongType
from functools import reduce

In [0]:
# Months to ingest, as 'YYYYMM' strings: January through May of 2023.
datas = [f"2023{month:02d}" for month in range(1, 6)]

# S3 bucket that holds the raw monthly trip files.
bucket = "landing-layer-ifood"

In [0]:

def read_files_by_dates_s3_uc_select_columns(bucket_name: str, dates: list, file_format: str = "parquet", options: dict = None):
    """
    Read monthly trip files from an S3 bucket, keep a fixed set of columns,
    and union everything into a single DataFrame.

    One file is read per entry in ``dates``, from
    ``s3a://<bucket_name>/yellow_tripdata_<YYYY>-<MM>.<file_format>``.
    A file that cannot be read (or that lacks one of the selected columns) is
    reported with ``print`` and skipped, so one bad month does not abort the
    whole load.

    Parameters:
    - bucket_name: S3 bucket name (e.g. 'landing-layer-ifood')
    - dates: list of months as 'YYYYMM' strings (e.g. ['202301', '202302'])
    - file_format: used both as the Spark reader format and as the file
      extension (default 'parquet')
    - options: extra options forwarded to the Spark reader

    Returns:
    - DataFrame with the columns VendorID, passenger_count, total_amount,
      tpep_pickup_datetime and tpep_dropoff_datetime, plus a source_file
      column holding the month as 'YYYY-MM'. When no file could be loaded,
      an empty DataFrame with those same column names is returned.
    """
    spark = SparkSession.builder.getOrCreate()
    reader_options = options or {}

    # Column name -> Spark SQL type. The names drive the select(); the types
    # are only used to build the schema of the empty fallback DataFrame.
    # NOTE(review): these types are assumed, not read from the files —
    # confirm against the actual Parquet schema.
    column_types = {
        "VendorID": "bigint",
        "passenger_count": "double",
        "total_amount": "double",
        "tpep_pickup_datetime": "timestamp",
        "tpep_dropoff_datetime": "timestamp",
    }
    cols_to_select = list(column_types)

    dfs = []
    for date in dates:
        # Built once and used for both the path and the source_file label, so
        # the label always names the file that was actually read.
        year_month = f"{date[:4]}-{date[4:]}"
        path = f"s3a://{bucket_name}/yellow_tripdata_{year_month}.{file_format}"
        try:
            df_month = (
                spark.read.format(file_format)
                .options(**reader_options)
                .load(path)
                .select(*cols_to_select)
                .withColumn("source_file", lit(year_month))
            )
        except Exception as e:
            # Deliberate best-effort: report the failing file and move on.
            print(f"❌ Erro ao ler o arquivo {path}: {e}")
            continue

        dfs.append(df_month)
        print(f"✅ Arquivo carregado e colunas selecionadas: {path}")

    if not dfs:
        # createDataFrame cannot infer a schema from an empty list (it raises),
        # so the fallback must carry an explicit schema.
        empty_schema = ", ".join(
            [f"{name} {dtype}" for name, dtype in column_types.items()]
            + ["source_file string"]
        )
        return spark.createDataFrame([], schema=empty_schema)

    return reduce(
        lambda left, right: left.unionByName(right, allowMissingColumns=True),
        dfs,
    )

In [0]:
# Load every configured month from the landing bucket into one DataFrame and render it.
df = read_files_by_dates_s3_uc_select_columns(bucket_name=bucket, dates=datas)
df.display()