In [0]:
from pyspark.sql import SparkSession
import os

In [0]:
# Monthly periods to ingest, each written as 'YYYYMM'.
dates_to_load = [
    "202301",
    "202302",
    "202303",
    "202304",
    "202305",
]

# Directory that holds the yellow_tripdata_YYYY-MM.<format> files.
# NOTE(review): this is an absolute, user-specific workspace path; consider
# making it configurable before sharing the notebook.
base_path = "/Workspace/Users/leonardocapriglione@gmail.com/ifood-case/Case-iFood/arquivos/"

In [0]:
def read_files_by_dates(base_path: str, dates: list, file_format: str = "parquet", options: dict = None):
    """
    Reads files with the pattern yellow_tripdata_YYYY-MM.{format} based on a list of dates in the format YYYYMM.

    Parameters:
    - base_path (str): Base directory where the files are located.
    - dates (list): List of dates in 'YYYYMM' format, e.g., ['202301', '202302'].
      A single string such as '202301' is also accepted and treated as a
      one-element list.
    - file_format (str): Format passed to the Spark reader and used as the
      file extension (default: 'parquet').
    - options (dict): Additional read options as a dictionary.

    Returns:
    - The DataFrame produced by a single Spark ``load`` call over all the
      selected file paths.

    Raises:
    - TypeError: If an entry of ``dates`` is not a string.
    - ValueError: If ``dates`` is empty, or an entry is not exactly six ASCII
      digits with a month between 01 and 12.
    """
    # A bare string would otherwise be iterated character by character and
    # produce nonsensical file names.
    if isinstance(dates, str):
        dates = [dates]

    # Build and validate every path before touching Spark, so bad input fails
    # fast with a clear message instead of an obscure reader error.
    files_to_read = []
    for date in dates:
        if not isinstance(date, str):
            raise TypeError(
                f"dates entries must be strings in 'YYYYMM' format, got {type(date).__name__}: {date!r}"
            )
        if not (len(date) == 6 and date.isascii() and date.isdigit()):
            raise ValueError(f"invalid date {date!r}: expected six digits in 'YYYYMM' format")

        year, month = date[:4], date[4:]
        if not 1 <= int(month) <= 12:
            raise ValueError(f"invalid date {date!r}: month must be between 01 and 12")

        file_name = f"yellow_tripdata_{year}-{month}.{file_format}"
        files_to_read.append(os.path.join(base_path, file_name))

    if not files_to_read:
        raise ValueError("dates must contain at least one 'YYYYMM' entry")

    spark = SparkSession.builder.getOrCreate()

    # options=None means "no extra reader options".
    return spark.read.format(file_format).options(**(options or {})).load(files_to_read)


In [0]:
# Load every configured monthly Parquet file through one reader call.
df = read_files_by_dates(
    base_path=base_path,
    dates=dates_to_load,
    file_format="parquet",
)

# Show the loaded data in the notebook output.
# NOTE(review): DataFrame.display() is presumably supplied by the Databricks
# runtime rather than plain PySpark; confirm the target environment.
df.display()