In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType

# Landing zone: directory whose Parquet files are listed and read below.
landing_path = "/Volumes/workspace/default/ifood_landing_zone/"

# Consumption zone path. NOTE(review): not referenced by the code visible in
# this cell (the result is saved with saveAsTable instead) -- confirm whether
# another cell still needs it.
consumption_path = "/Volumes/workspace/default/ifood_consumption_zone/"

# --- 1. Read and process ---
# Collect the path of every Parquet file currently in the landing zone.
files = [f.path for f in dbutils.fs.ls(landing_path) if f.path.endswith(".parquet")]
if not files:
    # Without this guard final_df would stay None and the filter below would
    # fail with an opaque "'NoneType' object has no attribute 'filter'".
    raise FileNotFoundError(f"Nenhum arquivo .parquet encontrado em {landing_path}")

final_df = None
for file in files:
    print(f"📂 Lendo {file}...")
    # Each file is loaded and normalised on its own before being unioned.
    # NOTE(review): presumably because column casing/types differ between
    # monthly files -- confirm against the source data.
    df = spark.read.format("parquet").load(file)

    # Lower-case every column name in a single projection instead of chaining
    # one withColumnRenamed call per column.
    df = df.toDF(*[name.lower() for name in df.columns])

    # Keep only the columns of interest, with explicit types and final names.
    df = df.select(
        col("vendorid").cast(IntegerType()).alias("vendor_id"),
        col("tpep_pickup_datetime").alias("pickup_datetime"),
        col("tpep_dropoff_datetime").alias("dropoff_datetime"),
        col("passenger_count").cast(IntegerType()).alias("passenger_count"),
        col("total_amount").cast(DoubleType()).alias("total_amount"),
    )

    # Accumulate the per-file frames, matching columns by name.
    final_df = df if final_df is None else final_df.unionByName(df)

# Keep trips with at least one passenger, a non-negative total and a known
# vendor. Rows where passenger_count or total_amount is NULL are dropped too,
# because a comparison against NULL does not evaluate to true.
cleaned_df = final_df.filter(
    (col("passenger_count") > 0)
    & (col("total_amount") >= 0)
    & col("vendor_id").isNotNull()
)

# --- 2. Schema and table creation ---

# Make sure the target schema exists before writing; no explicit location is
# given, so the schema is created wherever the current catalog places it.
print("⚙️ Criando schema (se não existir)...")
spark.sql("CREATE SCHEMA IF NOT EXISTS ifood_challenge")

# Persist the cleaned DataFrame as a managed Delta table: no path is passed,
# so file placement is left to the catalog. Existing data and schema of the
# table are replaced on every run.
print("💾 Salvando dados como tabela GERENCIADA...")
(
    cleaned_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ifood_challenge.yellow_taxi_trips")
)

print("✅ Tabela gerenciada 'ifood_challenge.yellow_taxi_trips' criada com sucesso!")


# --- 3. Verification ---
# Read back a 10-row sample from the table that was just written and render it
# in the notebook.
print("\n📊 Verificando os dados da tabela criada...")
preview_df = spark.sql("SELECT * FROM ifood_challenge.yellow_taxi_trips LIMIT 10")
display(preview_df)


📂 Lendo dbfs:/Volumes/workspace/default/ifood_landing_zone/yellow_tripdata_2023-01.parquet...
📂 Lendo dbfs:/Volumes/workspace/default/ifood_landing_zone/yellow_tripdata_2023-02.parquet...
📂 Lendo dbfs:/Volumes/workspace/default/ifood_landing_zone/yellow_tripdata_2023-03.parquet...
📂 Lendo dbfs:/Volumes/workspace/default/ifood_landing_zone/yellow_tripdata_2023-04.parquet...
📂 Lendo dbfs:/Volumes/workspace/default/ifood_landing_zone/yellow_tripdata_2023-05.parquet...
⚙️ Criando schema (se não existir)...
💾 Salvando dados como tabela GERENCIADA...
✅ Tabela gerenciada 'ifood_challenge.yellow_taxi_trips' criada com sucesso!

📊 Verificando os dados da tabela criada...


vendor_id,pickup_datetime,dropoff_datetime,passenger_count,total_amount
1,2023-02-01T00:32:53.000,2023-02-01T00:34:34.000,2,9.4
2,2023-02-01T00:35:16.000,2023-02-01T00:35:30.000,1,5.5
2,2023-02-01T00:12:28.000,2023-02-01T00:25:46.000,1,25.3
1,2023-02-01T00:52:40.000,2023-02-01T01:07:18.000,1,32.25
1,2023-02-01T00:12:39.000,2023-02-01T00:40:36.000,1,50.0
1,2023-02-01T00:56:53.000,2023-02-01T01:00:37.000,1,14.64
2,2023-02-01T00:20:40.000,2023-02-01T00:33:56.000,1,44.12
2,2023-02-01T00:33:51.000,2023-02-01T00:37:34.000,1,12.42
2,2023-02-01T01:00:45.000,2023-02-01T01:06:00.000,1,14.64
2,2023-02-01T00:10:48.000,2023-02-01T00:18:09.000,1,16.0
