# Teste do Auto Loader com arquivos CDC

Este exemplo utiliza o Auto Loader para ler, de forma incremental, arquivos Parquet do caminho `cdc/yellow` no container `taxi` e carregá-los na tabela Delta `lab.taxi.yellow_taxi_autoloader`.

In [0]:
%sql
-- Start from a clean slate: remove the target table if a previous run left it behind.
DROP TABLE IF EXISTS lab.taxi.yellow_taxi_autoloader 

In [0]:
%sql
-- Target Delta table for the Auto Loader ingestion test.
-- The column list mirrors the explicit read schema used by the streaming cell
-- of this notebook, so names and types must be kept in sync with it.
CREATE TABLE lab.taxi.yellow_taxi_autoloader (
  -- Trip columns read from the source Parquet files
  tpep_pickup_datetime TIMESTAMP,
  tpep_dropoff_datetime TIMESTAMP,
  passenger_count DOUBLE,
  trip_distance DOUBLE,
  RatecodeID DOUBLE,
  store_and_fwd_flag STRING,
  PULocationID BIGINT,
  DOLocationID BIGINT,
  payment_type BIGINT,
  fare_amount DOUBLE,
  extra DOUBLE,
  mta_tax DOUBLE,
  tip_amount DOUBLE,
  tolls_amount DOUBLE,
  improvement_surcharge DOUBLE,
  total_amount DOUBLE,
  congestion_surcharge DOUBLE,
  airport_fee DOUBLE,
  -- Columns populated by the ingestion cell rather than taken from the files:
  -- pickup_year_month is derived from tpep_pickup_datetime (partition key),
  -- ingestion_timestamp records when the row was loaded.
  pickup_year_month STRING,
  ingestion_timestamp TIMESTAMP
)
USING DELTA
PARTITIONED BY (pickup_year_month)

In [0]:
from pyspark.sql.functions import trunc, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, DoubleType, StringType, LongType

cdc_path = "abfss://taxi@hfotaxinyc.dfs.core.windows.net/cdc/yellow"

yellow_schema = StructType([
    StructField("tpep_pickup_datetime", TimestampType(), True),
    StructField("tpep_dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", DoubleType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("RatecodeID", DoubleType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("PULocationID", LongType(), True),
    StructField("DOLocationID", LongType(), True),
    StructField("payment_type", LongType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("extra", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("improvement_surcharge", DoubleType(), True),
    StructField("total_amount", DoubleType(), True),
    StructField("congestion_surcharge", DoubleType(), True),
    StructField("airport_fee", DoubleType(), True),
    StructField("pickup_year_month", StringType(), True),
    StructField("ingestion_timestamp", TimestampType(), True)
])

df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "parquet")
      .schema(yellow_schema)
      .load(cdc_path)
)

df = df.withColumn("pickup_year_month", trunc("tpep_pickup_datetime", "MM")) \
       .withColumn("ingestion_timestamp", current_timestamp())


(df.writeStream
 .format("delta")
 .option("checkpointLocation", "/tmp/checkpoints/yellow_autoloader")
 .option("mergeSchema", "true")
 .partitionBy("pickup_year_month")
 .table("lab.taxi.yellow_autoloader")
)

In [0]:
%sql
SELECT COUNT(*) FROM lab.taxi.yellow_autoloader LIMIT 5
--16330086
--17750975
--20942282
--22713075

In [0]:
# Delete the streaming checkpoint directory and everything under it.
# NOTE(review): Auto Loader keeps its record of already-processed files in the
# checkpoint, so after this the next run is expected to re-ingest every file
# under the source path - confirm the target table is recreated as well.
stale_checkpoint = "/tmp/checkpoints/yellow_autoloader"
dbutils.fs.rm(stale_checkpoint, recurse=True)