## COPY INTO
`COPY INTO` is an idempotent SQL command: files that have already been loaded are skipped when the command is run again.


In [0]:
%sql
-- Catalog used by this lab, with an explicit managed storage location.
CREATE CATALOG IF NOT EXISTS lab
    MANAGED LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/lab';

-- Schema that holds the taxi tables; it is given the same managed location
-- as the catalog above.
CREATE SCHEMA IF NOT EXISTS lab.taxi
    MANAGED LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/lab';

In [0]:
%sql


-- Remove any previous copy of the demo table so the CREATE TABLE in the next
-- cell starts from an empty table.
DROP TABLE IF EXISTS lab.taxi.yellow_copy_into;



In [0]:
%sql
-- Target Delta table for the COPY INTO demos in this notebook.
-- pickup_year_month is the partition column; the loading queries derive it
-- from tpep_pickup_datetime using the 'yyyy-MM' format.
CREATE TABLE IF NOT EXISTS lab.taxi.yellow_copy_into (
    VendorID               INT,
    tpep_pickup_datetime   TIMESTAMP_NTZ,
    tpep_dropoff_datetime  TIMESTAMP_NTZ,
    passenger_count        BIGINT,
    trip_distance          DOUBLE,
    RatecodeID             BIGINT,
    store_and_fwd_flag     STRING,
    PULocationID           INT,
    DOLocationID           INT,
    payment_type           BIGINT,
    fare_amount            DOUBLE,
    extra                  DOUBLE,
    mta_tax                DOUBLE,
    tip_amount             DOUBLE,
    tolls_amount           DOUBLE,
    improvement_surcharge  DOUBLE,
    total_amount           DOUBLE,
    congestion_surcharge   DOUBLE,
    airport_fee            DOUBLE,
    pickup_year_month      STRING
)
USING DELTA
PARTITIONED BY (pickup_year_month);


In [0]:
%sql
-- Load the December 2024 yellow-taxi Parquet file into the Delta table.
-- The subquery keeps every source column, adds the _metadata column, and
-- derives the partition value pickup_year_month ('yyyy-MM') from the pickup
-- timestamp. mergeSchema is enabled in COPY_OPTIONS for this load.
COPY INTO lab.taxi.yellow_copy_into
FROM (
    SELECT
        *,
        _metadata,
        date_format(tpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month
    FROM 'dbfs:/Volumes/raw/taxi/yellow_taxi/2024/yellow_tripdata_2024-12.parquet'
)
FILEFORMAT = PARQUET
FORMAT_OPTIONS ('inferSchema' = 'true')
COPY_OPTIONS ('mergeSchema' = 'true');

In [0]:
%sql
-- Row count of the target table; compare it before and after re-running the
-- COPY INTO cell to check that the load is idempotent.
SELECT
    COUNT(*)
FROM lab.taxi.yellow_copy_into

In [0]:
%sql
-- Quick preview of five rows from the loaded table (no ORDER BY, so which
-- rows come back is not guaranteed).
SELECT
    *
FROM lab.taxi.yellow_copy_into
LIMIT 5

Test idempotency by running it again.

## COPY INTO with mergeSchema = true

In 2023, the column airport_fee was named Airport_fee. The next cells run COPY INTO first without and then with mergeSchema to compare the behavior.

In [0]:
%sql
-- Load the January 2025 file with schema merging turned OFF
-- (COPY_OPTIONS mergeSchema = 'false'). This is the "without mergeSchema"
-- half of the comparison; the next cell repeats the load with it enabled.
COPY INTO lab.taxi.yellow_copy_into
FROM (
    SELECT
        *,
        _metadata,
        date_format(tpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month
    FROM 'dbfs:/Volumes/raw/taxi/yellow_taxi/2025/yellow_tripdata_2025-01.parquet'
)
FILEFORMAT = PARQUET
FORMAT_OPTIONS ('inferSchema' = 'true')
COPY_OPTIONS ('mergeSchema' = 'false');

In [0]:
%sql
-- Load the January 2025 file with schema merging turned ON
-- (COPY_OPTIONS mergeSchema = 'true'). This is the "with mergeSchema" half
-- of the comparison against the previous cell.
COPY INTO lab.taxi.yellow_copy_into
FROM (
    SELECT
        *,
        _metadata,
        date_format(tpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month
    FROM 'dbfs:/Volumes/raw/taxi/yellow_taxi/2025/yellow_tripdata_2025-01.parquet'
)
FILEFORMAT = PARQUET
FORMAT_OPTIONS ('inferSchema' = 'true')
COPY_OPTIONS ('mergeSchema' = 'true');

In [0]:
%sql
-- Sample cbd_congestion_fee from each of the two loaded months so the values
-- for 2024-12 and 2025-01 can be compared side by side.
-- Each branch is limited to five rows; without ORDER BY the sampled rows are
-- arbitrary.
(
    SELECT cbd_congestion_fee
    FROM lab.taxi.yellow_copy_into
    WHERE pickup_year_month = '2024-12'
    LIMIT 5
)
UNION ALL
(
    SELECT cbd_congestion_fee
    FROM lab.taxi.yellow_copy_into
    WHERE pickup_year_month = '2025-01'
    LIMIT 5
)

## Testing data type problem solving with COPY INTO

In [0]:

%sql
-- Show the current column list of the table after the COPY INTO runs above.
DESCRIBE TABLE lab.taxi.yellow_copy_into;


In [0]:
%sql
-- List the operations recorded in the Delta history of the table.
DESCRIBE HISTORY lab.taxi.yellow_copy_into

In [0]:
%sql
-- Second demo table: same column names as lab.taxi.yellow_copy_into, but with
-- narrower integer types and fixed-precision DECIMALs for the amount columns.
-- The COPY INTO cells that follow cast the source columns to these types.
CREATE TABLE IF NOT EXISTS lab.taxi.yellow_taxi_data_type_problem (
    VendorID               TINYINT,
    tpep_pickup_datetime   TIMESTAMP_NTZ,
    tpep_dropoff_datetime  TIMESTAMP_NTZ,
    passenger_count        TINYINT,
    trip_distance          DOUBLE,
    RatecodeID             TINYINT,
    store_and_fwd_flag     STRING,
    PULocationID           SMALLINT,
    DOLocationID           SMALLINT,
    payment_type           TINYINT,
    fare_amount            DECIMAL(10, 2),
    extra                  DECIMAL(7, 2),
    mta_tax                DECIMAL(5, 2),
    tip_amount             DECIMAL(12, 2),
    tolls_amount           DECIMAL(7, 2),
    improvement_surcharge  DECIMAL(3, 2),
    total_amount           DECIMAL(10, 2),
    congestion_surcharge   DECIMAL(4, 2),
    airport_fee            DECIMAL(3, 2),
    pickup_year_month      STRING
)
USING DELTA
PARTITIONED BY (pickup_year_month);


In [0]:
%sql
-- Load the January 2012 yellow-taxi Parquet file into the typed demo table.
-- Every source column is cast explicitly to the type declared in
-- lab.taxi.yellow_taxi_data_type_problem and aliased back to its own name so
-- the values land in the matching target column.
COPY INTO lab.taxi.yellow_taxi_data_type_problem
FROM (
    SELECT
        CAST(VendorID AS TINYINT) AS VendorID,
        CAST(tpep_pickup_datetime AS TIMESTAMP_NTZ) AS tpep_pickup_datetime,
        CAST(tpep_dropoff_datetime AS TIMESTAMP_NTZ) AS tpep_dropoff_datetime,
        CAST(passenger_count AS TINYINT) AS passenger_count,
        CAST(trip_distance AS DOUBLE) AS trip_distance,
        CAST(RatecodeID AS TINYINT) AS RatecodeID,
        CAST(store_and_fwd_flag AS STRING) AS store_and_fwd_flag,
        CAST(PULocationID AS SMALLINT) AS PULocationID,
        CAST(DOLocationID AS SMALLINT) AS DOLocationID,
        CAST(payment_type AS TINYINT) AS payment_type,
        CAST(fare_amount AS DECIMAL(10,2)) AS fare_amount,
        CAST(extra AS DECIMAL(7,2)) AS extra,
        CAST(mta_tax AS DECIMAL(5,2)) AS mta_tax,
        CAST(tip_amount AS DECIMAL(12,2)) AS tip_amount,
        CAST(tolls_amount AS DECIMAL(7,2)) AS tolls_amount,
        CAST(improvement_surcharge AS DECIMAL(3,2)) AS improvement_surcharge,
        CAST(total_amount AS DECIMAL(10,2)) AS total_amount,
        CAST(congestion_surcharge AS DECIMAL(4,2)) AS congestion_surcharge,
        -- Must be a CAST ... AS airport_fee like the other columns: writing
        -- "airport_fee DECIMAL" in a SELECT list aliases the column to the
        -- name DECIMAL instead of casting it.
        CAST(airport_fee AS DECIMAL(3,2)) AS airport_fee,
        -- Partition value in 'yyyy-MM' form, derived from the pickup timestamp.
        date_format(tpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month
    FROM 'dbfs:/Volumes/raw/taxi/yellow_taxi/2012/yellow_tripdata_2012-01.parquet'
)
FILEFORMAT = PARQUET
FORMAT_OPTIONS ('inferSchema' = 'true')
COPY_OPTIONS ('mergeSchema' = 'true');

In [0]:
# Preview the first five rows of the January 2009 yellow-taxi Parquet file,
# read directly from the volume path (no table involved).
display(spark.read.format('parquet').load('/Volumes/raw/taxi/yellow_taxi/2009/yellow_tripdata_2009-01.parquet').limit(5))

In [0]:
%sql
-- Load a yellow-taxi Parquet file into the typed demo table, casting every
-- source column to the type declared in lab.taxi.yellow_taxi_data_type_problem
-- and aliasing it back to its own name.
-- NOTE(review): the path below is the same January 2012 file used by the
-- earlier COPY INTO cell for this table, so this looks like a re-run of that
-- load; confirm whether a different file was intended here.
COPY INTO lab.taxi.yellow_taxi_data_type_problem
FROM (
    SELECT
        CAST(VendorID AS TINYINT) AS VendorID,
        CAST(tpep_pickup_datetime AS TIMESTAMP_NTZ) AS tpep_pickup_datetime,
        CAST(tpep_dropoff_datetime AS TIMESTAMP_NTZ) AS tpep_dropoff_datetime,
        CAST(passenger_count AS TINYINT) AS passenger_count,
        CAST(trip_distance AS DOUBLE) AS trip_distance,
        CAST(RatecodeID AS TINYINT) AS RatecodeID,
        CAST(store_and_fwd_flag AS STRING) AS store_and_fwd_flag,
        CAST(PULocationID AS SMALLINT) AS PULocationID,
        CAST(DOLocationID AS SMALLINT) AS DOLocationID,
        CAST(payment_type AS TINYINT) AS payment_type,
        CAST(fare_amount AS DECIMAL(10,2)) AS fare_amount,
        CAST(extra AS DECIMAL(7,2)) AS extra,
        CAST(mta_tax AS DECIMAL(5,2)) AS mta_tax,
        CAST(tip_amount AS DECIMAL(12,2)) AS tip_amount,
        CAST(tolls_amount AS DECIMAL(7,2)) AS tolls_amount,
        CAST(improvement_surcharge AS DECIMAL(3,2)) AS improvement_surcharge,
        CAST(total_amount AS DECIMAL(10,2)) AS total_amount,
        CAST(congestion_surcharge AS DECIMAL(4,2)) AS congestion_surcharge,
        -- Must be a CAST ... AS airport_fee like the other columns: writing
        -- "airport_fee DECIMAL" in a SELECT list aliases the column to the
        -- name DECIMAL instead of casting it.
        CAST(airport_fee AS DECIMAL(3,2)) AS airport_fee,
        -- Partition value in 'yyyy-MM' form, derived from the pickup timestamp.
        date_format(tpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month
    FROM 'dbfs:/Volumes/raw/taxi/yellow_taxi/2012/yellow_tripdata_2012-01.parquet'
)
FILEFORMAT = PARQUET
FORMAT_OPTIONS ('inferSchema' = 'true')
COPY_OPTIONS ('mergeSchema' = 'true');

## Clone into Delta

In [0]:

%sql
-- The STORAGE CREDENTIAL (taxi_credential) must be created in the UI first.
-- Registers the URL below as an external location that uses that credential.
CREATE EXTERNAL LOCATION IF NOT EXISTS lab_taxi_location URL
'abfss://lab@hfotaxinyc.dfs.core.windows.net/' WITH (STORAGE CREDENTIAL taxi_credential);


-- NOTE(review): lab.taxi is also created at the top of this notebook with a
-- different MANAGED LOCATION; because of IF NOT EXISTS this statement does
-- nothing when the schema already exists, so this location is only applied on
-- a fresh catalog.
CREATE SCHEMA IF NOT EXISTS lab.taxi MANAGED LOCATION
'abfss://lab@hfotaxinyc.dfs.core.windows.net/taxi/';

-- External volume pointing at the raw files.
-- NOTE(review): this LOCATION is the same URL as the schema MANAGED LOCATION
-- just above; Unity Catalog is documented to reject external volume paths that
-- overlap managed storage. Confirm, and use a separate sub-path if needed.
CREATE EXTERNAL VOLUME IF NOT EXISTS lab.taxi.yellow_taxi_raw LOCATION
'abfss://lab@hfotaxinyc.dfs.core.windows.net/taxi/';

In [0]:
%sql
-- Convert the Parquet files under the yellow_taxi_raw volume path to Delta.
CONVERT TO DELTA parquet.`/Volumes/lab/taxi/yellow_taxi_raw` 