# Bronze Layer - Create Tables and Ingest Data
This notebook creates the Yellow and Green Taxi Bronze tables with explicit schemas and ingests the 2020-2022 trip data using COPY INTO.

In [None]:
%sql
-- Create Yellow Taxi Bronze table with explicit schema.
-- The %sql magic has to be the first line of the cell, and notes inside a SQL
-- cell use `--` (a leading `#` line is not a SQL comment).
-- Delta table partitioned by pickup_year_month; IF NOT EXISTS makes re-running the cell a no-op.
CREATE TABLE IF NOT EXISTS bronze.taxi.yellow_taxi (
  VendorID TINYINT COMMENT 'Taxi company identifier',
  tpep_pickup_datetime TIMESTAMP_NTZ COMMENT 'Pickup timestamp',
  tpep_dropoff_datetime TIMESTAMP_NTZ COMMENT 'Dropoff timestamp',
  passenger_count TINYINT COMMENT 'Number of passengers',
  trip_distance DOUBLE COMMENT 'Trip distance in miles',
  RatecodeID TINYINT COMMENT 'Rate code identifier',
  store_and_fwd_flag STRING COMMENT 'Store and forward flag',
  PULocationID SMALLINT COMMENT 'Pickup location ID',
  DOLocationID SMALLINT COMMENT 'Dropoff location ID',
  payment_type TINYINT COMMENT 'Payment method identifier',
  fare_amount DECIMAL(10, 2) COMMENT 'Base fare amount',
  extra DECIMAL(7, 2) COMMENT 'Extra charges',
  mta_tax DECIMAL(5, 2) COMMENT 'MTA tax',
  tip_amount DECIMAL(12, 2) COMMENT 'Tip amount',
  tolls_amount DECIMAL(7, 2) COMMENT 'Tolls amount',
  improvement_surcharge DECIMAL(3, 2) COMMENT 'Improvement surcharge',
  total_amount DECIMAL(10, 2) COMMENT 'Total trip amount',
  congestion_surcharge DECIMAL(4, 2) COMMENT 'Congestion surcharge',
  airport_fee DECIMAL(3, 2) COMMENT 'Airport fee',
  pickup_year_month STRING COMMENT 'Partition column - YYYY-MM format'
)
USING DELTA
PARTITIONED BY (pickup_year_month)
COMMENT 'Bronze Yellow Taxi trip records'
TBLPROPERTIES (
  'delta.autoOptimize.optimizeWrite' = 'true',
  'delta.autoOptimize.autoCompact' = 'true'
);

In [None]:
%sql
-- Create Green Taxi Bronze table with explicit schema.
-- The %sql magic has to be the first line of the cell, and notes inside a SQL
-- cell use `--` (a leading `#` line is not a SQL comment).
-- Delta table partitioned by pickup_year_month; IF NOT EXISTS makes re-running the cell a no-op.
CREATE TABLE IF NOT EXISTS bronze.taxi.green_taxi (
  VendorID TINYINT COMMENT 'Taxi company identifier',
  lpep_pickup_datetime TIMESTAMP_NTZ COMMENT 'Pickup timestamp',
  lpep_dropoff_datetime TIMESTAMP_NTZ COMMENT 'Dropoff timestamp',
  passenger_count TINYINT COMMENT 'Number of passengers',
  trip_distance DOUBLE COMMENT 'Trip distance in miles',
  RatecodeID TINYINT COMMENT 'Rate code identifier',
  store_and_fwd_flag STRING COMMENT 'Store and forward flag',
  PULocationID SMALLINT COMMENT 'Pickup location ID',
  DOLocationID SMALLINT COMMENT 'Dropoff location ID',
  payment_type TINYINT COMMENT 'Payment method identifier',
  fare_amount DECIMAL(10, 2) COMMENT 'Base fare amount',
  extra DECIMAL(7, 2) COMMENT 'Extra charges',
  mta_tax DECIMAL(5, 2) COMMENT 'MTA tax',
  tip_amount DECIMAL(12, 2) COMMENT 'Tip amount',
  tolls_amount DECIMAL(7, 2) COMMENT 'Tolls amount',
  improvement_surcharge DECIMAL(3, 2) COMMENT 'Improvement surcharge',
  total_amount DECIMAL(10, 2) COMMENT 'Total trip amount',
  trip_type TINYINT COMMENT 'Trip type',
  congestion_surcharge DECIMAL(4, 2) COMMENT 'Congestion surcharge',
  ehail_fee DECIMAL(3, 2) COMMENT 'Ehail fee',
  pickup_year_month STRING COMMENT 'Partition column - YYYY-MM format'
)
USING DELTA
PARTITIONED BY (pickup_year_month)
COMMENT 'Bronze Green Taxi trip records'
TBLPROPERTIES (
  'delta.autoOptimize.optimizeWrite' = 'true',
  'delta.autoOptimize.autoCompact' = 'true'
);

In [None]:
# Function to ingest Yellow Taxi data for a specific year
def ingest_yellow_taxi_year(year):
    """Load the twelve monthly Yellow Taxi parquet files of ``year`` into Bronze.

    For each month 1-12 a ``COPY INTO bronze.taxi.yellow_taxi`` statement is
    built and run through ``spark.sql``. The statement reads
    ``/Volumes/raw/taxi/yellow_taxi/{year}/yellow_tripdata_{year}-{MM}.parquet``,
    casts every column to the Bronze table type, derives ``pickup_year_month``
    from ``tpep_pickup_datetime`` and keeps only rows whose pickup timestamp
    lies in ``[{year}-01-01, {year + 1}-01-01)``.

    Ingestion is best-effort: an exception for one month is printed and the
    remaining months are still attempted.

    Args:
        year: Calendar year to ingest (an int; ``year + 1`` is computed).

    Returns:
        list[str]: ``YYYY-MM`` labels of the months whose statement raised,
        in month order. Empty when every month succeeded.
    """
    failed_months = []
    for month in range(1, 13):
        period = f"{year}-{month:02d}"
        try:
            # NOTE(review): the documented COPY INTO grammar is
            # `( SELECT expression_list FROM source_clause )` without a WHERE
            # clause - confirm the filter below is accepted on the target runtime.
            query = f"""
            COPY INTO bronze.taxi.yellow_taxi
            FROM (
                SELECT 
                  CAST(VendorID AS TINYINT) AS VendorID,
                  CAST(tpep_pickup_datetime AS TIMESTAMP_NTZ) AS tpep_pickup_datetime,
                  CAST(tpep_dropoff_datetime AS TIMESTAMP_NTZ) AS tpep_dropoff_datetime,
                  CAST(passenger_count AS TINYINT) AS passenger_count,
                  CAST(trip_distance AS DOUBLE) AS trip_distance,
                  CAST(RatecodeID AS TINYINT) AS RatecodeID,
                  CAST(store_and_fwd_flag AS STRING) AS store_and_fwd_flag,
                  CAST(PULocationID AS SMALLINT) AS PULocationID,
                  CAST(DOLocationID AS SMALLINT) AS DOLocationID,
                  CAST(payment_type AS TINYINT) AS payment_type,
                  CAST(fare_amount AS DECIMAL(10,2)) AS fare_amount,
                  CAST(extra AS DECIMAL(7,2)) AS extra,
                  CAST(mta_tax AS DECIMAL(5,2)) AS mta_tax,
                  CAST(tip_amount AS DECIMAL(12,2)) AS tip_amount,
                  CAST(tolls_amount AS DECIMAL(7,2)) AS tolls_amount,
                  CAST(improvement_surcharge AS DECIMAL(3,2)) AS improvement_surcharge,
                  CAST(total_amount AS DECIMAL(10,2)) AS total_amount,
                  CAST(congestion_surcharge AS DECIMAL(4,2)) AS congestion_surcharge,
                  CAST(airport_fee AS DECIMAL(3,2)) AS airport_fee,
                  date_format(tpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month 
                FROM '/Volumes/raw/taxi/yellow_taxi/{year}/yellow_tripdata_{year}-{month:02d}.parquet'
                WHERE tpep_pickup_datetime >= '{year}-01-01' 
                  AND tpep_pickup_datetime < '{year + 1}-01-01'
            )
            FILEFORMAT = PARQUET
            FORMAT_OPTIONS('inferSchema' = 'false')
            COPY_OPTIONS('mergeSchema' = 'true')
            """
            spark.sql(query)
            print(f"Ingested Yellow Taxi data for {period}")
        except Exception as e:
            # Keep going with the next month, but remember the failure so the
            # caller can tell a clean run from a partial one.
            failed_months.append(period)
            print(f"Failed to ingest Yellow Taxi data for {period}: {str(e)}")
    return failed_months

# Ingest data for years 2020-2022
for year in [2020, 2021, 2022]:
    print(f"Starting ingestion for Yellow Taxi {year}")
    failed_months = ingest_yellow_taxi_year(year)
    if failed_months:
        # Surface partial loads instead of reporting an unconditional success.
        print(
            f"Completed ingestion for Yellow Taxi {year} "
            f"with {len(failed_months)} failed month(s): {', '.join(failed_months)}"
        )
    else:
        print(f"Completed ingestion for Yellow Taxi {year}")

In [None]:
# Function to ingest Green Taxi data for a specific year
def ingest_green_taxi_year(year):
    """Load the twelve monthly Green Taxi parquet files of ``year`` into Bronze.

    For each month 1-12 a ``COPY INTO bronze.taxi.green_taxi`` statement is
    built and run through ``spark.sql``. The statement reads
    ``/Volumes/raw/taxi/green_taxi/{year}/green_tripdata_{year}-{MM}.parquet``,
    casts every column to the Bronze table type, derives ``pickup_year_month``
    from ``lpep_pickup_datetime`` and keeps only rows whose pickup timestamp
    lies in ``[{year}-01-01, {year + 1}-01-01)``.

    Ingestion is best-effort: an exception for one month is printed and the
    remaining months are still attempted.

    Args:
        year: Calendar year to ingest (an int; ``year + 1`` is computed).

    Returns:
        list[str]: ``YYYY-MM`` labels of the months whose statement raised,
        in month order. Empty when every month succeeded.
    """
    failed_months = []
    for month in range(1, 13):
        period = f"{year}-{month:02d}"
        try:
            # NOTE(review): the documented COPY INTO grammar is
            # `( SELECT expression_list FROM source_clause )` without a WHERE
            # clause - confirm the filter below is accepted on the target runtime.
            query = f"""
            COPY INTO bronze.taxi.green_taxi
            FROM (
                SELECT 
                  CAST(VendorID AS TINYINT) AS VendorID,
                  CAST(lpep_pickup_datetime AS TIMESTAMP_NTZ) AS lpep_pickup_datetime,
                  CAST(lpep_dropoff_datetime AS TIMESTAMP_NTZ) AS lpep_dropoff_datetime,
                  CAST(passenger_count AS TINYINT) AS passenger_count,
                  CAST(trip_distance AS DOUBLE) AS trip_distance,
                  CAST(RatecodeID AS TINYINT) AS RatecodeID,
                  CAST(store_and_fwd_flag AS STRING) AS store_and_fwd_flag,
                  CAST(PULocationID AS SMALLINT) AS PULocationID,
                  CAST(DOLocationID AS SMALLINT) AS DOLocationID,
                  CAST(payment_type AS TINYINT) AS payment_type,
                  CAST(fare_amount AS DECIMAL(10,2)) AS fare_amount,
                  CAST(extra AS DECIMAL(7,2)) AS extra,
                  CAST(mta_tax AS DECIMAL(5,2)) AS mta_tax,
                  CAST(tip_amount AS DECIMAL(12,2)) AS tip_amount,
                  CAST(tolls_amount AS DECIMAL(7,2)) AS tolls_amount,
                  CAST(improvement_surcharge AS DECIMAL(3,2)) AS improvement_surcharge,
                  CAST(total_amount AS DECIMAL(10,2)) AS total_amount,
                  CAST(trip_type AS TINYINT) AS trip_type,
                  CAST(congestion_surcharge AS DECIMAL(4,2)) AS congestion_surcharge,
                  CAST(ehail_fee AS DECIMAL(3,2)) AS ehail_fee,
                  date_format(lpep_pickup_datetime, 'yyyy-MM') AS pickup_year_month 
                FROM '/Volumes/raw/taxi/green_taxi/{year}/green_tripdata_{year}-{month:02d}.parquet'
                WHERE lpep_pickup_datetime >= '{year}-01-01' 
                  AND lpep_pickup_datetime < '{year + 1}-01-01'
            )
            FILEFORMAT = PARQUET
            FORMAT_OPTIONS('inferSchema' = 'false')
            COPY_OPTIONS('mergeSchema' = 'true')
            """
            spark.sql(query)
            print(f"Ingested Green Taxi data for {period}")
        except Exception as e:
            # Keep going with the next month, but remember the failure so the
            # caller can tell a clean run from a partial one.
            failed_months.append(period)
            print(f"Failed to ingest Green Taxi data for {period}: {str(e)}")
    return failed_months

# Ingest data for years 2020-2022
for year in [2020, 2021, 2022]:
    print(f"Starting ingestion for Green Taxi {year}")
    failed_months = ingest_green_taxi_year(year)
    if failed_months:
        # Surface partial loads instead of reporting an unconditional success.
        print(
            f"Completed ingestion for Green Taxi {year} "
            f"with {len(failed_months)} failed month(s): {', '.join(failed_months)}"
        )
    else:
        print(f"Completed ingestion for Green Taxi {year}")

In [None]:
%sql
-- Verify data ingestion: row count per pickup_year_month partition of the
-- Yellow Taxi Bronze table. The %sql magic has to be the first line of the
-- cell, so this note is a SQL `--` comment placed after it.
SELECT 
  'yellow_taxi' as table_name,
  pickup_year_month,
  COUNT(*) as record_count
FROM bronze.taxi.yellow_taxi 
GROUP BY pickup_year_month
ORDER BY pickup_year_month;

In [None]:
%sql
-- Green Taxi Bronze check: one row per pickup_year_month with its record
-- count, labelled with the table name and sorted by pickup_year_month.
WITH monthly_counts AS (
  SELECT
    pickup_year_month,
    COUNT(*) AS record_count
  FROM bronze.taxi.green_taxi
  GROUP BY pickup_year_month
)
SELECT
  'green_taxi' AS table_name,
  pickup_year_month,
  record_count
FROM monthly_counts
ORDER BY pickup_year_month;

In [None]:
%sql
-- Record count per pickup_year_month for the Green Taxi Bronze table,
-- ascending by pickup_year_month.
-- NOTE(review): this cell runs the same Green Taxi check as the cell before
-- it - confirm whether a different check was intended here or drop the cell.
SELECT
  'green_taxi' AS table_name,
  pickup_year_month,
  COUNT(*) AS record_count
FROM bronze.taxi.green_taxi
GROUP BY pickup_year_month
ORDER BY pickup_year_month ASC;