# Setup - Create Catalog, Schema, Volume and Tables
This notebook creates the complete NYC Taxi Lakehouse structure, including:
- External location for the taxi storage container
- Dev catalog
- Taxi schema
- Raw volume
- Bronze and Silver tables for yellow and green taxi data

In [0]:
%sql
-- Register the taxi storage container as an external location (run once).
-- The storage credential `taxi_credential` is only referenced here, not
-- created by this statement.
CREATE EXTERNAL LOCATION IF NOT EXISTS taxi_location
    URL 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/'
    WITH (STORAGE CREDENTIAL taxi_credential);

In [0]:
%sql
-- Development catalog; managed tables and volumes under it default to the
-- storage path given in MANAGED LOCATION.
-- NOTE(review): the managed location is the container root, which is also the
-- parent of the external volume/table paths used later in this notebook.
-- Unity Catalog restricts overlapping storage paths -- confirm this layout.
CREATE CATALOG IF NOT EXISTS dev
    COMMENT 'Development catalog for taxi data lakehouse'
    MANAGED LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/';


In [0]:
%sql
-- Create taxi schema
-- NOTE(review): this managed location is the same container root used by the
-- dev catalog and is a parent of the external volume/table paths used later
-- in this notebook. Unity Catalog restricts overlapping storage paths --
-- confirm this layout is intended.
CREATE SCHEMA IF NOT EXISTS dev.taxi
MANAGED LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/'
COMMENT 'Schema for NYC taxi data processing pipeline';

-- Make dev.taxi the session default so unqualified object names in later
-- cells resolve to it. USE SCHEMA only switches the schema inside the
-- current catalog, so the catalog must be selected explicitly first.
USE CATALOG dev;
USE SCHEMA taxi;

In [0]:
%sql
-- External volume exposing the raw/ folder of the taxi container, where the
-- raw taxi data files are stored.
CREATE EXTERNAL VOLUME IF NOT EXISTS dev.taxi.raw
    LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/raw'
    COMMENT 'Volume for raw taxi data files from Azure storage';

## Bronze Layer Tables
Raw data ingestion tables with metadata for data lineage

In [0]:
%sql
-- Bronze Yellow Taxi table
-- External Delta table holding yellow taxi trip records plus lineage
-- metadata columns. The name is fully qualified so the table is always
-- created in dev.taxi, regardless of the session's current catalog/schema.
CREATE TABLE IF NOT EXISTS dev.taxi.bronze_yellow_taxi (
    VendorID INT,
    tpep_pickup_datetime TIMESTAMP,
    tpep_dropoff_datetime TIMESTAMP,
    passenger_count DOUBLE,
    trip_distance DOUBLE,
    RatecodeID DOUBLE,
    store_and_fwd_flag STRING,
    PULocationID INT,
    DOLocationID INT,
    payment_type INT,
    fare_amount DOUBLE,
    extra DOUBLE,
    mta_tax DOUBLE,
    tip_amount DOUBLE,
    tolls_amount DOUBLE,
    improvement_surcharge DOUBLE,
    total_amount DOUBLE,
    congestion_surcharge DOUBLE,
    airport_fee DOUBLE,
    -- Metadata columns for data lineage
    _source_file STRING,
    _ingestion_timestamp TIMESTAMP,
    _processing_date DATE
)
USING DELTA
-- One partition per processing date.
PARTITIONED BY (_processing_date)
LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/bronze/yellow_taxi'
COMMENT 'Bronze layer table for raw yellow taxi trip records';

In [0]:
%sql
-- Bronze Green Taxi table
-- External Delta table holding green taxi trip records plus lineage
-- metadata columns. The name is fully qualified so the table is always
-- created in dev.taxi, regardless of the session's current catalog/schema.
CREATE TABLE IF NOT EXISTS dev.taxi.bronze_green_taxi (
    VendorID INT,
    lpep_pickup_datetime TIMESTAMP,
    lpep_dropoff_datetime TIMESTAMP,
    store_and_fwd_flag STRING,
    RatecodeID DOUBLE,
    PULocationID INT,
    DOLocationID INT,
    passenger_count DOUBLE,
    trip_distance DOUBLE,
    fare_amount DOUBLE,
    extra DOUBLE,
    mta_tax DOUBLE,
    tip_amount DOUBLE,
    tolls_amount DOUBLE,
    ehail_fee DOUBLE,
    improvement_surcharge DOUBLE,
    total_amount DOUBLE,
    payment_type INT,
    trip_type INT,
    congestion_surcharge DOUBLE,
    -- Metadata columns for data lineage
    _source_file STRING,
    _ingestion_timestamp TIMESTAMP,
    _processing_date DATE
)
USING DELTA
-- One partition per processing date.
PARTITIONED BY (_processing_date)
LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/bronze/green_taxi'
COMMENT 'Bronze layer table for raw green taxi trip records';

## Silver Layer Tables
Cleaned and validated data with data quality metrics

In [0]:
%sql
-- Silver Yellow Taxi table
-- External Delta table for cleaned yellow taxi trips with derived trip
-- metrics and data quality flags. The name is fully qualified so the table
-- is always created in dev.taxi, regardless of the session's current
-- catalog/schema.
CREATE TABLE IF NOT EXISTS dev.taxi.silver_yellow_taxi (
    trip_id STRING,
    vendor_id INT,
    pickup_datetime TIMESTAMP,
    dropoff_datetime TIMESTAMP,
    passenger_count INT,
    trip_distance DECIMAL(10,2),
    ratecode_id INT,
    store_and_fwd_flag BOOLEAN,
    pickup_location_id INT,
    dropoff_location_id INT,
    payment_type_id INT,
    fare_amount DECIMAL(10,2),
    extra DECIMAL(10,2),
    mta_tax DECIMAL(10,2),
    tip_amount DECIMAL(10,2),
    tolls_amount DECIMAL(10,2),
    improvement_surcharge DECIMAL(10,2),
    total_amount DECIMAL(10,2),
    congestion_surcharge DECIMAL(10,2),
    airport_fee DECIMAL(10,2),
    trip_duration_minutes INT,
    trip_speed_mph DECIMAL(10,2),
    -- Data quality flags
    is_valid_trip BOOLEAN,
    data_quality_score DECIMAL(3,2),
    -- Metadata columns
    processing_timestamp TIMESTAMP,
    processing_date DATE
)
USING DELTA
-- One partition per processing date.
PARTITIONED BY (processing_date)
LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/silver/yellow_taxi'
COMMENT 'Silver layer table for cleaned and validated yellow taxi trip records';

In [0]:
%sql
-- Silver Green Taxi table
-- External Delta table for cleaned green taxi trips with derived trip
-- metrics and data quality flags. The name is fully qualified so the table
-- is always created in dev.taxi, regardless of the session's current
-- catalog/schema.
CREATE TABLE IF NOT EXISTS dev.taxi.silver_green_taxi (
    trip_id STRING,
    vendor_id INT,
    pickup_datetime TIMESTAMP,
    dropoff_datetime TIMESTAMP,
    store_and_fwd_flag BOOLEAN,
    ratecode_id INT,
    pickup_location_id INT,
    dropoff_location_id INT,
    passenger_count INT,
    trip_distance DECIMAL(10,2),
    fare_amount DECIMAL(10,2),
    extra DECIMAL(10,2),
    mta_tax DECIMAL(10,2),
    tip_amount DECIMAL(10,2),
    tolls_amount DECIMAL(10,2),
    ehail_fee DECIMAL(10,2),
    improvement_surcharge DECIMAL(10,2),
    total_amount DECIMAL(10,2),
    payment_type_id INT,
    trip_type INT,
    congestion_surcharge DECIMAL(10,2),
    trip_duration_minutes INT,
    trip_speed_mph DECIMAL(10,2),
    -- Data quality flags
    is_valid_trip BOOLEAN,
    data_quality_score DECIMAL(3,2),
    -- Metadata columns
    processing_timestamp TIMESTAMP,
    processing_date DATE
)
USING DELTA
-- One partition per processing date.
PARTITIONED BY (processing_date)
LOCATION 'abfss://taxi@hfotaxinyc.dfs.core.windows.net/silver/green_taxi'
COMMENT 'Silver layer table for cleaned and validated green taxi trip records';

## Verification
Check that all objects were created successfully

In [0]:
%sql
-- List catalogs; `dev` is expected in the output once the setup cells have run.
SHOW CATALOGS;

In [0]:
%sql
-- List the schemas of the dev catalog; `taxi` is expected in the output.
SHOW SCHEMAS FROM dev;

In [0]:
%sql
-- List the volumes of dev.taxi; `raw` is expected in the output.
SHOW VOLUMES FROM dev.taxi;

In [0]:
%sql
-- List the tables of dev.taxi; the bronze and silver taxi tables are expected.
SHOW TABLES FROM dev.taxi;

In [0]:
%sql
-- Inspect columns and extended metadata of the bronze yellow taxi table
DESCRIBE TABLE EXTENDED dev.taxi.bronze_yellow_taxi;

In [0]:
%sql
-- Inspect columns and extended metadata of the silver yellow taxi table
DESCRIBE TABLE EXTENDED dev.taxi.silver_yellow_taxi;