# Silver Layer - Create Tables and Copy Data
This notebook creates Silver tables with readable column names, converts miles to km, and filters outliers.

In [0]:
%sql
-- Silver Yellow Taxi table definition.
-- Created only when absent, stored as Delta and partitioned by pickup_year_month.
-- Column order matters to any loader that inserts positionally.
CREATE TABLE IF NOT EXISTS silver.taxi.yellow_taxi (
  -- Trip identity and vendor
  trip_id                   STRING         COMMENT 'Unique trip identifier (MD5 hash)',
  vendor_id                 TINYINT        COMMENT 'Taxi company identifier',
  vendor_name               STRING         COMMENT 'Taxi company name',

  -- Timing, occupancy and distance
  pickup_datetime           TIMESTAMP_NTZ  COMMENT 'Trip pickup timestamp',
  dropoff_datetime          TIMESTAMP_NTZ  COMMENT 'Trip dropoff timestamp',
  trip_duration_minutes     DECIMAL(10, 2) COMMENT 'Trip duration in minutes',
  passenger_count           TINYINT        COMMENT 'Number of passengers',
  trip_distance_km          DECIMAL(10, 2) COMMENT 'Trip distance in kilometers',

  -- Coded attributes, each paired with its readable description
  ratecode_id               TINYINT        COMMENT 'Rate code identifier',
  ratecode_description      STRING         COMMENT 'Rate code description',
  store_and_fwd_flag        STRING         COMMENT 'Store and forward flag',
  store_and_fwd_description STRING         COMMENT 'Store and forward description',
  pickup_location_id        SMALLINT       COMMENT 'Pickup location ID',
  dropoff_location_id       SMALLINT       COMMENT 'Dropoff location ID',
  payment_type_id           TINYINT        COMMENT 'Payment method identifier',
  payment_type_description  STRING         COMMENT 'Payment method description',

  -- Monetary amounts
  -- NOTE(review): DECIMAL(3, 2) tops out at 9.99 and DECIMAL(4, 2) at 99.99;
  -- confirm the Bronze values for these fees always fit.
  fare_amount               DECIMAL(10, 2) COMMENT 'Base fare amount',
  extra                     DECIMAL(7, 2)  COMMENT 'Extra charges',
  mta_tax                   DECIMAL(5, 2)  COMMENT 'MTA tax',
  tip_amount                DECIMAL(12, 2) COMMENT 'Tip amount',
  tolls_amount              DECIMAL(7, 2)  COMMENT 'Tolls amount',
  improvement_surcharge     DECIMAL(3, 2)  COMMENT 'Improvement surcharge',
  total_amount              DECIMAL(10, 2) COMMENT 'Total trip amount',
  congestion_surcharge      DECIMAL(4, 2)  COMMENT 'Congestion surcharge',
  airport_fee               DECIMAL(3, 2)  COMMENT 'Airport fee',

  -- Partition key
  pickup_year_month         STRING         COMMENT 'Partition column - YYYY-MM format'
)
USING DELTA
PARTITIONED BY (pickup_year_month)
COMMENT 'Silver Yellow Taxi trip records - cleaned, transformed and readable'
TBLPROPERTIES (
  'delta.autoOptimize.optimizeWrite' = 'true',
  'delta.autoOptimize.autoCompact'   = 'true'
);

In [0]:
%sql
-- Silver Green Taxi table definition.
-- Created only when absent, stored as Delta and partitioned by pickup_year_month.
-- Column order matters to any loader that inserts positionally.
CREATE TABLE IF NOT EXISTS silver.taxi.green_taxi (
  -- Trip identity and vendor
  trip_id                   STRING         COMMENT 'Unique trip identifier (MD5 hash)',
  vendor_id                 TINYINT        COMMENT 'Taxi company identifier',
  vendor_name               STRING         COMMENT 'Taxi company name',

  -- Timing, occupancy and distance
  pickup_datetime           TIMESTAMP_NTZ  COMMENT 'Trip pickup timestamp',
  dropoff_datetime          TIMESTAMP_NTZ  COMMENT 'Trip dropoff timestamp',
  trip_duration_minutes     DECIMAL(10, 2) COMMENT 'Trip duration in minutes',
  passenger_count           TINYINT        COMMENT 'Number of passengers',
  trip_distance_km          DECIMAL(10, 2) COMMENT 'Trip distance in kilometers',

  -- Coded attributes, each paired with its readable description
  ratecode_id               TINYINT        COMMENT 'Rate code identifier',
  ratecode_description      STRING         COMMENT 'Rate code description',
  store_and_fwd_flag        STRING         COMMENT 'Store and forward flag',
  store_and_fwd_description STRING         COMMENT 'Store and forward description',
  pickup_location_id        SMALLINT       COMMENT 'Pickup location ID',
  dropoff_location_id       SMALLINT       COMMENT 'Dropoff location ID',
  payment_type_id           TINYINT        COMMENT 'Payment method identifier',
  payment_type_description  STRING         COMMENT 'Payment method description',

  -- Monetary amounts and trip type
  -- NOTE(review): DECIMAL(3, 2) tops out at 9.99 and DECIMAL(4, 2) at 99.99;
  -- confirm the Bronze values for these fees always fit.
  fare_amount               DECIMAL(10, 2) COMMENT 'Base fare amount',
  extra                     DECIMAL(7, 2)  COMMENT 'Extra charges',
  mta_tax                   DECIMAL(5, 2)  COMMENT 'MTA tax',
  tip_amount                DECIMAL(12, 2) COMMENT 'Tip amount',
  tolls_amount              DECIMAL(7, 2)  COMMENT 'Tolls amount',
  improvement_surcharge     DECIMAL(3, 2)  COMMENT 'Improvement surcharge',
  total_amount              DECIMAL(10, 2) COMMENT 'Total trip amount',
  trip_type                 TINYINT        COMMENT 'Trip type',
  congestion_surcharge      DECIMAL(4, 2)  COMMENT 'Congestion surcharge',
  ehail_fee                 DECIMAL(3, 2)  COMMENT 'Ehail fee',

  -- Partition key
  pickup_year_month         STRING         COMMENT 'Partition column - YYYY-MM format'
)
USING DELTA
PARTITIONED BY (pickup_year_month)
COMMENT 'Silver Green Taxi trip records - cleaned, transformed and readable'
TBLPROPERTIES (
  'delta.autoOptimize.optimizeWrite' = 'true',
  'delta.autoOptimize.autoCompact'   = 'true'
);

In [0]:
%sql
-- Load cleaned Yellow Taxi trips from bronze.taxi.yellow_taxi into
-- silver.taxi.yellow_taxi.
-- Renames the source columns, adds readable descriptions for the coded fields,
-- derives trip duration (minutes) and distance (km), and keeps only rows that
-- pass the quality and outlier predicates in the WHERE clause.
-- The target column list is spelled out so the mapping no longer depends on the
-- physical column order of the Silver table.
-- NOTE(review): this is a plain append with no check against rows already in
-- Silver, so re-running the cell inserts the same Bronze rows again. Confirm
-- whether a full refresh (INSERT OVERWRITE) or a MERGE is wanted instead.
INSERT INTO silver.taxi.yellow_taxi (
  trip_id,
  vendor_id,
  vendor_name,
  pickup_datetime,
  dropoff_datetime,
  trip_duration_minutes,
  passenger_count,
  trip_distance_km,
  ratecode_id,
  ratecode_description,
  store_and_fwd_flag,
  store_and_fwd_description,
  pickup_location_id,
  dropoff_location_id,
  payment_type_id,
  payment_type_description,
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  total_amount,
  congestion_surcharge,
  airport_fee,
  pickup_year_month
)
SELECT
  -- MD5 over vendor, pickup/dropoff timestamps and pickup/dropoff locations.
  -- NOTE(review): two trips sharing all five inputs get the same trip_id;
  -- confirm downstream consumers do not rely on strict uniqueness.
  md5(
    concat_ws(
      '|',
      CAST(VendorID AS STRING),
      CAST(tpep_pickup_datetime AS STRING),
      CAST(tpep_dropoff_datetime AS STRING),
      CAST(PULocationID AS STRING),
      CAST(DOLocationID AS STRING)
    )
  ) AS trip_id,
  VendorID AS vendor_id,
  -- Vendor name translation
  CASE
    VendorID
    WHEN 1 THEN 'Creative Mobile Technologies, LLC'
    WHEN 2 THEN 'Curb Mobility, LLC'
    WHEN 6 THEN 'Myle Technologies Inc'
    WHEN 7 THEN 'Helix'
    ELSE 'Unknown'
  END AS vendor_name,
  tpep_pickup_datetime AS pickup_datetime,
  tpep_dropoff_datetime AS dropoff_datetime,
  -- Trip duration in minutes (difference of epoch seconds / 60)
  ROUND(
    (UNIX_TIMESTAMP(tpep_dropoff_datetime) - UNIX_TIMESTAMP(tpep_pickup_datetime)) / 60, 2
  ) AS trip_duration_minutes,
  passenger_count,
  ROUND(trip_distance * 1.60934, 2) AS trip_distance_km, -- Convert miles to km
  -- Rate code translation
  RatecodeID AS ratecode_id,
  CASE
    RatecodeID
    WHEN 1 THEN 'Standard rate'
    WHEN 2 THEN 'JFK'
    WHEN 3 THEN 'Newark'
    WHEN 4 THEN 'Nassau or Westchester'
    WHEN 5 THEN 'Negotiated fare'
    WHEN 6 THEN 'Group ride'
    WHEN 99 THEN 'Null/unknown'
    ELSE 'Other'
  END AS ratecode_description,
  store_and_fwd_flag,
  -- Store and forward flag description
  CASE
    store_and_fwd_flag
    WHEN 'Y' THEN 'store and forward trip'
    WHEN 'N' THEN 'not a store and forward trip'
    ELSE 'unknown'
  END AS store_and_fwd_description,
  PULocationID AS pickup_location_id,
  DOLocationID AS dropoff_location_id,
  payment_type AS payment_type_id,
  -- Payment type translation
  CASE
    payment_type
    WHEN 0 THEN 'Flex Fare trip'
    WHEN 1 THEN 'Credit card'
    WHEN 2 THEN 'Cash'
    WHEN 3 THEN 'No charge'
    WHEN 4 THEN 'Dispute'
    WHEN 5 THEN 'Unknown'
    WHEN 6 THEN 'Voided trip'
    ELSE 'Other'
  END AS payment_type_description,
  -- Financial values (passed through unchanged)
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  total_amount,
  congestion_surcharge,
  airport_fee,
  pickup_year_month
FROM bronze.taxi.yellow_taxi
WHERE
  -- Basic data quality filters
  tpep_pickup_datetime IS NOT NULL
  AND tpep_dropoff_datetime IS NOT NULL
  AND tpep_pickup_datetime < tpep_dropoff_datetime
  AND trip_distance > 0
  -- Reasonable passenger count; rows with a NULL passenger_count also fail this
  -- predicate and are dropped.
  AND passenger_count BETWEEN 0 AND 8
  AND total_amount > 0
  AND fare_amount >= 0
  AND tip_amount >= 0
  AND tolls_amount >= 0
  -- No trips in the future; this is also the upper bound on the pickup year.
  AND tpep_pickup_datetime <= current_timestamp()
  AND tpep_dropoff_datetime <= current_timestamp()

  -- Outlier filtering
  AND trip_distance <= 200  -- Max 200 miles (320 km)
  AND total_amount <= 1000  -- Max $1000 per trip
  AND fare_amount <= 500    -- Max $500 base fare
  AND tip_amount <= 200     -- Max $200 tip
  AND tolls_amount <= 100   -- Max $100 tolls
  AND (UNIX_TIMESTAMP(tpep_dropoff_datetime) - UNIX_TIMESTAMP(tpep_pickup_datetime)) BETWEEN 60 AND 28800  -- Between 1 min and 8 hours
  -- Earliest plausible year. No hard-coded upper year: a fixed "2025" would
  -- silently discard every trip from 2026 onward, and the current_timestamp()
  -- checks above already exclude future dates.
  AND year(tpep_pickup_datetime) >= 2009
;

In [0]:
%sql
-- Load cleaned Green Taxi trips from bronze.taxi.green_taxi into
-- silver.taxi.green_taxi.
-- Renames the source columns, adds readable descriptions for the coded fields,
-- derives trip duration (minutes) and distance (km), and keeps only rows that
-- pass the quality and outlier predicates in the WHERE clause.
-- The target column list is spelled out so the mapping no longer depends on the
-- physical column order of the Silver table.
-- NOTE(review): this is a plain append with no check against rows already in
-- Silver, so re-running the cell inserts the same Bronze rows again. Confirm
-- whether a full refresh (INSERT OVERWRITE) or a MERGE is wanted instead.
INSERT INTO silver.taxi.green_taxi (
  trip_id,
  vendor_id,
  vendor_name,
  pickup_datetime,
  dropoff_datetime,
  trip_duration_minutes,
  passenger_count,
  trip_distance_km,
  ratecode_id,
  ratecode_description,
  store_and_fwd_flag,
  store_and_fwd_description,
  pickup_location_id,
  dropoff_location_id,
  payment_type_id,
  payment_type_description,
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  total_amount,
  trip_type,
  congestion_surcharge,
  ehail_fee,
  pickup_year_month
)
SELECT
  -- MD5 over vendor, pickup/dropoff timestamps and pickup/dropoff locations.
  -- NOTE(review): two trips sharing all five inputs get the same trip_id;
  -- confirm downstream consumers do not rely on strict uniqueness.
  md5(
    concat_ws(
      '|',
      CAST(VendorID AS STRING),
      CAST(lpep_pickup_datetime AS STRING),
      CAST(lpep_dropoff_datetime AS STRING),
      CAST(PULocationID AS STRING),
      CAST(DOLocationID AS STRING)
    )
  ) AS trip_id,
  VendorID AS vendor_id,
  -- Vendor name translation
  -- NOTE(review): unlike the Yellow load there is no mapping for vendor 7 here;
  -- confirm that code never occurs in Green data.
  CASE
    VendorID
    WHEN 1 THEN 'Creative Mobile Technologies, LLC'
    WHEN 2 THEN 'Curb Mobility, LLC'
    WHEN 6 THEN 'Myle Technologies Inc'
    ELSE 'Unknown'
  END AS vendor_name,
  lpep_pickup_datetime AS pickup_datetime,
  lpep_dropoff_datetime AS dropoff_datetime,
  -- Trip duration in minutes (difference of epoch seconds / 60)
  ROUND(
    (UNIX_TIMESTAMP(lpep_dropoff_datetime) - UNIX_TIMESTAMP(lpep_pickup_datetime)) / 60, 2
  ) AS trip_duration_minutes,
  passenger_count,
  ROUND(trip_distance * 1.60934, 2) AS trip_distance_km, -- Convert miles to km
  -- Rate code translation
  RatecodeID AS ratecode_id,
  CASE
    RatecodeID
    WHEN 1 THEN 'Standard rate'
    WHEN 2 THEN 'JFK'
    WHEN 3 THEN 'Newark'
    WHEN 4 THEN 'Nassau or Westchester'
    WHEN 5 THEN 'Negotiated fare'
    WHEN 6 THEN 'Group ride'
    WHEN 99 THEN 'Null/unknown'
    ELSE 'Other'
  END AS ratecode_description,
  store_and_fwd_flag,
  -- Store and forward flag description
  CASE
    store_and_fwd_flag
    WHEN 'Y' THEN 'store and forward trip'
    WHEN 'N' THEN 'not a store and forward trip'
    ELSE 'unknown'
  END AS store_and_fwd_description,
  PULocationID AS pickup_location_id,
  DOLocationID AS dropoff_location_id,
  payment_type AS payment_type_id,
  -- Payment type translation
  CASE
    payment_type
    WHEN 0 THEN 'Flex Fare trip'
    WHEN 1 THEN 'Credit card'
    WHEN 2 THEN 'Cash'
    WHEN 3 THEN 'No charge'
    WHEN 4 THEN 'Dispute'
    WHEN 5 THEN 'Unknown'
    WHEN 6 THEN 'Voided trip'
    ELSE 'Other'
  END AS payment_type_description,
  -- Financial values and trip type (passed through unchanged)
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  total_amount,
  trip_type,
  congestion_surcharge,
  ehail_fee,
  pickup_year_month
FROM bronze.taxi.green_taxi
WHERE
  -- Basic data quality filters
  lpep_pickup_datetime IS NOT NULL
  AND lpep_dropoff_datetime IS NOT NULL
  AND lpep_pickup_datetime < lpep_dropoff_datetime
  AND trip_distance > 0
  -- Reasonable passenger count; rows with a NULL passenger_count also fail this
  -- predicate and are dropped.
  AND passenger_count BETWEEN 0 AND 8
  AND total_amount > 0
  AND fare_amount >= 0
  AND tip_amount >= 0
  AND tolls_amount >= 0
  -- No trips in the future; this is also the upper bound on the pickup year.
  AND lpep_pickup_datetime <= current_timestamp()
  AND lpep_dropoff_datetime <= current_timestamp()

  -- Outlier filtering
  AND trip_distance <= 200  -- Max 200 miles (320 km)
  AND total_amount <= 1000  -- Max $1000 per trip
  AND fare_amount <= 500    -- Max $500 base fare
  AND tip_amount <= 200     -- Max $200 tip
  AND tolls_amount <= 100   -- Max $100 tolls
  AND (UNIX_TIMESTAMP(lpep_dropoff_datetime) - UNIX_TIMESTAMP(lpep_pickup_datetime)) BETWEEN 60 AND 28800  -- Between 1 min and 8 hours
  -- Earliest plausible year. No hard-coded upper year: a fixed "2025" would
  -- silently discard every trip from 2026 onward, and the current_timestamp()
  -- checks above already exclude future dates.
  AND year(lpep_pickup_datetime) >= 2009
;

In [0]:
# Verify Silver data with new schema
%sql
-- Sanity check for the Yellow load: row count plus average distance (km) and
-- duration (minutes) for each pickup_year_month partition, oldest first.
SELECT
  'yellow_taxi'                  AS table_name,
  yt.pickup_year_month,
  COUNT(*)                       AS record_count,
  AVG(yt.trip_distance_km)       AS avg_trip_distance_km,
  AVG(yt.trip_duration_minutes)  AS avg_duration_min
FROM silver.taxi.yellow_taxi AS yt
GROUP BY yt.pickup_year_month
ORDER BY yt.pickup_year_month ASC;

In [0]:
%sql
-- Sanity check for the Green load: row count plus average distance (km) and
-- duration (minutes) for each pickup_year_month partition, oldest first.
SELECT
  'green_taxi'                   AS table_name,
  gt.pickup_year_month,
  COUNT(*)                       AS record_count,
  AVG(gt.trip_distance_km)       AS avg_trip_distance_km,
  AVG(gt.trip_duration_minutes)  AS avg_duration_min
FROM silver.taxi.green_taxi AS gt
GROUP BY gt.pickup_year_month
ORDER BY gt.pickup_year_month ASC;

In [0]:
%sql
-- Eyeball a handful of Silver Yellow rows to confirm the derived and decoded
-- columns look sensible. There is no ORDER BY, so which five rows come back is
-- not guaranteed.
SELECT
  yt.trip_id,
  yt.vendor_name,
  yt.pickup_datetime,
  yt.trip_duration_minutes,
  yt.trip_distance_km,
  yt.ratecode_description,
  yt.payment_type_description,
  yt.total_amount
FROM silver.taxi.yellow_taxi AS yt
LIMIT 5;

In [0]:
%sql
-- Run OPTIMIZE on the Silver Yellow Taxi table after the load.
-- NOTE(review): the table is created with delta.autoOptimize.autoCompact = 'true';
-- confirm a manual OPTIMIZE is still needed on every run.
OPTIMIZE silver.taxi.yellow_taxi;

In [0]:
%sql
-- Run OPTIMIZE on the Silver Green Taxi table after the load.
-- NOTE(review): the table is created with delta.autoOptimize.autoCompact = 'true';
-- confirm a manual OPTIMIZE is still needed on every run.
OPTIMIZE silver.taxi.green_taxi;