In [0]:
%pip install tqdm

In [0]:
import sys

sys.path.insert(0, "../lib")

from utils import table_exists, extract_from, format_query_cdf, import_query
from ingestors import IngestorCDF

    
# Catalog and schema of the target table; combined with table_name below as
# "{catalog}.{schema_name}.{table_name}" when checking for and writing the table.
catalog = "silver"
schema_name = "taxi"

In [0]:
# Table handled by this notebook.
table_name = "green_taxi"

# Key-column settings for the IngestorCDF call further down (that call is
# commented out at the time of writing, so these are currently unused).
# id_field was updated to trip_id, the identifier in the new Silver schema.
id_field, id_field_from = "trip_id", ""

In [0]:
%sql
-- Preview: up to five yellow-taxi trips from the Silver table whose pickup falls
-- in the 2020-01-01 .. 2020-01-02 window (both bounds inclusive).
-- NOTE(review): if pickup_datetime is a timestamp, the upper bound presumably
-- matches only midnight of 2020-01-02 -- confirm that is the intended window.
-- No ORDER BY is given, so which five rows come back is not fixed.
SELECT
    trip_id
  , vendor_name
  , pickup_datetime
  , trip_duration_minutes
  , trip_distance_km
  , ratecode_description
  , payment_type_description
  , total_amount
FROM silver.taxi.yellow_taxi
WHERE pickup_datetime >= '2020-01-01'
  AND pickup_datetime <= '2020-01-02'
LIMIT 5

In [0]:
# Flag read only by the commented-out checkpoint cleanup at the bottom of this cell.
remove_checkpoint = False
from pyspark.sql.functions import to_date

# First run: build the table from the import query (full load).
# Later runs: the table already exists, so only a message is printed here.
if table_exists(spark, catalog, schema_name, table_name):
    print(f'Table {table_name} already exists. Starting CDF load...')
else:
    print(f'Table {catalog}.{schema_name}.{table_name} does not exists. Starting full load...')

    # import_query() is a project helper; it returns the SQL text run below.
    query = import_query(table_name)
    df = spark.sql(query)

    # Write as a Delta table partitioned by pickup_year_month, replacing any
    # existing data and schema.
    writer = (
        df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("pickup_year_month")
    )
    writer.saveAsTable(f"{catalog}.{schema_name}.{table_name}")

# NOTE(review): the CDF ingest below is commented out, so the "already exists"
# branch above does not actually start a CDF load -- confirm whether it should
# be re-enabled or the message adjusted.
#
# ingest = IngestorCDF(spark=spark,
#                      catalog=catalog,
#                      schema_name=schema_name,
#                      table_name=table_name,
#                      id_field=id_field,
#                      id_field_from=id_field_from)
#
# if remove_checkpoint:
#     dbutils.fs.rm(ingest.checkpoint_location, True)
#
# stream = ingest.execute()
# print('Done')


In [0]:
%sql
-- Example of the transformation now done in the Silver layer (yellow taxi).
-- The %sql magic must be the first line of the cell, so this note is a SQL
-- comment placed after it rather than a Python comment placed before it.
-- Returns up to five trips picked up in the 2020-01-01 .. 2020-01-02 window.
SELECT
  trip_id,
  vendor_name,
  pickup_datetime,
  dropoff_datetime,
  trip_duration_minutes,
  passenger_count,
  trip_distance_km, -- Already converted from miles
  ratecode_description,
  store_and_fwd_description,
  pickup_location_id,
  dropoff_location_id,
  payment_type_description,
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  congestion_surcharge,
  airport_fee,
  total_amount
FROM silver.taxi.yellow_taxi
WHERE pickup_datetime BETWEEN '2020-01-01' AND '2020-01-02'
LIMIT 5;

In [0]:
%sql
-- Green taxi example with the new Silver schema.
-- The %sql magic must be the first line of the cell, so this note is a SQL
-- comment placed after it rather than a Python comment placed before it.
-- Returns up to five trips picked up in the 2020-01-01 .. 2020-01-02 window.
SELECT
  trip_id,
  vendor_name,
  pickup_datetime,
  dropoff_datetime,
  trip_duration_minutes,
  passenger_count,
  trip_distance_km, -- Already converted from miles
  ratecode_description,
  store_and_fwd_description,
  pickup_location_id,
  dropoff_location_id,
  payment_type_description,
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  congestion_surcharge,
  ehail_fee,
  total_amount
FROM silver.taxi.green_taxi
WHERE pickup_datetime BETWEEN '2020-01-01' AND '2020-01-02'
LIMIT 5;