In [0]:
# Databricks notebook source
# Restart the notebook's Python process before any other cell runs.
# NOTE(review): no library install precedes this call in the visible code —
# confirm the restart is still required.
dbutils.library.restartPython()

# COMMAND ----------

import tqdm
import sys
import datetime

sys.path.insert(0, "../lib")

from utils import *
from ingestors import IngestorCubo



# COMMAND ----------

# Latest purchase date present in the silver table. MAX() over an empty table
# yields NULL, so `today` may be None. It must otherwise be date-like, because
# `.strftime` and timedelta arithmetic are applied to it below.
today = (spark.sql("SELECT MAX(dtCompra) as max_dtCompra FROM silver.taxi.yellow_taxi").collect()[0]['max_dtCompra'])

# Target location of the table handled by this run.
catalog = "gold"
schema_name = 'taxi'

# Job parameters come from notebook widgets; when any widget cannot be read
# (e.g. an interactive run without widgets) the whole set falls back to the
# hard-coded defaults below.
try:
    table_name = dbutils.widgets.get("table_name")
    start = dbutils.widgets.get("dt_start")  # 'YYYY-MM-DD' string
    stop = dbutils.widgets.get("dt_stop")  # 'YYYY-MM-DD' string
except Exception:
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit still propagate instead of silently selecting the defaults.
    start = '2018-06-01'
    stop = '2018-12-01'
    table_name = 'daily_reports'

# When the requested start equals the latest date in the source, move the
# start back one day. Skipped when the source table is empty (today is None).
if today is not None and start == today.strftime('%Y-%m-%d'):
    start = (today - datetime.timedelta(days=1)).strftime("%Y-%m-%d")



# COMMAND ----------

# Build the ingestor for the configured catalog/schema/table.
ingestor = IngestorCubo(
    spark=spark,
    catalog=catalog,
    schema_name=schema_name,
    table_name=table_name,
)

# Run the backfill with the start/stop values resolved above; how the two
# bounds are interpreted is defined by IngestorCubo.backfill in `ingestors`.
ingestor.backfill(start, stop)

In [0]:
%sql
-- Quick look at five rows of the partitioned silver table.
SELECT *
FROM silver.taxi.yellow_taxi_partitioned
LIMIT 5


In [0]:
%sql
-- Daily trip metrics built from silver.taxi.yellow_taxi_partitioned.
-- `base` projects the columns of interest and derives the pickup date;
-- `aggregated` produces one row per pickup date; the final SELECT returns
-- the rows between 2018-01-01 and 2018-03-01 (BETWEEN includes both bounds).
WITH base AS (
    SELECT
        DATE(pickup_datetime) AS pickup_date,
        trip_duration_minutes,
        total_amount,
        fare_amount,
        tip_amount,
        tolls_amount,
        improvement_surcharge,
        extra,
        trip_distance_km,
        passenger_count,
        vendor_name,
        pickup_location_id,
        dropoff_location_id,
        payment_type_id
    FROM silver.taxi.yellow_taxi_partitioned

),
aggregated AS (
    SELECT
        pickup_date,

        -- Volume & revenue
        COUNT(*) AS num_trips,
        SUM(passenger_count) AS total_passengers,
        SUM(total_amount) AS total_revenue,
        SUM(fare_amount) AS total_fare,
        SUM(tip_amount) AS total_tips,
        SUM(tolls_amount) AS total_tolls,
        SUM(extra) AS total_extra,
        AVG(total_amount) AS avg_total_amount,
        PERCENTILE(total_amount, 0.5) AS median_total_amount,

        -- Locations and vendors
        SUM(CASE WHEN vendor_name = 'Creative Mobile Technologies, LLC' THEN 1 ELSE 0 END) AS num_trips_cmt,
        SUM(CASE WHEN vendor_name = 'Curb Mobility, LLC' THEN 1 ELSE 0 END) AS num_trips_curb,
        SUM(CASE WHEN vendor_name = 'Myle Technologies Inc' THEN 1 ELSE 0 END) AS num_trips_myle,
        SUM(CASE WHEN vendor_name = 'Helix' THEN 1 ELSE 0 END) AS num_trips_helix,
        SUM(CASE WHEN vendor_name = 'Creative Mobile Technologies, LLC' THEN total_amount ELSE 0 END) AS total_revenue_cmt,
        SUM(CASE WHEN vendor_name = 'Curb Mobility, LLC' THEN total_amount ELSE 0 END) AS total_revenue_curb,
        SUM(CASE WHEN vendor_name = 'Myle Technologies Inc' THEN total_amount ELSE 0 END) AS total_revenue_myle,
        SUM(CASE WHEN vendor_name = 'Helix' THEN total_amount ELSE 0 END) AS total_revenue_helix,
        COUNT(DISTINCT pickup_location_id) AS distinct_pickup_locations,
        COUNT(DISTINCT dropoff_location_id) AS distinct_dropoff_locations,

        -- Efficiency
        AVG(trip_distance_km) AS avg_trip_distance_km,
        AVG(trip_duration_minutes) AS avg_trip_duration_min,
        -- km per minute * 60 = km per hour, matching the column name.
        -- NULLIF turns a zero denominator into NULL instead of a division error.
        SUM(trip_distance_km) * 60.0 / NULLIF(SUM(trip_duration_minutes), 0) AS avg_speed_kmph,
        -- Revenue divided by total duration: named per minute so it no longer
        -- collides with the per-km metric on the next line.
        SUM(total_amount) / NULLIF(SUM(trip_duration_minutes), 0) AS revenue_per_minute,
        SUM(total_amount) / NULLIF(SUM(trip_distance_km), 0) AS revenue_per_km,

        -- Service quality
        -- Tip share is averaged only over trips with a positive total (others are NULL and ignored by AVG).
        AVG(CASE WHEN total_amount > 0 THEN tip_amount / total_amount ELSE NULL END) AS avg_tip_pct,
        COUNT(CASE WHEN tip_amount > 0 THEN 1 END) * 1.0 / COUNT(*) AS pct_trips_with_tip,

        -- Trips by distance band
        COUNT(CASE WHEN trip_distance_km <= 1 THEN 1 END) AS short_trips,
        COUNT(CASE WHEN trip_distance_km >= 10 THEN 1 END) AS long_trips,

        -- Payments
        COUNT(DISTINCT payment_type_id) AS payment_methods_used,
        COUNT(CASE WHEN payment_type_id = 1 THEN 1 END) AS num_credit_card,
        COUNT(CASE WHEN payment_type_id = 2 THEN 1 END) AS num_cash,
        COUNT(CASE WHEN payment_type_id = 3 THEN 1 END) AS num_no_charge,
        COUNT(CASE WHEN payment_type_id = 4 THEN 1 END) AS num_dispute,
        COUNT(CASE WHEN payment_type_id = 5 THEN 1 END) AS num_unknown,
        COUNT(CASE WHEN payment_type_id = 6 THEN 1 END) AS num_voided,

        -- Data quality and missing data
        COUNT(CASE WHEN passenger_count = 0 THEN 1 END) AS zero_passenger_trips,
        COUNT(CASE WHEN total_amount <= 0 THEN 1 END) AS invalid_total_trips

    FROM base
    GROUP BY pickup_date
)
SELECT *
FROM aggregated
WHERE pickup_date between '2018-01-01' and '2018-03-01'
ORDER BY pickup_date;

In [0]:
%sql
-- Latest and earliest pickup timestamps in the partitioned silver table.
SELECT
    MAX(pickup_datetime),
    MIN(pickup_datetime)
FROM silver.taxi.yellow_taxi_partitioned