# New York Tadi tip Prediction: Data Manipulation for Regression

## 1. Data manipulation for Yellow and Green Taxi

In [2]:
import duckdb 

In [3]:
query_yellow_green = """
WITH CTE_yellow_2009 AS (
    SELECT 
        CAST(Trip_Pickup_DateTime AS TIMESTAMP) AS pick_up_time,
        CAST(Trip_Dropoff_DateTime AS TIMESTAMP) AS drop_off_time,
        CAST(Passenger_Count AS INTEGER) AS passenger_count,
        CAST(Trip_Distance AS INTEGER) AS trip_distance,
        Payment_Type AS payment_type,
        CAST(Total_Amt AS FLOAT)   AS total_amount,
        Tip_Amt AS tip_amount,
        CAST(Tip_Amt AS FLOAT)   AS tip_amount,
        CAST( CASE Payment_Type
            WHEN  'Credit' THEN 1
            WHEN  'CREDIT' THEN 1
            ELSE 2
        END AS INTEGER) AS payment_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/2009/yellow_taxi_2009/yellow_tripdata_*.parquet'
    WHERE Trip_Pickup_DateTime IS NOT NULL
        AND Trip_Dropoff_DateTime IS NOT NULL
        AND Passenger_Count >= 0
        AND Trip_Distance >= 0 
        AND Trip_Distance <= 50
        AND Payment_Type IS NOT NULL
        AND Total_Amt >= 0
        AND Tip_Amt >= 0
        AND Trip_Pickup_DateTime >= '2009-01-01' 
        AND Trip_Pickup_DateTime < '2010-01-01'
), CTE_duration_yellow_2009 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2009
    WHERE payment_category = 1
), CTE_yellow_2010 AS (
    SELECT 
        CAST(pickup_datetime AS TIMESTAMP) AS pick_up_time,
        CAST(dropoff_datetime AS TIMESTAMP) AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        payment_type,
        tip_amount,
        CAST( CASE payment_type
            WHEN  'Cre' THEN 1
            WHEN  'CRE' THEN 1
            ELSE 2
        END AS INTEGER) AS payment_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/2010/yellow_taxi_2010/yellow_tripdata_*.parquet'
    WHERE pickup_datetime IS NOT NULL
        AND dropoff_datetime  IS NOT NULL
        AND passenger_count >= 0
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0 
        AND pickup_datetime >= '2010-01-01' 
        AND pickup_datetime < '2011-01-01'
), CTE_duration_yellow_2010 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2010
    WHERE payment_category = 1
), CTE_yellow_2011_2023 AS (
    SELECT 
        tpep_pickup_datetime AS pick_up_time,
        tpep_dropoff_datetime AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        CAST(payment_type AS INTEGER) AS payment_category,
        tip_amount
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/yellow_taxi/yellow_tripdata_*.parquet'
    WHERE tpep_pickup_datetime IS NOT NULL
        AND tpep_dropoff_datetime  IS NOT NULL
        AND passenger_count IS NOT NULL
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0
        AND tpep_pickup_datetime >= '2011-01-01' 
        AND tpep_pickup_datetime < '2023-10-01'
), CTE_duration_yellow_2011_2023 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2011_2023
    WHERE payment_category = 1
), CTE_green_2011_2023 AS (
    SELECT 
        lpep_pickup_datetime AS pick_up_time,
        lpep_dropoff_datetime AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        CAST(payment_type AS INTEGER) AS payment_category,
        tip_amount
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/green_taxi/green_tripdata_*.parquet'
    WHERE lpep_pickup_datetime IS NOT NULL
        AND lpep_dropoff_datetime  IS NOT NULL
        AND passenger_count IS NOT NULL
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0
        AND lpep_pickup_datetime >= '2009-01-01' 
        AND lpep_pickup_datetime < '2023-10-01'
), CTE_duration_green_2011_2023 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_green_2011_2023
    WHERE payment_category = 1
), CTE_union_all AS (
    SELECT * FROM CTE_duration_yellow_2009
    UNION ALL
    SELECT * FROM CTE_duration_yellow_2010
    UNION ALL
    SELECT * FROM CTE_duration_yellow_2011_2023
    UNION ALL
    SELECT * FROM CTE_duration_green_2011_2023
)

SELECT 
    passenger_count,
    trip_distance,
    total_amount,
    CAST(duration_seconds AS FLOAT)   AS duration_seconds,
    tip_amount
FROM CTE_union_all
WHERE duration_days = 0
LIMIT 5
"""

con = duckdb.connect()
df_yellow_green = con.execute(query_yellow_green).fetchdf()
df_yellow_green.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,duration_seconds,tip_amount
0,3,5.0,14.6,420.0,2.0
1,5,10.0,28.440001,840.0,4.74
2,1,5.0,18.450001,1262.0,3.05
3,1,0.0,6.7,585.0,1.0
4,1,2.0,10.0,679.0,1.3


## 2. Data Manipulation for High Value for Hire Vehicles

In [4]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        EPOCH(pick_up_time - request_time) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare  AS total_amount,
        shared_before,
        shared_during,
        wheelchair_request,
        tip_amount
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider,
    duration_request,
    trip_distance,
    duration_seconds,
    total_amount,
    shared_before,
    shared_during,
    wheelchair_request,
    tip_amount
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 5
"""

con = duckdb.connect()
df_hvfhv_2019_2023 = con.execute(query_hvfhv_2019_2023).fetchdf()
df_hvfhv_2019_2023.head()

Unnamed: 0,provider,duration_request,trip_distance,duration_seconds,total_amount,shared_before,shared_during,wheelchair_request,tip_amount
0,HV0003,232.0,2.45,579,10.41,Y,N,N,0.0
1,HV0003,921.0,1.71,490,10.809999,N,N,N,2.0
2,HV0005,156.0,5.01,2159,50.07,N,Y,N,0.0
3,HV0005,96.0,0.34,179,11.01,N,Y,N,3.0
4,HV0005,207.0,6.84,1799,31.130001,N,Y,N,4.0
