# Tipping Prediction on High Value for Hire Vehicle in New York City
Data that will processed here is the result from data manipulation as shown in the previous step. Categorical data will be transformed to one hot encoding manually in the DuckDB query. It can be done in the pipeline, but it takes a lot of time to run. 

In [1]:
import duckdb

# Check the sample data
Do the one hot encoding manually through four categorical columns.

In [2]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request,
        CAST( CASE tips
            WHEN  0.0 THEN 1
            ELSE 0
        END AS INTEGER) AS tip_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        CAST(CASE provider WHEN 'HV0002' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0002,
        CAST(CASE provider WHEN 'HV0003' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0003,
        CAST(CASE provider WHEN 'HV0004' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0004,
        CAST(CASE provider WHEN 'HV0005' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0005,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        CAST(EPOCH(pick_up_time - request_time) AS INTEGER) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare + tip_amount AS total_amount,
        shared_before,
        CAST(CASE shared_before WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_yes,
        CAST(CASE shared_before WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_no,
        shared_during,
        CAST(CASE shared_during WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_yes,
        CAST(CASE shared_during WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_no,
        wheelchair_request,
        CAST(CASE wheelchair_request WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_yes,
        CAST(CASE wheelchair_request WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_no,
        tip_category
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider,
    provider_HV0002,
    provider_HV0003,
    provider_HV0004,
    provider_HV0005,
    duration_request,
    trip_distance,
    duration_seconds,
    total_amount,
    shared_before,
    shared_before_yes,
    shared_before_no,
    shared_during,
    shared_during_yes,
    shared_during_no,
    wheelchair_request,
    wheelchair_request_yes,
    wheelchair_request_no,
    tip_category
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 10
"""
con = duckdb.connect()
df_hvfhv_2019_2023 = con.execute(query_hvfhv_2019_2023).fetchdf()
df_hvfhv_2019_2023.head(10)

#USING SAMPLE 100000000 ROWS

Unnamed: 0,provider,provider_HV0002,provider_HV0003,provider_HV0004,provider_HV0005,duration_request,trip_distance,duration_seconds,total_amount,shared_before,shared_before_yes,shared_before_no,shared_during,shared_during_yes,shared_during_no,wheelchair_request,wheelchair_request_yes,wheelchair_request_no,tip_category
0,HV0003,0,1,0,0,232,2.45,579,10.41,Y,1,0,N,0,1,N,0,1,1
1,HV0003,0,1,0,0,921,1.71,490,10.809999,N,0,1,N,0,1,N,0,1,0
2,HV0005,0,0,0,1,156,5.01,2159,50.07,N,0,1,Y,1,0,N,0,1,1
3,HV0005,0,0,0,1,96,0.34,179,11.01,N,0,1,Y,1,0,N,0,1,0
4,HV0005,0,0,0,1,207,6.84,1799,31.130001,N,0,1,Y,1,0,N,0,1,0
5,HV0005,0,0,0,1,234,1.11,359,6.4,Y,1,0,Y,1,0,N,0,1,1
6,HV0005,0,0,0,1,289,4.53,1799,18.25,N,0,1,Y,1,0,N,0,1,1
7,HV0005,0,0,0,1,122,11.24,1739,33.959999,N,0,1,Y,1,0,N,0,1,1
8,HV0003,0,1,0,0,613,1.59,574,7.78,Y,1,0,N,0,1,N,0,1,1
9,HV0003,0,1,0,0,196,1.9,474,7.86,Y,1,0,N,0,1,N,0,1,1


In [3]:
df_hvfhv_2019_2023 .info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   provider                10 non-null     object 
 1   provider_HV0002         10 non-null     int32  
 2   provider_HV0003         10 non-null     int32  
 3   provider_HV0004         10 non-null     int32  
 4   provider_HV0005         10 non-null     int32  
 5   duration_request        10 non-null     int32  
 6   trip_distance           10 non-null     float32
 7   duration_seconds        10 non-null     int32  
 8   total_amount            10 non-null     float32
 9   shared_before           10 non-null     object 
 10  shared_before_yes       10 non-null     int32  
 11  shared_before_no        10 non-null     int32  
 12  shared_during           10 non-null     object 
 13  shared_during_yes       10 non-null     int32  
 14  shared_during_no        10 non-null     int32

# Select the required columns 
Feature tip_category will be the target and 14 other columns as predictor.

In [4]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request,
        CAST( CASE tips
            WHEN  0.0 THEN 1
            ELSE 0
        END AS INTEGER) AS tip_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        CAST(CASE provider WHEN 'HV0002' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0002,
        CAST(CASE provider WHEN 'HV0003' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0003,
        CAST(CASE provider WHEN 'HV0004' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0004,
        CAST(CASE provider WHEN 'HV0005' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0005,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        CAST(EPOCH(pick_up_time - request_time) AS INTEGER) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare + tip_amount AS total_amount,
        shared_before,
        CAST(CASE shared_before WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_yes,
        CAST(CASE shared_before WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_no,
        shared_during,
        CAST(CASE shared_during WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_yes,
        CAST(CASE shared_during WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_no,
        wheelchair_request,
        CAST(CASE wheelchair_request WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_yes,
        CAST(CASE wheelchair_request WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_no,
        tip_category
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider_HV0002,
    provider_HV0003,
    provider_HV0004,
    provider_HV0005,
    duration_request,
    trip_distance,
    duration_seconds,
    total_amount,
    shared_before_yes,
    shared_before_no,
    shared_during_yes,
    shared_during_no,
    wheelchair_request_yes,
    wheelchair_request_no,
    tip_category
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 5
"""
con = duckdb.connect()
df_hvfhv_2019_2023 = con.execute(query_hvfhv_2019_2023).fetchdf()
df_hvfhv_2019_2023.head()

Unnamed: 0,provider_HV0002,provider_HV0003,provider_HV0004,provider_HV0005,duration_request,trip_distance,duration_seconds,total_amount,shared_before_yes,shared_before_no,shared_during_yes,shared_during_no,wheelchair_request_yes,wheelchair_request_no,tip_category
0,0,1,0,0,232,2.45,579,10.41,1,0,0,1,0,1,1
1,0,1,0,0,921,1.71,490,10.809999,0,1,0,1,0,1,0
2,0,0,0,1,156,5.01,2159,50.07,0,1,1,0,0,1,1
3,0,0,0,1,96,0.34,179,11.01,0,1,1,0,0,1,0
4,0,0,0,1,207,6.84,1799,31.130001,0,1,1,0,0,1,0


In [5]:
df_hvfhv_2019_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   provider_HV0002         5 non-null      int32  
 1   provider_HV0003         5 non-null      int32  
 2   provider_HV0004         5 non-null      int32  
 3   provider_HV0005         5 non-null      int32  
 4   duration_request        5 non-null      int32  
 5   trip_distance           5 non-null      float32
 6   duration_seconds        5 non-null      int32  
 7   total_amount            5 non-null      float32
 8   shared_before_yes       5 non-null      int32  
 9   shared_before_no        5 non-null      int32  
 10  shared_during_yes       5 non-null      int32  
 11  shared_during_no        5 non-null      int32  
 12  wheelchair_request_yes  5 non-null      int32  
 13  wheelchair_request_no   5 non-null      int32  
 14  tip_category            5 non-null      int32 

# Batching process

In [2]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request,
        CAST( CASE tips
            WHEN  0.0 THEN 1
            ELSE 0
        END AS INTEGER) AS tip_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        CAST(CASE provider WHEN 'HV0002' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0002,
        CAST(CASE provider WHEN 'HV0003' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0003,
        CAST(CASE provider WHEN 'HV0004' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0004,
        CAST(CASE provider WHEN 'HV0005' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0005,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        CAST(EPOCH(pick_up_time - request_time) AS INTEGER) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare + tip_amount AS total_amount,
        shared_before,
        CAST(CASE shared_before WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_yes,
        CAST(CASE shared_before WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_no,
        shared_during,
        CAST(CASE shared_during WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_yes,
        CAST(CASE shared_during WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_no,
        wheelchair_request,
        CAST(CASE wheelchair_request WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_yes,
        CAST(CASE wheelchair_request WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_no,
        tip_category
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider_HV0002,
    provider_HV0003,
    provider_HV0004,
    provider_HV0005,
    duration_request,
    trip_distance,
    duration_seconds,
    total_amount,
    shared_before_yes,
    shared_before_no,
    shared_during_yes,
    shared_during_no,
    wheelchair_request_yes,
    wheelchair_request_no,
    tip_category
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 60000000
"""
#con = duckdb.connect()
#df_hvfhv_2019_2023 = con.execute(query_hvfhv_2019_2023).fetchdf()
#df_hvfhv_2019_2023.head()
#LIMIT 1000000

In [3]:
import pandas as pd
from river import preprocessing, tree, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_hvfhv_2019_2023)

# ----------------------------
# 2. Build pipeline
# ----------------------------
pipeline = (tree.HoeffdingTreeClassifier())

# Compact classification report table
all_metrics = metrics.ClassificationReport()

downsample_ratio = 3  # keep 1:3 balance

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    # Balance the batch
    minority = chunk[chunk["tip_category"] == 1]
    majority = chunk[chunk["tip_category"] == 0]

    if len(minority) > 0:
        majority_down = majority.sample(
            n=min(len(majority), downsample_ratio * len(minority)),
            random_state=42
        )
        batch_balanced = pd.concat([minority, majority_down], ignore_index=True)
    else:
        batch_balanced = chunk

    # Convert to dicts for speed
    records = batch_balanced.to_dict(orient="records")

    for r in records:
        x = {k: v for k, v in r.items() if k != "tip_category"}
        y = r["tip_category"]

        # Predict
        y_pred = pipeline.predict_one(x)

        # Update metrics
        if y_pred is not None:
            all_metrics.update(y, y_pred)

        # Train
        pipeline.learn_one(x, y)

    # Print compact classification report after each chunk
    print(all_metrics)

    # Clean up batches
    # del batch_balanced
    #del chunk
    #gc.collect()


           Precision   Recall    F1       Support   
                                                    
       0     100.00%    93.65%   96.72%    8234486  
       1      99.00%   100.00%   99.50%   51765513  
                                                    
   Macro      99.50%    96.83%   98.11%             
   Micro      99.13%    99.13%   99.13%             
Weighted      99.14%    99.13%   99.12%             

                  99.13% accuracy                   


##### Result: based on F1 score, the model is not discriminate the minority. Here we can not talk about performance since we need to upgrade the number of data feed into the batching. I think the proper batching strategy, memory assigntment and pararelization using GPU are required here.