# New York Taxi Tipping Prediction with Regression
The data that will be used are the same as the data for classification but the target is not categorized. It means the dependent variabel will be in float. 

### 1. Regression on tipping in Yellow and Green Taxi

In [1]:
import duckdb 

In [10]:
query_yellow_green = """
WITH CTE_yellow_2009 AS (
    SELECT 
        CAST(Trip_Pickup_DateTime AS TIMESTAMP) AS pick_up_time,
        CAST(Trip_Dropoff_DateTime AS TIMESTAMP) AS drop_off_time,
        CAST(Passenger_Count AS INTEGER) AS passenger_count,
        CAST(Trip_Distance AS INTEGER) AS trip_distance,
        Payment_Type AS payment_type,
        CAST(Total_Amt AS FLOAT)   AS total_amount,
        Tip_Amt AS tip_amount,
        CAST(Tip_Amt AS FLOAT)   AS tip_amount,
        CAST( CASE Payment_Type
            WHEN  'Credit' THEN 1
            WHEN  'CREDIT' THEN 1
            ELSE 2
        END AS INTEGER) AS payment_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/2009/yellow_taxi_2009/yellow_tripdata_*.parquet'
    WHERE Trip_Pickup_DateTime IS NOT NULL
        AND Trip_Dropoff_DateTime IS NOT NULL
        AND Passenger_Count >= 0
        AND Trip_Distance >= 0 
        AND Trip_Distance <= 50
        AND Payment_Type IS NOT NULL
        AND Total_Amt >= 0
        AND Tip_Amt >= 0
        AND Trip_Pickup_DateTime >= '2009-01-01' 
        AND Trip_Pickup_DateTime < '2010-01-01'
), CTE_duration_yellow_2009 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2009
    WHERE payment_category = 1
), CTE_yellow_2010 AS (
    SELECT 
        CAST(pickup_datetime AS TIMESTAMP) AS pick_up_time,
        CAST(dropoff_datetime AS TIMESTAMP) AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        payment_type,
        tip_amount,
        CAST( CASE payment_type
            WHEN  'Cre' THEN 1
            WHEN  'CRE' THEN 1
            ELSE 2
        END AS INTEGER) AS payment_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/2010/yellow_taxi_2010/yellow_tripdata_*.parquet'
    WHERE pickup_datetime IS NOT NULL
        AND dropoff_datetime  IS NOT NULL
        AND passenger_count >= 0
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0 
        AND pickup_datetime >= '2010-01-01' 
        AND pickup_datetime < '2011-01-01'
), CTE_duration_yellow_2010 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2010
    WHERE payment_category = 1
), CTE_yellow_2011_2023 AS (
    SELECT 
        tpep_pickup_datetime AS pick_up_time,
        tpep_dropoff_datetime AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        CAST(payment_type AS INTEGER) AS payment_category,
        tip_amount
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/yellow_taxi/yellow_tripdata_*.parquet'
    WHERE tpep_pickup_datetime IS NOT NULL
        AND tpep_dropoff_datetime  IS NOT NULL
        AND passenger_count IS NOT NULL
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0
        AND tpep_pickup_datetime >= '2011-01-01' 
        AND tpep_pickup_datetime < '2023-10-01'
), CTE_duration_yellow_2011_2023 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2011_2023
    WHERE payment_category = 1
), CTE_green_2011_2023 AS (
    SELECT 
        lpep_pickup_datetime AS pick_up_time,
        lpep_dropoff_datetime AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        CAST(payment_type AS INTEGER) AS payment_category,
        tip_amount
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/green_taxi/green_tripdata_*.parquet'
    WHERE lpep_pickup_datetime IS NOT NULL
        AND lpep_dropoff_datetime  IS NOT NULL
        AND passenger_count IS NOT NULL
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0
        AND lpep_pickup_datetime >= '2009-01-01' 
        AND lpep_pickup_datetime < '2023-10-01'
), CTE_duration_green_2011_2023 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        CAST(EPOCH(drop_off_time - pick_up_time) AS FLOAT) AS duration_seconds
    FROM CTE_green_2011_2023
    WHERE payment_category = 1
), CTE_union_all AS (
    SELECT * FROM CTE_duration_yellow_2009
    UNION ALL
    SELECT * FROM CTE_duration_yellow_2010
    UNION ALL
    SELECT * FROM CTE_duration_yellow_2011_2023
    UNION ALL
    SELECT * FROM CTE_duration_green_2011_2023
)

SELECT 
    passenger_count,
    trip_distance,
    total_amount,
    duration_seconds,
    tip_amount
FROM CTE_union_all
WHERE (duration_days = 0) AND (duration_seconds > 0)
LIMIT 5
"""

con = duckdb.connect()
df_yellow_green = con.execute(query_yellow_green).fetchdf()
df_yellow_green.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,duration_seconds,tip_amount
0,3,5.0,14.6,420.0,2.0
1,5,10.0,28.440001,840.0,4.74
2,1,5.0,18.450001,1262.0,3.05
3,1,0.0,6.7,585.0,1.0
4,1,2.0,10.0,679.0,1.3


#### The query for data manipulation:

In [22]:
query_yellow_green = """
WITH CTE_yellow_2009 AS (
    SELECT 
        CAST(Trip_Pickup_DateTime AS TIMESTAMP) AS pick_up_time,
        CAST(Trip_Dropoff_DateTime AS TIMESTAMP) AS drop_off_time,
        CAST(Passenger_Count AS INTEGER) AS passenger_count,
        CAST(Trip_Distance AS INTEGER) AS trip_distance,
        Payment_Type AS payment_type,
        CAST(Total_Amt AS FLOAT)   AS total_amount,
        Tip_Amt AS tip_amount,
        CAST(Tip_Amt AS FLOAT)   AS tip_amount,
        CAST( CASE Payment_Type
            WHEN  'Credit' THEN 1
            WHEN  'CREDIT' THEN 1
            ELSE 2
        END AS INTEGER) AS payment_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/2009/yellow_taxi_2009/yellow_tripdata_*.parquet'
    WHERE Trip_Pickup_DateTime IS NOT NULL
        AND Trip_Dropoff_DateTime IS NOT NULL
        AND Passenger_Count >= 0
        AND Trip_Distance >= 0 
        AND Trip_Distance <= 50
        AND Payment_Type IS NOT NULL
        AND Total_Amt >= 0
        AND Tip_Amt >= 0
        AND Trip_Pickup_DateTime >= '2009-01-01' 
        AND Trip_Pickup_DateTime < '2010-01-01'
), CTE_duration_yellow_2009 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2009
    WHERE payment_category = 1
), CTE_yellow_2010 AS (
    SELECT 
        CAST(pickup_datetime AS TIMESTAMP) AS pick_up_time,
        CAST(dropoff_datetime AS TIMESTAMP) AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        payment_type,
        tip_amount,
        CAST( CASE payment_type
            WHEN  'Cre' THEN 1
            WHEN  'CRE' THEN 1
            ELSE 2
        END AS INTEGER) AS payment_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/2010/yellow_taxi_2010/yellow_tripdata_*.parquet'
    WHERE pickup_datetime IS NOT NULL
        AND dropoff_datetime  IS NOT NULL
        AND passenger_count >= 0
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0 
        AND pickup_datetime >= '2010-01-01' 
        AND pickup_datetime < '2011-01-01'
), CTE_duration_yellow_2010 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2010
    WHERE payment_category = 1
), CTE_yellow_2011_2023 AS (
    SELECT 
        tpep_pickup_datetime AS pick_up_time,
        tpep_dropoff_datetime AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        CAST(payment_type AS INTEGER) AS payment_category,
        tip_amount
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/yellow_taxi/yellow_tripdata_*.parquet'
    WHERE tpep_pickup_datetime IS NOT NULL
        AND tpep_dropoff_datetime  IS NOT NULL
        AND passenger_count IS NOT NULL
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0
        AND tpep_pickup_datetime >= '2011-01-01' 
        AND tpep_pickup_datetime < '2023-10-01'
), CTE_duration_yellow_2011_2023 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        EPOCH(drop_off_time - pick_up_time) AS duration_seconds
    FROM CTE_yellow_2011_2023
    WHERE payment_category = 1
), CTE_green_2011_2023 AS (
    SELECT 
        lpep_pickup_datetime AS pick_up_time,
        lpep_dropoff_datetime AS drop_off_time,
        CAST(passenger_count AS INTEGER) AS passenger_count,
        CAST(total_amount AS FLOAT)   AS total_amount,
        CAST(trip_distance AS FLOAT)   AS trip_distance,
        CAST(payment_type AS INTEGER) AS payment_category,
        tip_amount
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/green_taxi/green_tripdata_*.parquet'
    WHERE lpep_pickup_datetime IS NOT NULL
        AND lpep_dropoff_datetime  IS NOT NULL
        AND passenger_count IS NOT NULL
        AND trip_distance >= 0
        AND trip_distance <= 50
        AND payment_type IS NOT NULL
        AND total_amount >= 0
        AND tip_amount >= 0
        AND lpep_pickup_datetime >= '2009-01-01' 
        AND lpep_pickup_datetime < '2023-10-01'
), CTE_duration_green_2011_2023 AS (
    SELECT
        pick_up_time,
        drop_off_time,
        passenger_count,
        trip_distance,
        total_amount,
        payment_category,
        tip_amount,
        DATE_DIFF('day', pick_up_time, drop_off_time) AS duration_days,
        CAST(EPOCH(drop_off_time - pick_up_time) AS FLOAT) AS duration_seconds
    FROM CTE_green_2011_2023
    WHERE payment_category = 1
), CTE_union_all AS (
    SELECT * FROM CTE_duration_yellow_2009
    UNION ALL
    SELECT * FROM CTE_duration_yellow_2010
    UNION ALL
    SELECT * FROM CTE_duration_yellow_2011_2023
    UNION ALL
    SELECT * FROM CTE_duration_green_2011_2023
)

SELECT 
    passenger_count,
    trip_distance,
    total_amount,
    LOG(duration_seconds + 1) AS log_duration,
    tip_amount
FROM CTE_union_all
WHERE (duration_days = 0) and (duration_seconds > 0)
LIMIT 1000000
"""

### 1.1 Linear Regression

In [25]:
import pandas as pd
import numpy as np
import duckdb
from river import preprocessing, linear_model, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_yellow_green)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()

# Track regression metrics
all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    # Convert to dicts for speed
    records = chunk.to_dict(orient="records")

    #chunk["log_duration"] = np.log1p(chunk["duration_seconds"])
    #chunk.drop(columns=["duration_seconds"], inplace=True)

    #print(chunk.describe().T[["min","max"]].head(15))
    #print(chunk[["tip_amount"]].head(20))
    #print(chunk[["tip_amount"]].describe())

    for r in records:
        # Suppose your target column is "tip_amount"
        x = {k: v for k, v in r.items() if k != "tip_amount"}
        y = r["tip_amount"]

        # Predict
        y_pred = pipeline.predict_one(x)

        # Update metrics
        if y_pred is not None:
            all_metrics.update(y, y_pred)

        # Train
        pipeline.learn_one(x, y)

    # Print metrics after each chunk
    print(all_metrics)


MAE: 0.735155
MSE: 2.641463
R2: 0.309785


##### Result: the R-squared is 0.309785. I think we may increase the result of r-squared to have better model.

### 1.2 PAR-I Regressor

In [27]:
import pandas as pd
import duckdb
import numpy as np
from river import preprocessing, linear_model, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_yellow_green)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = (
    preprocessing.StandardScaler() |
    linear_model.PARegressor(C=0.01, mode=1)  
    # mode=1 -> PA-I, mode=2 -> PA-II (try both!)
)

all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

target_col = "tip_amount"
drop_cols = []

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    records = chunk.to_dict(orient="records")

    for r in records:
        x = {k: v for k, v in r.items() if k not in drop_cols + [target_col]}
        y = r[target_col]

        # Transform duration before feeding
        if "duration_seconds" in x and x["duration_seconds"] is not None:
            x["log_duration"] = np.log1p(x["duration_seconds"])
            del x["duration_seconds"]

        y_pred = pipeline.predict_one(x)

        if y_pred is not None:
            all_metrics.update(y, y_pred)

        pipeline.learn_one(x, y)

    print(all_metrics)


MAE: 0.596283
MSE: 1.637795
R2: 0.572044


##### Result: this is the best R-squared value amongst all models

### 1.3 PAR-II Regressor

In [28]:
import pandas as pd
import duckdb
import numpy as np
from river import preprocessing, linear_model, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_yellow_green)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = (
    preprocessing.StandardScaler() |
    linear_model.PARegressor(C=0.01, mode=2)  
    # mode=1 -> PA-I, mode=2 -> PA-II (try both!)
)

all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

target_col = "tip_amount"
drop_cols = []

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    records = chunk.to_dict(orient="records")

    for r in records:
        x = {k: v for k, v in r.items() if k not in drop_cols + [target_col]}
        y = r[target_col]

        # Transform duration before feeding
        if "duration_seconds" in x and x["duration_seconds"] is not None:
            x["log_duration"] = np.log1p(x["duration_seconds"])
            del x["duration_seconds"]

        y_pred = pipeline.predict_one(x)

        if y_pred is not None:
            all_metrics.update(y, y_pred)

        pipeline.learn_one(x, y)

    print(all_metrics)


MAE: 0.867999
MSE: 3.236576
R2: 0.154282


##### Result: the r-squaren is decreasing drammatically.

### 1.3. Linear Regression with Regularization 

In [32]:
import pandas as pd
import duckdb
import numpy as np
from river import preprocessing, linear_model, metrics, optim

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_yellow_green)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = (
    preprocessing.StandardScaler() |
    linear_model.LinearRegression(
        optimizer=optim.SGD(0.01),  # learning rate
        l2=0.0001                   # regularization strength
    )
)

all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

target_col = "tip_amount"
drop_cols = []

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    records = chunk.to_dict(orient="records")

    for r in records:
        x = {k: v for k, v in r.items() if k not in drop_cols + [target_col]}
        y = r[target_col]

        # Transform duration
        if "duration_seconds" in x and x["duration_seconds"] is not None:
            x["log_duration"] = np.log1p(x["duration_seconds"])
            del x["duration_seconds"]

        y_pred = pipeline.predict_one(x)

        if y_pred is not None:
            all_metrics.update(y, y_pred)

        pipeline.learn_one(x, y)

    print(all_metrics)


MAE: 0.73513
MSE: 2.64112
R2: 0.309875


##### Result: the R-squared is 0.30 which is far bellow with PAR-I Regressor.

### 2. Regression on tipping in High Value for Hire Vehicle

### Query for data manipulation:

In [33]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        EPOCH(pick_up_time - request_time) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare  AS total_amount,
        shared_before,
        shared_during,
        wheelchair_request,
        tip_amount
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider,
    duration_request,
    trip_distance,
    duration_seconds,
    total_amount,
    shared_before,
    shared_during,
    wheelchair_request,
    tip_amount
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 5
"""

con = duckdb.connect()
df_hvfhv_2019_2023 = con.execute(query_hvfhv_2019_2023).fetchdf()
df_hvfhv_2019_2023.head()

Unnamed: 0,provider,duration_request,trip_distance,duration_seconds,total_amount,shared_before,shared_during,wheelchair_request,tip_amount
0,HV0003,232.0,2.45,579,10.41,Y,N,N,0.0
1,HV0003,921.0,1.71,490,8.809999,N,N,N,2.0
2,HV0005,156.0,5.01,2159,50.07,N,Y,N,0.0
3,HV0005,96.0,0.34,179,8.01,N,Y,N,3.0
4,HV0005,207.0,6.84,1799,27.130001,N,Y,N,4.0


### Transform the categorical columns with one hot encoding with query:

In [34]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request,
        CAST( CASE tips
            WHEN  0.0 THEN 1
            ELSE 0
        END AS INTEGER) AS tip_category
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        CAST(CASE provider WHEN 'HV0002' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0002,
        CAST(CASE provider WHEN 'HV0003' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0003,
        CAST(CASE provider WHEN 'HV0004' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0004,
        CAST(CASE provider WHEN 'HV0005' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0005,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        CAST(EPOCH(pick_up_time - request_time) AS INTEGER) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare + tip_amount AS total_amount,
        shared_before,
        CAST(CASE shared_before WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_yes,
        CAST(CASE shared_before WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_no,
        shared_during,
        CAST(CASE shared_during WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_yes,
        CAST(CASE shared_during WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_no,
        wheelchair_request,
        CAST(CASE wheelchair_request WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_yes,
        CAST(CASE wheelchair_request WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_no,
        tip_category
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider_HV0002,
    provider_HV0003,
    provider_HV0004,
    provider_HV0005,
    duration_request,
    trip_distance,
    duration_seconds,
    total_amount,
    shared_before_yes,
    shared_before_no,
    shared_during_yes,
    shared_during_no,
    wheelchair_request_yes,
    wheelchair_request_no,
    tip_category
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 5
"""
con = duckdb.connect()
df_hvfhv_2019_2023 = con.execute(query_hvfhv_2019_2023).fetchdf()
df_hvfhv_2019_2023.head()

Unnamed: 0,provider_HV0002,provider_HV0003,provider_HV0004,provider_HV0005,duration_request,trip_distance,duration_seconds,total_amount,shared_before_yes,shared_before_no,shared_during_yes,shared_during_no,wheelchair_request_yes,wheelchair_request_no,tip_category
0,0,1,0,0,232,2.45,579,10.41,1,0,0,1,0,1,1
1,0,1,0,0,921,1.71,490,10.809999,0,1,0,1,0,1,0
2,0,0,0,1,156,5.01,2159,50.07,0,1,1,0,0,1,1
3,0,0,0,1,96,0.34,179,11.01,0,1,1,0,0,1,0
4,0,0,0,1,207,6.84,1799,31.130001,0,1,1,0,0,1,0


### Query for batching:

In [52]:
query_hvfhv_2019_2023 = """
WITH CTE_hvfhv_2019_2023 AS (
    SELECT 
        hvfhs_license_num AS provider,
        request_datetime AS request_time,
        pickup_datetime AS pick_up_time,
        CAST(trip_miles AS FLOAT) AS trip_distance,
        CAST(trip_time AS INTEGER) AS duration_seconds,
        CAST(base_passenger_fare AS FLOAT) AS base_fare,
        CAST(tolls AS FLOAT) AS toll_fare,
        CAST(bcf AS FLOAT) AS bcf_fare,
        CAST(sales_tax AS FLOAT) AS tax_fare,
        CAST(tips AS FLOAT) AS tip_amount,
        shared_request_flag AS shared_before,
        shared_match_flag AS shared_during,
        wav_request_flag AS wheelchair_request
    FROM 'C:/Users/ekadw/Documents/DATA/NY_Taxi/*/high_volume_for_hire_vehicle/fhvhv_tripdata_*.parquet'
    WHERE hvfhs_license_num IS NOT NULL
        AND request_datetime IS NOT NULL
        AND pickup_datetime IS NOT NULL
        AND trip_miles >= 0
        AND trip_miles <= 50
        AND trip_time >= 0
        AND base_passenger_fare >= 0
        AND tolls >= 0
        AND bcf >= 0
        AND sales_tax >= 0
        AND tips >= 0
        AND shared_request_flag IS NOT NULL
        AND shared_match_flag IS NOT NULL
        AND wav_request_flag IS NOT NULL
        AND request_datetime >= '2019-02-01' 
        AND request_datetime < '2023-10-01'
), CTE_duration_hvfhv_2019_2023 AS (
    SELECT
        provider,
        CAST(CASE provider WHEN 'HV0002' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0002,
        CAST(CASE provider WHEN 'HV0003' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0003,
        CAST(CASE provider WHEN 'HV0004' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0004,
        CAST(CASE provider WHEN 'HV0005' THEN 1 ELSE 0 END AS INTEGER) AS provider_HV0005,
        DATE_DIFF('day', request_time, pick_up_time) AS duration_days,
        CAST(EPOCH(pick_up_time - request_time) AS INTEGER) AS duration_request,
        trip_distance,
        duration_seconds,
        base_fare + toll_fare + bcf_fare + tax_fare + tip_amount AS total_amount,
        shared_before,
        CAST(CASE shared_before WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_yes,
        CAST(CASE shared_before WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_before_no,
        shared_during,
        CAST(CASE shared_during WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_yes,
        CAST(CASE shared_during WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS shared_during_no,
        wheelchair_request,
        CAST(CASE wheelchair_request WHEN 'Y' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_yes,
        CAST(CASE wheelchair_request WHEN 'N' THEN 1 ELSE 0 END AS INTEGER) AS wheelchair_request_no,
        tip_amount
    FROM CTE_hvfhv_2019_2023
)

SELECT 
    provider_HV0002,
    provider_HV0003,
    provider_HV0004,
    provider_HV0005,
    LOG(duration_request + 1) AS log_duration_request,
    LOG(trip_distance + 1) AS log_trip_distance,
    LOG(duration_seconds + 1) AS log_duration_seconds,
    LOG(total_amount + 1) AS log_total_amount,
    shared_before_yes,
    shared_before_no,
    shared_during_yes,
    shared_during_no,
    wheelchair_request_yes,
    wheelchair_request_no,
    tip_amount
FROM CTE_duration_hvfhv_2019_2023
WHERE duration_days = 0
LIMIT 1000000
"""

In [53]:
import pandas as pd
import numpy as np
import duckdb
from river import preprocessing, linear_model, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_hvfhv_2019_2023)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = linear_model.LinearRegression()

# Track regression metrics
all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    # Convert to dicts for speed
    records = chunk.to_dict(orient="records")

    #chunk["log_duration"] = np.log1p(chunk["duration_seconds"])
    #chunk.drop(columns=["duration_seconds"], inplace=True)

    print(chunk.describe().T[["min","max"]].head(15))
    #print(chunk[["tip_amount"]].head(20))
    #print(chunk[["tip_amount"]].describe())

    for r in records:
        # Suppose your target column is "tip_amount"
        x = {k: v for k, v in r.items() if k != "tip_amount"}
        y = r["tip_amount"]

        # Predict
        y_pred = pipeline.predict_one(x)

        # Update metrics
        if y_pred is not None:
            all_metrics.update(y, y_pred)

        # Train
        pipeline.learn_one(x, y)

    # Print metrics after each chunk
    print(all_metrics)


                             min         max
provider_HV0002         0.000000    1.000000
provider_HV0003         0.000000    1.000000
provider_HV0004         0.000000    1.000000
provider_HV0005         0.000000    1.000000
log_duration_request    0.845098    3.944236
log_trip_distance       0.000000    1.707570
log_duration_seconds    0.000000    4.534787
log_total_amount        0.000000    2.714975
shared_before_yes       0.000000    1.000000
shared_before_no        0.000000    1.000000
shared_during_yes       0.000000    1.000000
shared_during_no        0.000000    1.000000
wheelchair_request_yes  0.000000    1.000000
wheelchair_request_no   0.000000    1.000000
tip_amount              0.000000  100.000000
MAE: 1.011807
MSE: 3.346576
R2: -0.177547


##### Result: the R-squared result is negative. There is something must be done here.

### 2.1 PAR-I Regression

In [55]:
import pandas as pd
import duckdb
import numpy as np
from river import preprocessing, linear_model, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_hvfhv_2019_2023)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = (
    linear_model.PARegressor(C=0.01, mode=1)  
    # mode=1 -> PA-I, mode=2 -> PA-II (try both!)
)

all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

target_col = "tip_amount"
drop_cols = []

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    records = chunk.to_dict(orient="records")

    for r in records:
        x = {k: v for k, v in r.items() if k not in drop_cols + [target_col]}
        y = r[target_col]

        # Transform duration before feeding
        #if "duration_seconds" in x and x["duration_seconds"] is not None:
        #    x["log_duration"] = np.log1p(x["duration_seconds"])
        #    del x["duration_seconds"]

        y_pred = pipeline.predict_one(x)

        if y_pred is not None:
            all_metrics.update(y, y_pred)

        pipeline.learn_one(x, y)

    print(all_metrics)


MAE: 0.581095
MSE: 3.015136
R2: -0.060924


##### Result: R-squared is still negative.

In [56]:
import pandas as pd
import duckdb
import numpy as np
from river import preprocessing, linear_model, metrics

# ----------------------------
# 1. Connect to DuckDB and query
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_hvfhv_2019_2023)

# ----------------------------
# 2. Build regression pipeline
# ----------------------------
pipeline = (
    linear_model.PARegressor(C=0.01, mode=2)  
    # mode=1 -> PA-I, mode=2 -> PA-II (try both!)
)

all_metrics = metrics.MAE() + metrics.MSE() + metrics.R2()

target_col = "tip_amount"
drop_cols = []

# ----------------------------
# 3. Streaming loop
# ----------------------------
while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=50_000)
    if chunk is None or len(chunk) == 0:
        break

    records = chunk.to_dict(orient="records")

    for r in records:
        x = {k: v for k, v in r.items() if k not in drop_cols + [target_col]}
        y = r[target_col]

        # Transform duration before feeding
        if "duration_seconds" in x and x["duration_seconds"] is not None:
            x["log_duration"] = np.log1p(x["duration_seconds"])
            del x["duration_seconds"]

        y_pred = pipeline.predict_one(x)

        if y_pred is not None:
            all_metrics.update(y, y_pred)

        pipeline.learn_one(x, y)

    print(all_metrics)


MAE: 1.430486
MSE: 4.679029
R2: -0.646392


##### Result: the R-squared values is worse here.

In [57]:
import duckdb
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ----------------------------
# 1. Connect to DuckDB
# ----------------------------
con = duckdb.connect("my_data.duckdb")
res = con.execute(query_hvfhv_2019_2023)

# ----------------------------
# 2. Define training params
# ----------------------------
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",   # fast, scalable
}

num_boost_round = 500   # total boosting iterations
batch_size = 200_000    # rows per batch

# ----------------------------
# 3. Streaming training loop
# ----------------------------
bst = None
dtest = None

while True:
    chunk = res.fetch_df_chunk(vectors_per_chunk=batch_size)
    if chunk is None or len(chunk) == 0:
        break

    # Split features/target
    y = chunk["tip_amount"].values
    X = chunk.drop(columns=["tip_amount"])

    dtrain = xgb.DMatrix(X, label=y)

    if bst is None:
        # First batch → initialize model
        bst = xgb.train(params, dtrain, num_boost_round=50)
    else:
        # Subsequent batches → update model
        bst = xgb.train(params, dtrain, num_boost_round=50, xgb_model=bst)

# ----------------------------
# 4. Evaluate on a held-out test set
# ----------------------------
# Load test set (smaller chunk or separate query)
df_test = con.execute(query_hvfhv_2019_2023).fetch_df()
y_test = df_test["tip_amount"].values
X_test = df_test.drop(columns=["tip_amount"])
dtest = xgb.DMatrix(X_test)

y_pred = bst.predict(dtest)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))


MAE: 0.7052770853042603
MSE: 2.2249197959899902
R2: 0.21712613105773926


##### Result: try the XGBoost with better value of R-squared. May be the data itself is nonlinear and some approach such as feature selection can be done here to increase the model performance.