In [89]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from pandas.api.types import is_numeric_dtype as is_num
from river import tree

chunk_size = 50000

# Storage for training data
all_X_train = []
all_y_train = []
total_X_test = pd.DataFrame()
total_y_test = pd.Series(dtype='float64')

# Columns
all_columns = ['ID', 'vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'ratecodeid', 'store_and_fwd_flag',
       'pulocationid', 'dolocationid', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration']

columns_in_eval = ['ID', 'vendorid', 'tpep_pickup_datetime', 'passenger_count',
       'trip_distance', 'ratecodeid', 'store_and_fwd_flag', 'pulocationid',
       'dolocationid', 'payment_type', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
       'congestion_surcharge', 'airport_fee']

columns_not_in_eval = list(set(all_columns) - set(columns_in_eval))

unneeded_columns = [ 
        'airport_fee', 'payment_type', 'congestion_surcharge',
        'passenger_count', 'vendorid', 'improvement_surcharge', 'tolls_amount',
        'extra', 'tip_amount', 'ratecodeid', 'store_and_fwd_flag',
    ]

needed_columns = list(set(all_columns) - set(unneeded_columns))

# Read data in chunks
chunks = pd.read_csv("training_dataset.csv", chunksize=chunk_size, usecols=needed_columns)

# Initialize models
models = {
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, warm_start=True),
    "SGDRegressor": SGDRegressor(alpha=0.0001, eta0=0.0001, learning_rate="adaptive", warm_start=True),
    "HoeffdingTreeRegressor": tree.HoeffdingTreeRegressor(),
}

In [103]:
counter = 0

for df in chunks:
    if counter == 1:
        break
    print(f"Processing chunk {counter}...")
    counter += 1
    
    # Drop ID column
    df.drop(columns=['ID'], inplace=True)
    
    # Handle missing values
    df.dropna(inplace=True)

    # Convert datetime columns
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
    df.drop(columns=['tpep_pickup_datetime'], inplace=True)
    df.drop(columns=['tpep_dropoff_datetime'], inplace=True)

    
    for col in df.columns:
        assert is_num(df[col]), f"The '{col}' column contained categorical values"
    
    
    # # Outlier filtering
    # df = df[(df['duration'] < 2880) & (df['duration'] > 30)]
    # df = df[(df['trip_distance'] < 300) & (df['trip_distance'] > 0.25)]
    # df = df[(df['fare_amount'] < 300) & (df['fare_amount'] > 0)]
    # # df = df.applymap(lambda x: x if x > 0 else None).dropna()
    df = df[df.ge(0.01).all(1)]


    for col in df.columns:
        assert (df[col] > 0).all(), f"The '{col}' column contained negative values"


    # Split features and target
    X_data = df.drop(columns = 'duration')
    y_data = df['duration']


    # Train-tt
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

    # Standardize features
    #X_scaler = RobustScaler(quantile_range=(1.0, 99.0))
    X_scaler = StandardScaler()
    X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(X_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    #y_Scaler = RobustScaler(quantile_range=(1.0, 99.0))
    y_Scaler = StandardScaler()
    y_train = y_Scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
    y_test = pd.Series(y_Scaler.transform(y_test.values.reshape(-1, 1)).flatten(), index=X_test.index)

    # Accumulate test data
    total_X_test = pd.concat([total_X_test, X_test])
    total_y_test = pd.concat([total_y_test, y_test])
    
    # Train the model chunk by chunk    
    for model_name in models.keys():
            if model_name == "RandomForestRegressor":
                if counter == 0:
                    models[model_name].fit(X_train, y_train)  # First chunk: normal fit
                else:
                    models[model_name].n_estimators += 10  # Increase trees
                    models[model_name].fit(X_train, y_train)  # Fit on new chunk
            elif model_name == "SGDRegressor":
                models[model_name].partial_fit(X_train, y_train)
            elif model_name == "HoeffdingTreeRegressor":
               for x, y in zip(X_train.to_dict(orient="records"), y_train):
                    models[model_name].learn_one(x, y)
                    



Processing chunk 0...


In [104]:
# Evaluate models
for model_name in models.keys():
    if model_name == "SGDRegressor":
        # Predict & inverse transform
        y_pred = models["SGDRegressor"].predict(total_X_test)
        y_pred = y_Scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

        # Compute MSE & R2 score
        mse = mean_squared_error(y_true=total_y_test, y_pred=y_pred)
        score = r2_score(y_true=total_y_test, y_pred=y_pred)

    elif model_name == "HoeffdingTreeRegressor":
        # Predict using predict_one()
        y_pred = [models["HoeffdingTreeRegressor"].predict_one(x) for x in total_X_test.to_dict(orient="records")]

        # Compute MSE & R2 score
        mse = mean_squared_error(y_true=total_y_test, y_pred=y_pred)
        score = r2_score(y_true=total_y_test, y_pred=y_pred)

    else:  # RandomForestRegressor or other batch models
        
        scaler = StandardScaler()
        total_X_test = pd.DataFrame(X_scaler.transform(total_X_test), columns=total_X_test.columns, index=total_X_test.index)
        y_pred = models[model_name].predict(total_X_test)

        # Compute MSE & R2 score
        mse = mean_squared_error(y_true=total_y_test, y_pred=y_pred)
        score = r2_score(y_true=total_y_test, y_pred=y_pred)
        

    print(f"MSE {model_name}: {mse:.4f}")
    print(f"Score {model_name}: {score:.4f}")



MSE RandomForestRegressor: 1.0389
Score RandomForestRegressor: -0.0122
MSE SGDRegressor: 364020596224376.8125
Score SGDRegressor: -354648821548328.5625
MSE HoeffdingTreeRegressor: 153096.6067
Score HoeffdingTreeRegressor: -149154.1074


In [105]:
# Load the CSV file
df_eval = pd.read_csv("eval.csv")

# Store IDs for final output
eval_ids = df_eval["ID"]

# Drop ID column
df_eval.drop(columns=["ID"], inplace=True)

# Convert datetime column
df_eval['tpep_pickup_datetime'] = pd.to_datetime(df_eval['tpep_pickup_datetime'])
df_eval['tpep_pickup_hour'] = df_eval['tpep_pickup_datetime'].dt.hour

cols_to_drop = unneeded_columns + ['tpep_pickup_datetime']
# Drop only existing columns
df_eval.drop(columns=[col for col in cols_to_drop if col in df_eval.columns], inplace=True)

# Standardize features
#X_scaler = RobustScaler(quantile_range=(1.0, 99.0))
eval_scaler = StandardScaler()
df_eval = pd.DataFrame(eval_scaler.fit_transform(df_eval), columns=df_eval.columns, index=df_eval.index)

# Make predictions
y_pred_eval = models["RandomForestRegressor"].predict(df_eval)

# Save output
df_out = pd.DataFrame({"ID": eval_ids, "duration": y_pred_eval}).set_index("ID")
df_out.to_csv("submission.csv")

