In [27]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from pandas.api.types import is_numeric_dtype as is_num
from river import tree

chunk_size = 50000
all_X_train = []
all_y_train = []
total_X_test = pd.DataFrame()
total_y_test = pd.Series(dtype='float64')

all_columns = ['ID', 'vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'ratecodeid', 'store_and_fwd_flag',
       'pulocationid', 'dolocationid', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration']

unneeded_columns = [ 
    'airport_fee', 'payment_type', 'congestion_surcharge', 'passenger_count',
    'vendorid', 'improvement_surcharge', 'tolls_amount', 'extra', 'tip_amount',
    'ratecodeid', 'store_and_fwd_flag'
]

needed_columns = list(set(all_columns) - set(unneeded_columns))
chunks = pd.read_csv("training_dataset.csv", chunksize=chunk_size, usecols=needed_columns)

models = {
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "SGDRegressor": SGDRegressor(alpha=0.0001, eta0=0.0001, learning_rate="adaptive", warm_start=True),
    "HoeffdingTreeRegressor": tree.HoeffdingTreeRegressor()
}

counter = 0
for df in chunks:
    if counter == 5:
        break
    print(f"Processing chunk {counter}...")
    counter += 1
    
    df.drop(columns=['ID'], inplace=True)
    df.dropna(inplace=True)
    
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
    df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], inplace=True)
    
    for col in df.columns:
        assert is_num(df[col]), f"The '{col}' column contained categorical values"
    
    df = df[(df['duration'] < 2880) & (df['duration'] > 30)]
    df = df[(df['trip_distance'] < 300) & (df['trip_distance'] > 0.25)]
    df = df[(df['fare_amount'] < 300) & (df['fare_amount'] > 0)]
    
    X_data = df.drop(columns=['duration'])
    y_data = df['duration']
    
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)
    
    X_scaler = StandardScaler()
    X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(X_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    
    total_X_test = pd.concat([total_X_test, X_test])
    total_y_test = pd.concat([total_y_test, y_test])
    all_X_train.append(X_train)
    all_y_train.append(y_train)
    
    models["SGDRegressor"].partial_fit(X_train, y_train)
    
    for x, y in zip(X_train.to_dict(orient="records"), y_train):
        models["HoeffdingTreeRegressor"].learn_one(x, y)

# Train RandomForestRegressor from scratch on all accumulated training data
X_train_full = pd.concat(all_X_train)
y_train_full = pd.concat(all_y_train)
models["RandomForestRegressor"].fit(X_train_full, y_train_full)

for model_name, model in models.items():
    if model_name == "SGDRegressor":
        y_pred = model.predict(total_X_test)
    elif model_name == "HoeffdingTreeRegressor":
        y_pred = [model.predict_one(x) for x in total_X_test.to_dict(orient="records")]
    else:
        y_pred = model.predict(total_X_test)  # No extra scaling applied here
    
    mse = mean_squared_error(total_y_test, y_pred)
    r2 = r2_score(total_y_test, y_pred)
    print(f"MSE {model_name}: {mse:.4f}")
    print(f"R2 Score {model_name}: {r2:.4f}")
    
    if model_name == "RandomForestRegressor":
        feature_importances = sorted(zip(X_train_full.columns, model.feature_importances_), key=lambda x: x[1], reverse=True)
        print("Feature importances:", feature_importances)


Processing chunk 0...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
MSE RandomForestRegressor: 21623.7175
R2 Score RandomForestRegressor: 0.9348
Feature importances: [('fare_amount', np.float64(0.8625000420410602)), ('trip_distance', np.float64(0.07521496218266374)), ('total_amount', np.float64(0.019707269614345907)), ('tpep_pickup_hour', np.float64(0.017305322586798164)), ('dolocationid', np.float64(0.010960812615031852)), ('pulocationid', np.float64(0.009870290048355239)), ('mta_tax', np.float64(0.004441300911744955))]
MSE SGDRegressor: 89336.5302
R2 Score SGDRegressor: 0.7308
MSE HoeffdingTreeRegressor: 40075.8355
R2 Score HoeffdingTreeRegressor: 0.8792


In [26]:
# Load the CSV file
df_eval = pd.read_csv("eval.csv")

# Store IDs for final output
eval_ids = df_eval["ID"]

# Drop ID column
df_eval.drop(columns=["ID"], inplace=True)

# Convert datetime column
df_eval['tpep_pickup_datetime'] = pd.to_datetime(df_eval['tpep_pickup_datetime'])
df_eval['tpep_pickup_hour'] = df_eval['tpep_pickup_datetime'].dt.hour

cols_to_drop = unneeded_columns + ['tpep_pickup_datetime']
# Drop only existing columns
df_eval.drop(columns=[col for col in cols_to_drop if col in df_eval.columns], inplace=True)

# Standardize features
#X_scaler = RobustScaler(quantile_range=(1.0, 99.0))
eval_scaler = StandardScaler()
df_eval = pd.DataFrame(eval_scaler.fit_transform(df_eval), columns=df_eval.columns, index=df_eval.index)

# Make predictions
y_pred_eval = models["RandomForestRegressor"].predict(df_eval)

# Save output
df_out = pd.DataFrame({"ID": eval_ids, "duration": y_pred_eval}).set_index("ID")
df_out.to_csv("submission.csv")



FileNotFoundError: [Errno 2] No such file or directory: 'eval.csv'