In [35]:
from pandas import DataFrame
from common.utils import load_dataset, optimize_memory, get_params, DatasetType
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [36]:
train_df: DataFrame = load_dataset("nyc-taxi-trip-duration", DatasetType.TRAIN, index=True)
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [37]:
test_df: DataFrame = load_dataset("nyc-taxi-trip-duration", DatasetType.TEST, index=True)
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [38]:
import numpy as np

# Convert pickup_datetime to datetime
train_df['pickup_datetime'] = pd.to_datetime(train_df['pickup_datetime'])
test_df['pickup_datetime'] = pd.to_datetime(test_df['pickup_datetime'])

# Time-based features
for df in [train_df, test_df]:
    df['pickup_hour'] = df['pickup_datetime'].dt.hour
    df['pickup_day'] = df['pickup_datetime'].dt.dayofweek
    df['pickup_month'] = df['pickup_datetime'].dt.month

# Haversine distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    return 2 * R * np.arcsin(np.sqrt(a))

train_df['distance_km'] = haversine(train_df['pickup_latitude'], train_df['pickup_longitude'],
                                    train_df['dropoff_latitude'], train_df['dropoff_longitude'])

test_df['distance_km'] = haversine(test_df['pickup_latitude'], test_df['pickup_longitude'],
                                   test_df['dropoff_latitude'], test_df['dropoff_longitude'])

# Encode store_and_fwd_flag
train_df['store_and_fwd_flag'] = train_df['store_and_fwd_flag'].map({'N': 0, 'Y': 1})
test_df['store_and_fwd_flag'] = test_df['store_and_fwd_flag'].map({'N': 0, 'Y': 1})

In [39]:
from sklearn.model_selection import train_test_split

features = [
    'vendor_id', 'passenger_count', 'pickup_hour', 'pickup_day',
    'pickup_month', 'store_and_fwd_flag', 'distance_km'
]

# Target variable (log-transform)
train_df['trip_duration'] = np.log1p(train_df['trip_duration'])

X = train_df[features]
y = train_df['trip_duration']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=20,
    random_state=42,
    verbosity=1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=20
)


[0]	validation_0-rmse:0.79485
[20]	validation_0-rmse:0.50034
[40]	validation_0-rmse:0.46951
[60]	validation_0-rmse:0.46710
[80]	validation_0-rmse:0.46641
[100]	validation_0-rmse:0.46621
[120]	validation_0-rmse:0.46610
[139]	validation_0-rmse:0.46613


In [41]:
# Predict log-transformed duration
y_pred_log = model.predict(X_val)

# Compute RMSE in log space
rmse_log = np.sqrt(mean_squared_error(y_val, y_pred_log))

# Convert back to seconds
y_val_actual = np.expm1(y_val)
y_pred_actual = np.expm1(y_pred_log)
rmse_real = np.sqrt(mean_squared_error(y_val_actual, y_pred_actual))

print(f"RMSE (log-transformed): {rmse_log:.4f}")
print(f"RMSE (actual seconds): {rmse_real:.2f}")

RMSE (log-transformed): 0.4661
RMSE (actual seconds): 3207.46


In [42]:
X_test = test_df[features]
test_preds = model.predict(X_test)
test_df['trip_duration'] = np.expm1(test_preds)

submission = test_df[['id', 'trip_duration']]
submission.to_csv('data/xgb_submission.csv', index=False)


In [44]:
model.save_model("../../models/xgb_nyc_trip_duration.json")