In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/added-ds/holiday_data.csv
/kaggle/input/added-ds/osrm_data_train.csv
/kaggle/input/added-ds/weather_data.csv
/kaggle/input/weather-data/weather_data.csv
/kaggle/input/nyc-taxi-trip-duration/train.zip
/kaggle/input/nyc-taxi-trip-duration/test.zip
/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip
/kaggle/input/osrmnyctaxidata/fastest_routes_train_part_1.csv/._fastest_routes_train_part_1.csv
/kaggle/input/osrmnyctaxidata/fastest_routes_train_part_1.csv/fastest_routes_train_part_1.csv
/kaggle/input/osrmnyctaxidata/fastest_routes_test.csv/._fastest_routes_test.csv
/kaggle/input/osrmnyctaxidata/fastest_routes_test.csv/fastest_routes_test.csv
/kaggle/input/for-test-ds/Project5_test_data.csv
/kaggle/input/for-test-ds/Project5_osrm_data_test.csv


In [4]:
# Kaggle notebook cell: full training + save model
# Copy-paste entire block and run "Run All"

import os
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LassoCV
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.cluster import KMeans
from pandas.tseries.holiday import USFederalHolidayCalendar

# --- Helper functions (condensed from features.py) ---
R = 6371.0
def haversine_series(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def prepare_data(df, weather_df=None, holiday_df=None, osrm_df=None, fit_kmeans=True, kmeans_clusters=30, exclude_fill_cols=[]):
    df = df.copy()
    # safe datetime
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
    df['date'] = df['pickup_datetime'].dt.date
    # basic fe
    df['hour'] = df['pickup_datetime'].dt.hour.fillna(0).astype(int)
    df['weekday'] = df['pickup_datetime'].dt.weekday.fillna(0).astype(int)
    df['month'] = df['pickup_datetime'].dt.month.fillna(0).astype(int)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)

    # merge weather if available
    if weather_df is not None:
        w = weather_df.copy()
        w['date'] = pd.to_datetime(w['date'], dayfirst=True, errors='coerce').dt.date
        w.columns = w.columns.str.strip().str.lower().str.replace(" ", "_")
        if 'precipitation' in w.columns:
            w['precipitation'] = w['precipitation'].replace('T', 0.01).astype(float).fillna(0)
        if 'snow_fall' in w.columns:
            w['snow_fall'] = w['snow_fall'].replace('T', 0.01).astype(float).fillna(0)
        w['precip_intensity'] = w.get('precipitation', 0) + w.get('snow_fall', 0)
        df = df.merge(w[['date','precipitation','snow_fall','precip_intensity']], on='date', how='left')

    # holidays
    if holiday_df is not None:
        hd = holiday_df.copy()
        if 'date' in hd.columns:
            hd['pickup_date'] = pd.to_datetime(hd['date'], errors='coerce').dt.date
        elif 'pickup_date' in hd.columns:
            hd['pickup_date'] = pd.to_datetime(hd['pickup_date'], errors='coerce').dt.date
        else:
            hd['pickup_date'] = pd.Series(dtype='object')
        hd = hd.rename(columns={'holiday':'holiday'}).loc[:, hd.columns.intersection(['pickup_date','holiday'])]
        df['pickup_date'] = df['date']
        df = df.merge(hd, on='pickup_date', how='left')
        df['pickup_holiday'] = df['holiday'].notnull().astype(int)
        df.drop(columns=['holiday','pickup_date'], inplace=True, errors='ignore')
    else:
        df['pickup_holiday'] = 0

    # osrm merges
    if osrm_df is not None and 'id' in osrm_df.columns:
        df = df.merge(osrm_df[['id','total_distance','total_travel_time','number_of_steps']], on='id', how='left')
    df['osrm_total_distance_km'] = df.get('total_distance', 0) / 1000.0
    df['osrm_total_travel_time_s'] = df.get('total_travel_time', np.nan)

    # geo
    df['manhattan_dist'] = (df['pickup_latitude'] - df['dropoff_latitude']).abs() + (df['pickup_longitude'] - df['dropoff_longitude']).abs()
    df['haversine_dist'] = haversine_series(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])
    df['pickup_geohash'] = df.apply(lambda r: f"{round(r['pickup_latitude']/0.01,2)}_{round(r['pickup_longitude']/0.01,2)}", axis=1)
    freq = df['pickup_geohash'].value_counts().to_dict()
    df['pickup_density'] = df['pickup_geohash'].map(freq).fillna(0)

    lat1 = np.radians(df['pickup_latitude'])
    lat2 = np.radians(df['dropoff_latitude'])
    dlon = np.radians(df['dropoff_longitude'] - df['pickup_longitude'])
    df['bearing'] = np.arctan2(np.sin(dlon) * np.cos(lat2),
                               np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon))
    df['bearing_deg'] = (np.degrees(df['bearing']) + 360) % 360
    df['direction_bin'] = pd.cut(df['bearing_deg'], bins=[0,45,135,225,315,360], labels=['E','N','W','S','E2'])
    df['direction_bin'] = df['direction_bin'].map({'N':0,'E':1,'S':2,'W':3,'E2':1}).fillna(0).astype(int)

    # clusters
    coords_p = df[['pickup_latitude','pickup_longitude']].fillna(0)
    coords_d = df[['dropoff_latitude','dropoff_longitude']].fillna(0)
    if fit_kmeans:
        kmeans_pickup = KMeans(n_clusters=kmeans_clusters, random_state=42).fit(coords_p)
        kmeans_dropoff = KMeans(n_clusters=kmeans_clusters, random_state=42).fit(coords_d)
    else:
        kmeans_pickup = None
        kmeans_dropoff = None

    if kmeans_pickup is not None:
        df['pickup_zone'] = kmeans_pickup.predict(coords_p)
    if kmeans_dropoff is not None:
        df['dropoff_zone'] = kmeans_dropoff.predict(coords_d)

    # rush/holiday flags via USFederalHolidayCalendar
    try:
        cal = USFederalHolidayCalendar()
        hols = cal.holidays(start=df['pickup_datetime'].min(), end=df['pickup_datetime'].max())
        df['is_holiday'] = df['pickup_datetime'].dt.date.isin(hols.date).astype(int)
    except Exception:
        df['is_holiday'] = 0
    df['rush_hour'] = (((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 16) & (df['hour'] <= 19))).astype(int)

    # fill numeric nans EXCLUDING SPECIFIED COLUMNS
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [col for col in numeric_cols if col not in exclude_fill_cols]  # Exclude target
    df[numeric_cols] = df[numeric_cols].fillna(0)
    return df, kmeans_pickup, kmeans_dropoff

# --- Load data ---
print("Loading data...")
train = pd.read_csv('/kaggle/input/nyc-taxi-trip-duration/train.zip')
test = pd.read_csv('/kaggle/input/nyc-taxi-trip-duration/test.zip')
weather = pd.read_csv('/kaggle/input/weather-data/weather_data.csv')
holiday = pd.read_csv('/kaggle/input/added-ds/holiday_data.csv', sep=';')  # adjust path/name if different
# optional osrm files - update file paths if you have them
osrm_train_path = '/kaggle/input/added-ds/osrm_data_train.csv'
osrm_test_path = '/kaggle/input/for-test-ds/Project5_osrm_data_test.csv'
osrm_df = pd.DataFrame()
if os.path.exists(osrm_train_path):
    osrm_df = pd.read_csv(osrm_train_path)
if os.path.exists(osrm_test_path):
    osrm_df = pd.concat([osrm_df, pd.read_csv(osrm_test_path)], ignore_index=True)

# combine train/test
test['trip_duration'] = np.nan
combined = pd.concat([train, test], axis=0, ignore_index=True)
print(f"Original train: {train.shape}, test: {test.shape}, combined: {combined.shape}")

# Prepare data (EXCLUDE 'trip_duration' FROM FILLING)
print("Preparing combined features...")
combined_prep, kmeans_p, kmeans_d = prepare_data(
    combined, 
    weather_df=weather, 
    holiday_df=holiday, 
    osrm_df=osrm_df, 
    fit_kmeans=True,
    exclude_fill_cols=['trip_duration']  # Critical fix
)

# --- DEBUG: Check split integrity ---
print(f"Null 'trip_duration' in combined_prep: {combined_prep['trip_duration'].isnull().sum()} (should be test set size)")

# Split into train/test
train_proc = combined_prep[combined_prep['trip_duration'].notnull()].copy()
test_proc = combined_prep[combined_prep['trip_duration'].isnull()].copy()
print(f"After split - train_proc: {train_proc.shape}, test_proc: {test_proc.shape}")

# Clean train data (test remains unfiltered)
train_proc = train_proc[
    (train_proc['trip_duration'].between(30, 7200)) &
    (train_proc['pickup_longitude'].between(-74.3, -73.7)) &
    (train_proc['pickup_latitude'].between(40.5, 41.0)) &
    (train_proc['dropoff_longitude'].between(-74.3, -73.7)) &
    (train_proc['dropoff_latitude'].between(40.5, 41.0))
]
print(f"After cleaning - train_proc: {train_proc.shape}")

y = np.log1p(train_proc['trip_duration'])

# select numeric candidate features
drop_cols = ['id','pickup_datetime','dropoff_datetime','date','pickup_geohash','log_trip_duration']
features = [c for c in train_proc.columns if c not in drop_cols and train_proc[c].dtype in [np.float64, np.int64]]
print("Candidate numeric features:", len(features))

X = train_proc[features].fillna(0)
X_test = test_proc[features].fillna(0)
print(f"X shape: {X.shape}, X_test shape: {X_test.shape}")

# scale
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

# Lasso selection
print("Running Lasso selection...")
lasso = LassoCV(cv=5, random_state=42, n_jobs=-1).fit(X_scaled, y)
linear_features = [f for f,coef in zip(features, lasso.coef_) if coef != 0]
print("Lasso selected:", len(linear_features))

# Tree selection using XGB (prefit)
print("Running XGB for tree-based selection...")
xgb_sel = xgb.XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X_scaled, y)
selector = SelectFromModel(xgb_sel, prefit=True, threshold="median")
tree_features = list(np.array(features)[selector.get_support()])
selected_features = list(set(linear_features).union(set(tree_features)))
if not selected_features:
    selected_features = features  # fallback
print("Total selected features:", len(selected_features))

X_sel = X[selected_features].fillna(0)
X_test_sel = X_test[selected_features].fillna(0)

final_scaler = StandardScaler().fit(X_sel)
X_scaled_sel = final_scaler.transform(X_sel)
X_test_scaled_sel = final_scaler.transform(X_test_sel)

# Models
print("Training base models (may be slow).")
xgb_model = xgb.XGBRegressor(n_estimators=8000, learning_rate=0.05, max_depth=6, random_state=42, n_jobs=-1)
lgb_model = lgb.LGBMRegressor(n_estimators=4000, learning_rate=0.05, max_depth=6, random_state=42, n_jobs=-1)
cat_model = cb.CatBoostRegressor(iterations=4000, learning_rate=0.05, depth=6, random_seed=42, verbose=False)

xgb_model.fit(X_scaled_sel, y)
lgb_model.fit(X_scaled_sel, y)
cat_model.fit(X_scaled_sel, y)

print("Training stacked model...")
stacked = StackingRegressor(
    estimators=[('xgb', xgb_model), ('lgb', lgb_model), ('cat', cat_model)],
    final_estimator=GradientBoostingRegressor(n_estimators=500),
    cv=KFold(n_splits=5, shuffle=True, random_state=42)
)
stacked.fit(X_scaled_sel, y)

# Save model and preprocessors to Kaggle output
print("Saving model and utilities...")
joblib.dump(stacked, "taxi_duration_model.pkl")
joblib.dump(final_scaler, "scaler.pkl")
joblib.dump(selected_features, "selected_features.pkl")
joblib.dump(kmeans_p, "kmeans_pickup.pkl")
joblib.dump(kmeans_d, "kmeans_dropoff.pkl")
print("Saved: taxi_duration_model.pkl, scaler.pkl, selected_features.pkl, kmeans_pickup.pkl, kmeans_dropoff.pkl")

Loading data...
Original train: (1458644, 11), test: (625134, 10), combined: (2083778, 11)
Preparing combined features...




Null 'trip_duration' in combined_prep: 625134 (should be test set size)
After split - train_proc: (1458644, 39), test_proc: (625134, 39)
After cleaning - train_proc: (1450700, 39)
Candidate numeric features: 31
X shape: (1450700, 31), X_test shape: (625134, 31)
Running Lasso selection...
Lasso selected: 25
Running XGB for tree-based selection...
Total selected features: 28
Training base models (may be slow).
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.225501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3716
[LightGBM] [Info] Number of data points in the train set: 1450700, number of used features: 28
[LightGBM] [Info] Start training from score 6.472453
Training stacked model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.215460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3716
[LightGBM] [Info] Numb

In [6]:
# Save model and preprocessors to Kaggle output
print("Saving model and utilities...")
joblib.dump(stacked, "taxi_duration_model.pkl")
joblib.dump(final_scaler, "scaler.pkl")
joblib.dump(selected_features, "selected_features.pkl")
joblib.dump(kmeans_p, "kmeans_pickup.pkl")
joblib.dump(kmeans_d, "kmeans_dropoff.pkl")
print("Saved: taxi_duration_model.pkl, scaler.pkl, selected_features.pkl, kmeans_pickup.pkl, kmeans_dropoff.pkl")

Saving model and utilities...
Saved: taxi_duration_model.pkl, scaler.pkl, selected_features.pkl, kmeans_pickup.pkl, kmeans_dropoff.pkl
