In [17]:
import pandas as pd
import numpy as np
import glob
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from geopy.distance import geodesic

Load and Process CSVs

In [18]:
# Columns every trip CSV must provide; a file missing any of them is skipped.
required_cols = {
    'timestamp', 'mode', 'duration_seconds', 'duration_in_traffic',
    'origin_lat', 'origin_lng', 'destination_lat', 'destination_lng'
}

# glob returns matches in arbitrary, OS-dependent order. Sorting keeps the
# concatenated row order (and therefore any seeded split of it) reproducible.
csv_paths = sorted(glob.glob('./*.csv'))
df_list = []

for path in csv_paths:
    try:
        df = pd.read_csv(path)

        # Report skipped files instead of dropping them silently.
        missing_cols = required_cols.difference(df.columns)
        if missing_cols:
            print(f"Skipping {path}: missing columns {sorted(missing_cols)}")
            continue
        if df.empty:
            print(f"Skipping {path}: no rows")
            continue

        # Target duration: driving rows use the traffic-aware duration,
        # every other mode uses the plain duration.
        df['duration_final'] = df['duration_in_traffic'].where(
            df['mode'] == 'driving', df['duration_seconds']
        )

        # Geodesic origin-to-destination distance in meters, one value per row.
        df['geo_distance'] = [
            geodesic((o_lat, o_lng), (d_lat, d_lng)).meters
            for o_lat, o_lng, d_lat, d_lng in zip(
                df['origin_lat'], df['origin_lng'],
                df['destination_lat'], df['destination_lng']
            )
        ]
        df_list.append(df)
    except Exception as e:
        # Best-effort load: one unreadable or malformed file must not abort the rest.
        print(f"Failed to load {path}: {e}")

if not df_list:
    raise ValueError("No valid CSVs found.")

# Single frame holding the rows of every usable CSV, re-indexed from 0.
full_df = pd.concat(df_list, ignore_index=True)

Feature Engineering

In [19]:
# Parse the timestamps (interpreted as seconds since the epoch) once and
# derive the calendar features from the parsed values.
# NOTE(review): unit='s' yields timezone-naive datetimes, so hour_of_day is
# presumably in UTC unless the timestamps were shifted upstream — confirm.
trip_time = pd.to_datetime(full_df['timestamp'], unit='s')
full_df = full_df.assign(
    datetime=trip_time,
    hour_of_day=trip_time.dt.hour,
    day_of_week=trip_time.dt.dayofweek,  # Monday = 0
)

# Keep only rows that have both a target duration and a distance.
full_df = full_df.dropna(subset=['duration_final', 'geo_distance'])

# Expand the travel mode into one indicator column per mode value.
full_df = pd.get_dummies(full_df, columns=['mode'])

# Model inputs: time, distance and endpoint coordinates, followed by every
# mode indicator column produced by the encoding step.
base_features = [
    'hour_of_day', 'day_of_week', 'geo_distance',
    'origin_lat', 'origin_lng', 'destination_lat', 'destination_lng'
]
mode_features = [name for name in full_df.columns if name.startswith('mode_')]
features = base_features + mode_features

X = full_df[features]
y = full_df['duration_final']

Train/Test Split and Model Training

In [20]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Seeded 100-tree random forest fitted on the training portion (fit returns the estimator).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# === Model evaluation on the held-out rows ===
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

report_lines = (
    "Model Performance:",
    f"MAE: {mae:.2f} seconds",
    f"R² Score: {r2:.4f}",
)
print(*report_lines, sep="\n")

Model Performance:
MAE: 12.77 seconds
R² Score: 0.9916


Prediction Utility

In [25]:
def build_sample_geo(origin, destination, mode, hour, day, model_columns):
    """Build a one-row feature frame for a single trip, aligned to the model's columns.

    Parameters
    ----------
    origin, destination : sequence of two numbers
        Coordinates indexed as ``[0]`` = latitude, ``[1]`` = longitude.
    mode : str
        Travel mode. Indicator columns are generated for 'bicycling',
        'driving', 'transit' and 'walking'; any other value leaves all four
        indicators at 0.
    hour : int
        Hour of day; only 6 through 22 (inclusive) is accepted.
    day : int
        Day of week, 0 (Mon) through 6 (Sun).
    model_columns : iterable of str
        Feature names, in the order the model expects them.

    Returns
    -------
    pandas.DataFrame
        A single row containing exactly ``model_columns``, in that order.
        Columns this function does not compute are filled with 0.

    Raises
    ------
    ValueError
        If ``hour`` or ``day`` is outside its accepted range.
    """
    # Accepted hour window. The message below is built from the same bounds so
    # it cannot disagree with the check (it used to claim "0 and 23").
    # NOTE(review): presumably mirrors the hours covered by the training data — confirm.
    min_hour, max_hour = 6, 22

    if not (min_hour <= hour <= max_hour):
        raise ValueError(
            f"Invalid hour: {hour}. Must be between {min_hour} and {max_hour}."
        )
    if not (0 <= day <= 6):
        raise ValueError(f"Invalid day: {day}. Must be between 0 (Mon) and 6 (Sun).")

    data = {
        'hour_of_day': hour,
        'day_of_week': day,
        'origin_lat': origin[0],
        'origin_lng': origin[1],
        'destination_lat': destination[0],
        'destination_lng': destination[1],
        'geo_distance': geodesic(origin, destination).meters,
    }

    # One indicator per known travel mode; at most one of them is 1.
    for m in ('bicycling', 'driving', 'transit', 'walking'):
        data[f'mode_{m}'] = 1 if m == mode else 0

    # Align to the model's feature order: unknown columns default to 0 and
    # anything the model does not use is dropped. Materialising the names
    # first means a one-shot iterable works as well as a list or Index.
    columns = list(model_columns)
    row = {col: data.get(col, 0) for col in columns}
    sample_df = pd.DataFrame([row], columns=columns)

    return sample_df


Example Prediction

In [26]:
# Demo trip: a driving prediction for hour 9 on day 3 (Monday = 0).
origin = (47.6205, -122.3492)  # Space Needle
destination = (47.6116, -122.3375)  # Westlake Park

# Build the single-row input using the same column layout as the training matrix.
sample = build_sample_geo(
    origin=origin,
    destination=destination,
    mode='driving',
    hour=9,
    day=3,
    model_columns=X.columns,
)
pred = model.predict(sample)
predicted_seconds = pred[0]
print(f"Predicted travel time: {predicted_seconds:.2f} seconds")

Predicted travel time: 396.92 seconds


Save the Model

In [27]:
import joblib

# Persist the fitted model together with the ordered feature names, so a
# loader can rebuild inputs in the column order the model was trained on.
feature_names = list(X.columns)
joblib.dump(model, 'travel_time_model.pkl')
joblib.dump(feature_names, 'model_columns.pkl')

['model_columns.pkl']