In [16]:
import pandas as pd
import numpy as np
import glob
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

Load and clean route files

In [18]:
# Sort the matches: glob.glob makes no ordering guarantee, and the row order of the
# combined frame decides which rows a seeded train/test split selects later on.
csv_paths = sorted(glob.glob('./*.csv'))
df_list = []

# Columns a route file must contain to be usable. Defined once, outside the loop,
# because the set never changes between files.
required_cols = {'timestamp', 'mode', 'duration_seconds', 'duration_in_traffic',
                 'origin_place', 'destination_place', 'distance_meters'}

for path in csv_paths:
    try:
        df = pd.read_csv(path)

        missing_cols = required_cols - set(df.columns)
        if missing_cols:
            # Report why a file is skipped instead of dropping it silently.
            print(f"Skipping {path}: missing columns {sorted(missing_cols)}")
            continue

        # Target duration: driving rows take 'duration_in_traffic', every other mode
        # takes 'duration_seconds'. np.where evaluates this column-wise, which also
        # handles a header-only (zero-row) file that a row-wise apply cannot.
        df['duration_final'] = np.where(
            df['mode'] == 'driving',
            df['duration_in_traffic'],
            df['duration_seconds'],
        )
        df_list.append(df)
    except Exception as e:
        # Best-effort load: report the problem file and continue with the rest.
        print(f"Failed to load {path}: {e}")

# === Combine into a full DataFrame ===
if not df_list:
    raise ValueError("No valid CSV files found.")
full_df = pd.concat(df_list, ignore_index=True)
print("Loaded and combined:", full_df.shape)

Loaded and combined: (952, 15)


Feature Engineering


In [19]:
# Build the modelling frame from full_df WITHOUT modifying full_df, so this cell can be
# re-run safely. (Re-assigning full_df to the one-hot encoded frame removes the
# 'mode' / 'origin_place' / 'destination_place' columns, and a second run then fails.)

# NOTE(review): epoch seconds parse to naive UTC datetimes, so hour_of_day and
# day_of_week below are UTC values - confirm UTC (not local time) is what is wanted.
route_datetime = pd.to_datetime(full_df['timestamp'], unit='s')

routes_featured = (
    full_df
    .assign(
        datetime=route_datetime,
        hour_of_day=route_datetime.dt.hour,
        day_of_week=route_datetime.dt.dayofweek,  # Monday = 0
    )
    # Drop missing durations or distances
    .dropna(subset=['duration_final', 'distance_meters'])
)

# One-hot encode categorical columns
categorical_cols = ['mode', 'origin_place', 'destination_place']
routes_encoded = pd.get_dummies(routes_featured, columns=categorical_cols)

# Feature columns: the numeric features plus every column carrying a one-hot prefix.
dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
features = ['hour_of_day', 'day_of_week', 'distance_meters'] + \
           [col for col in routes_encoded.columns if col.startswith(dummy_prefixes)]

X = routes_encoded[features]
y = routes_encoded['duration_final']

Train/Test Split and Model Training

In [20]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): a random row-level split may put near-identical trips of the same
# route/mode in both train and test, which would inflate test scores - confirm
# whether a grouped or time-based split is more appropriate.
split_parts = train_test_split(X, y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

# Random forest of 100 trees, seeded so repeated fits give the same model.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

Evaluation

In [44]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the report line by line: mean absolute error in seconds, then R².
report_lines = (
    f"\nModel Performance:",
    f"MAE: {mae:.2f} seconds",
    f"R² Score: {r2:.2f}",
)
for report_line in report_lines:
    print(report_line)



Model Performance:
MAE: 7.12 seconds
R² Score: 1.00


Predict for a sample route

In [43]:
# Two hand-built trips, already one-hot encoded the same way as the training frame.
sample = pd.DataFrame({
    'hour_of_day': [8, 17],
    'day_of_week': [0, 3],  # Monday, Thursday
    'distance_meters': [1500, 2500],
    'mode_bicycling': [0, 0],
    'mode_driving': [0, 0],
    'mode_transit': [1, 1],
    'mode_walking': [0, 0],
    'origin_place_Seattle Aquarium': [1, 0],
    'origin_place_Space Needle': [0, 1],
    'destination_place_Westlake Park': [1, 0],
    'destination_place_Pike Place Market': [0, 1],
})

# Aligning to the training columns discards any sample column the model never saw.
# If such a column carries a non-zero value (e.g. a misspelled place name), the row
# would be predicted without that information, so report it instead of hiding it.
ignored_cols = [col for col in sample.columns
                if col not in X.columns and sample[col].any()]
if ignored_cols:
    print(f"Warning: sample columns unknown to the model are ignored: {ignored_cols}")

# Match the training layout in one step: same columns, same order, and 0 for any
# one-hot column that the sample does not mention.
sample = sample.reindex(columns=X.columns, fill_value=0)

# Predict
sample_pred = model.predict(sample)
print("\nPredictions on New Inputs (in seconds):")
for case_no, pred in enumerate(sample_pred, start=1):
    # Round to the nearest second; int() would always truncate downwards.
    print(f"Case {case_no}: {pred:.0f} seconds")


Predictions on New Inputs (in seconds):
Case 1: 585 seconds
Case 2: 927 seconds
