In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

Load and Clean Route Files

In [None]:
# Columns every route file must provide to be usable for modelling.
required_columns = {'mode', 'timestamp', 'duration_seconds', 'duration_in_traffic'}

# Sort the matches: glob returns paths in arbitrary order, and the row order of
# the combined frame determines which rows the seeded train/test split selects.
csv_paths = sorted(glob.glob('./*.csv'))  # adjust path as needed
df_list = []

for path in csv_paths:
    try:
        df = pd.read_csv(path)
    except Exception as e:
        # Best-effort load: report the unreadable file and carry on with the rest.
        print(f"Error loading {path}: {e}")
        continue

    # Only keep files that contain every required column, and say which were skipped.
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        print(f"Skipping {path}: missing columns {sorted(missing_columns)}")
        continue

    # Driving rows use the traffic-aware duration; every other mode uses the
    # plain duration. np.where is vectorised and also copes with a header-only
    # (zero-row) file, which the row-wise apply did not.
    df_list.append(
        df.assign(
            duration_final=np.where(
                df['mode'] == 'driving',
                df['duration_in_traffic'],
                df['duration_seconds'],
            )
        )
    )

# Combine all route data
if not df_list:
    raise ValueError("No usable CSVs found.")
full_df = pd.concat(df_list, ignore_index=True)
print("Combined data shape:", full_df.shape)


Combined data shape: (6664, 15)


Feature Engineering


In [None]:
# Derive calendar features from the timestamp column, interpreted as epoch seconds.
# NOTE(review): the parsed datetimes carry no timezone, so hour_of_day and
# day_of_week are presumably in UTC rather than local time — confirm this is
# what the model should see.
event_time = pd.to_datetime(full_df['timestamp'], unit='s')
full_df = full_df.assign(
    datetime=event_time,
    hour_of_day=event_time.dt.hour,
    day_of_week=event_time.dt.dayofweek,  # Monday = 0
)

# One-hot encode mode. get_dummies replaces the 'mode' column with mode_* columns,
# so only encode while 'mode' is still present; this keeps the cell re-runnable
# on the same kernel instead of failing with a KeyError the second time.
if 'mode' in full_df.columns:
    full_df = pd.get_dummies(full_df, columns=['mode'])

# Drop rows with missing duration_final values before splitting
full_df = full_df.dropna(subset=['duration_final'])

# Features and label
features = ['hour_of_day', 'day_of_week'] + [col for col in full_df.columns if col.startswith("mode_")]
X = full_df[features]
y = full_df['duration_final']

Train/Test Split and Model Training

In [None]:
# X and y were built after rows with a missing duration_final had been dropped,
# so no NaN handling is repeated here; dropping rows from full_df at this point
# would not change the already-materialised X and y anyway.

# Tunable settings for the split and the forest, named so they are easy to find.
TEST_FRACTION = 0.25
RANDOM_STATE = 42
N_TREES = 100

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_STATE
)

model = RandomForestRegressor(n_estimators=N_TREES, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

Evaluation

In [None]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

# Both metrics take (truth, prediction), so evaluate them in one pass.
mae, r2 = (metric(y_test, y_pred) for metric in (mean_absolute_error, r2_score))

print(
    f"MAE: {mae:.2f} seconds",
    f"R² Score: {r2:.2f}",
    sep="\n",
)


MAE: 219.34 seconds
R² Score: 0.60


Predict for a Sample Route

In [None]:
# Three hand-written scenarios, one travel mode active per row.
sample = pd.DataFrame({
    'hour_of_day': [8, 17, 20],
    'day_of_week': [0, 3, 6],  # Monday, Thursday, Sunday
    'mode_bicycling': [0, 0, 1],
    'mode_driving': [1, 0, 0],
    'mode_transit': [0, 1, 0],
    'mode_walking': [0, 0, 0],
})

# The model must receive exactly the columns it was trained on, in the same
# order, and those depend on which modes appear in the loaded CSVs. Align the
# sample to the training columns instead of relying on the hard-coded list.
train_columns = list(X_train.columns)

# A scenario that activates a mode the model never saw cannot be predicted
# meaningfully, so fail loudly rather than silently zeroing it out.
unseen_active = [
    col for col in sample.columns
    if col not in train_columns and sample[col].any()
]
if unseen_active:
    raise ValueError(f"Sample uses features not seen during training: {unseen_active}")

# Training columns absent from the sample are filled with 0 (mode not selected).
sample = sample.reindex(columns=train_columns, fill_value=0)

sample_pred = model.predict(sample)
print("\nSample predictions (in seconds):")
for case_number, pred in enumerate(sample_pred, start=1):
    # int() truncates towards zero, so fractional seconds are dropped.
    print(f"Case {case_number}: {int(pred)} seconds")


Sample predictions (in seconds):
Case 1: 507 seconds
Case 2: 874 seconds
Case 3: 630 seconds
