In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn.metrics import r2_score, explained_variance_score


In [2]:
data = pd.read_csv("public_transportation_dataset.csv")


In [3]:
data['Special_Schedules'].fillna('Not Available', inplace=True)

In [4]:
data['Total_Stops'] = data['Intermediate_Stops'].apply(lambda x: len(x.split(',')) + 2)


In [5]:
data.head()

Unnamed: 0,Route_ID,Starting_Point,Ending_Point,Intermediate_Stops,Distance,Estimated_Travel_Time,Type_of_Transportation,Frequency_of_Service,Time_Schedule,Days_of_Operation,Special_Schedules,Total_Stops
0,R1,Adamstad,Stewartland,"['North Patrickstad', 'Middletonton', 'Wallvil...",10.799057,18,subway,1,07:05:17,Weekdays,Should recognize whose medical bring adult lis...,5
1,R2,Brookeburgh,New Lindaport,"['Youngberg', 'Jasonville']",9.271333,49,bus,1,09:16:09,Weekends,Not Available,4
2,R3,Port Patrickstad,Josephmouth,"['South Alexandriahaven', 'Lake Madison', 'Sco...",15.774874,17,tram,1,03:53:52,Weekends,Not Available,7
3,R4,West Melissa,Sanchezstad,['West Sean'],15.439269,16,tram,2,19:11:59,Weekdays,Into involve clear left.,3
4,R5,South Justinborough,Port Kathleenburgh,"['Port Aaronfurt', 'Ellisfurt', 'South Nathani...",6.503155,34,tram,4,22:23:10,Weekends,Not Available,5


In [5]:
X = data.drop(['Route_ID','Estimated_Travel_Time'], axis=1)
y = data['Estimated_Travel_Time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply random oversampling (optional, depending on the distribution of 'Estimated_Travel_Time')
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)


In [22]:
def evaluate_regression_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    r2 = r2_score(y_test, y_pred)
    explained_var = explained_variance_score(y_test, y_pred)

    return mae, mse, rmse, mape, r2, explained_var


In [14]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns


In [15]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])


In [16]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [18]:
lasso_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=0.1))
])


In [25]:
mae, mse, rmse, mape, r2, explained_var = evaluate_regression_model(lasso_model, X_resampled, X_test, y_resampled, y_test)
print("\nLasso Regression Results:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R-squared: {r2:.4f}")
print(f"Explained Variance: {explained_var:.4f}")



Lasso Regression Results:
MAE: 11.3421
MSE: 170.7241
RMSE: 13.0661
MAPE: 36.1493%
R-squared: -0.0021
Explained Variance: 0.0011


In [26]:
dt_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dt', DecisionTreeRegressor(random_state=42))
])

mae, mse, rmse, mape, r2, explained_var = evaluate_regression_model(lasso_model, X_resampled, X_test, y_resampled, y_test)
print("\nDecision Tree Regression Results:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R-squared: {r2:.4f}")
print(f"Explained Variance: {explained_var:.4f}")



Decision Tree Regression Results:
MAE: 11.3421
MSE: 170.7241
RMSE: 13.0661
MAPE: 36.1493%
R-squared: -0.0021
Explained Variance: 0.0011


In [27]:
ridge_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=0.1))
])

mae, mse, rmse, mape, r2, explained_var = evaluate_regression_model(ridge_model, X_resampled, X_test, y_resampled, y_test)
print("\nRidge Regression Results:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R-squared: {r2:.4f}")
print(f"Explained Variance: {explained_var:.4f}")



Ridge Regression Results:
MAE: 11.4410
MSE: 174.7762
RMSE: 13.2203
MAPE: 36.2626%
R-squared: -0.0258
Explained Variance: -0.0213


In [28]:
xgb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(random_state=42))
])

mae, mse, rmse, mape, r2, explained_var = evaluate_regression_model(xgb_model, X_resampled, X_test, y_resampled, y_test)
print("\nXGBoost Regressor Results:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")
print(f"R-squared: {r2:.4f}")
print(f"Explained Variance: {explained_var:.4f}")


  if is_sparse(data):



XGBoost Regressor Results:
MAE: 11.3367
MSE: 171.4056
RMSE: 13.0922
MAPE: 36.2112%
R-squared: -0.0061
Explained Variance: -0.0036
