In [9]:
import pandas as pd
import numpy as np
from cleaning_and_preprocessing_function import clean_n_preprocess
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

In [2]:
df = pd.read_csv("./../dataset/port_operations_log.csv")
X = df.drop(columns=['Actual_Operation_Duration_Hours'])
y = df['Actual_Operation_Duration_Hours']

### Split dataset into train/test 
Split the dataset into train/test first THEN preprocess only the training dataset to ensure no data leakage (to prevent info from the supposedly unseen test data from influencing the preprocessing)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_processed = clean_n_preprocess(X_train)
X_test_processed = clean_n_preprocess(X_test)

### Baseline & Challenger models
1. Linear Regression
2. Ridge Regression
3. Lasso Regression
4. Decision Tree Regression
5. Random Forest Regression
6. Gradient Boosting Regression
7. Support Vector Regression

In [10]:
lr_model = LinearRegression()
lr_model.fit(X_train_processed, y_train)
y_pred_lr = lr_model.predict(X_test_processed)
print("--- Linear Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lr):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_lr)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_lr):.2f}")


ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_processed, y_train)
y_pred_ridge = ridge_model.predict(X_test_processed)
print("--- Ridge Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_ridge):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_ridge)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_ridge):.2f}")


lasso_model = Lasso(alpha=1.0)
ridge_model.fit(X_train_processed, y_train)
y_pred_lasso = ridge_model.predict(X_test_processed)
print("--- Lasso Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lasso):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_lasso)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_lasso):.2f}")


dt_model = DecisionTreeRegressor(max_depth=5, random_state=42) # Hyperparameters to tune
dt_model.fit(X_train_processed, y_train)
y_pred_dt = dt_model.predict(X_test_processed)
print("--- Decision Tree Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_dt):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_dt)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_dt):.2f}")


rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) # Hyperparameters to tune
rf_model.fit(X_train_processed, y_train)
y_pred_rf = rf_model.predict(X_test_processed)
print("--- Random Forest Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_rf):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_rf):.2f}")


gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train_processed, y_train)
y_pred_gb = gb_model.predict(X_test_processed)
print("--- Gradient Boost Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_gb):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_gb)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_gb):.2f}")


svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1) # Hyperparameters to tune
svr_model.fit(X_train_processed, y_train)
y_pred_svr = svr_model.predict(X_test_processed)
print("--- Support Vector Regression Performance ---")
print(f"MAE: {mean_absolute_error(y_test, y_pred_svr):.2f} hours")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_svr)):.2f} hours")
print(f"R-squared: {r2_score(y_test, y_pred_svr):.2f}")

--- Linear Regression Performance ---
MAE: 19.70 hours
RMSE: 26.49 hours
R-squared: 0.84
--- Ridge Regression Performance ---
MAE: 19.70 hours
RMSE: 26.50 hours
R-squared: 0.84
--- Lasso Regression Performance ---
MAE: 19.70 hours
RMSE: 26.50 hours
R-squared: 0.84
--- Decision Tree Regression Performance ---
MAE: 10.60 hours
RMSE: 14.52 hours
R-squared: 0.95
--- Random Forest Regression Performance ---
MAE: 9.08 hours
RMSE: 12.94 hours
R-squared: 0.96
--- Gradient Boost Regression Performance ---
MAE: 9.16 hours
RMSE: 12.89 hours
R-squared: 0.96
--- Support Vector Regression Performance ---
MAE: 18.77 hours
RMSE: 32.42 hours
R-squared: 0.77


From a quick evaluation (looking only at 3 metrics: MAE, RMSE, R-squared), the top 3 best performing models before hyperparameter tuning are
- Decision Tree
- Random Forest
- Gradient Boost