# Delay Modelling with CatBoost

As the majority of features are categorical, we select a method reputed to be well suited to such situations: CatBoost. It requires specifying which columns are to be treated as categorical.

The following libraries are required:

In [None]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

from ipynb_utils import CFG, plt_savefig

We load the processed data from disk:

In [None]:
# Location of the processed data set, taken from the shared notebook config.
processed_data_path = CFG["PROCESSED_DATA_PATH"]

# NOTE: read_pickle unpickles the file, so only load files from a trusted source.
df = pd.read_pickle(processed_data_path)

Let us specify the (categorical) features and perform a train-test split of the data.

In [None]:
# Name of the column the model is trained to predict.
target_col = "target"

# Model inputs (a mix of categorical and numerical columns).
# "FLTID" and "STD" are currently not used as features.
feature_cols = [
    "STATUS",
    "AC",
    "flight_time",
    "DEPSTN",
    "ARRSTN",
    "DATOP_year",
    "DATOP_month",
    "DATOP_day",
]

# Every feature except "flight_time" is to be treated as categorical.
# Deriving the list from feature_cols keeps both lists in the same order.
cat_cols = [col for col in feature_cols if col != "flight_time"]

# Separate model inputs and target.
X = df[feature_cols]
y = df[target_col]

# Hold out 20 % of the rows for testing; the seed comes from the shared config.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=CFG["RSEED"],
)

Based on this data preparation, we let `GridSearchCV` search a small hyperparameter grid and select the best-performing `CatBoostRegressor` model.

In [None]:
# Directory in which CatBoost writes its log files.
train_dir = os.path.join(CFG["LOGS_DIR"], "catboost")

# Base estimator; the hyperparameters below are set on it by GridSearchCV.
model = CatBoostRegressor(
    verbose=0,
    train_dir=train_dir,
)

# Parameter grid for GridSearchCV.
param_grid = {
    # Number of boosting rounds (trees).
    "iterations": [1000],

    # Learning rate controls the step size in gradient descent.
    # Best learning rate around 0.1.
    "learning_rate": [0.09, 0.1, 0.11],

    # Maximum depth of each decision tree.
    # Best depth between 8 and 10.
    "depth": [8, 9, 10],

    # Specifies the categorical columns in the dataset. Passed as a
    # single-candidate grid entry so that it also shows up in best_params_.
    "cat_features": [cat_cols],
}

# 5-fold cross-validated search, ranked by (negated) mean squared error.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=0,
)

# Fit on the training split only. Fitting on the full (X, y) would let the
# search and the refitted best model see the held-out test rows, which would
# make the test-set metrics computed afterwards overly optimistic.
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

print("Best parameters:")
for k, v in best_params.items():
    print(f"  {k:<14}: {v}")


# Estimator refitted with the best parameters on the data passed to fit().
best_model = grid_search.best_estimator_

The evaluation of our model is performed by the following code block:

In [None]:
# Predict on the held-out test split.
y_pred = best_model.predict(X_test)

# RMSE is computed as the square root of the MSE. The `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly works on old and new versions alike.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("RMSE      :", rmse)
print("R_2       :", r2)
print("MAP Error :", mape)

# Upper end of the diagonal reference line drawn in the plot.
t = 1000

# Scatter plot of actual vs. predicted values; points on the dashed red
# diagonal are perfect predictions.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([0, t], [0, t], color="red", linestyle="--")
plt.xlabel("Reality")
plt.ylabel("Prediction")
plt.xlim(0, 3000)
plt.title("Real vs. Predicted Values")
plt_savefig("catboost_actual-vs-predicted_dist")
plt.show()

Evidently, our model tends towards systematic underestimation when the delay duration is extraordinarily long. Let us therefore examine how our model performs if we exclude such exceptional cases.

In [None]:
# Largest actual target value that is kept in the trimmed evaluation.
trim_threshold = 750

# Boolean mask over the test rows. The comparison already yields booleans,
# so no further cast is needed.
is_y_test_below = y_test <= trim_threshold
y_test_trim = y_test[is_y_test_below]
# y_pred is positional (no pandas index), so index it with a plain NumPy mask.
y_pred_trim = y_pred[is_y_test_below.to_numpy()]

# RMSE is computed as the square root of the MSE. The `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly works on old and new versions alike.
rmse = mean_squared_error(y_test_trim, y_pred_trim) ** 0.5
r2 = r2_score(y_test_trim, y_pred_trim)
mape = mean_absolute_percentage_error(y_test_trim, y_pred_trim)

print("RMSE      :", rmse)
print("R_2       :", r2)
print("MAP Error :", mape)

# Scatter plot of actual vs. predicted values for the trimmed test set; points
# on the dashed red diagonal are perfect predictions.
# `t` (end of the diagonal) is defined in the preceding evaluation cell.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test_trim, y=y_pred_trim)
plt.plot([0, t], [0, t], color="red", linestyle="--")
plt.xlabel("Reality")
plt.ylabel("Prediction")
plt.xlim(0, 3000)
plt.title("Real vs. Predicted Values")
plt_savefig("catboost_actual-vs-predicted_trim_dist")
plt.show()