# Delay Modelling with CatBoost

## A Model Using CatBoost

Since many of the features are categorical, we choose a method that claims to be well suited to such situations: CatBoost! It requires us to specify which columns should be treated as categorical.

In [None]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

from local_nbutils import CFG, plt_savefig

In [None]:
# Load the dataset from the pickle file configured under
# CFG["PROCESSED_DATA_PATH"].
# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source.
df = pd.read_pickle(CFG["PROCESSED_DATA_PATH"])

In [None]:
# Name of the column the model is trained to predict.
target_col = "target"

# Columns used as model inputs.
# "FLTID" and "STD" are currently not used as features.
feature_cols = [
    "STATUS",
    "AC",
    "flight_time",
    "DEPSTN",
    "ARRSTN",
    "DATOP_year",
    "DATOP_month",
    "DATOP_day",
]

# Every feature except "flight_time" is declared categorical for CatBoost
# (this includes the year/month/day parts of DATOP).
cat_cols = [col for col in feature_cols if col != "flight_time"]

X = df[feature_cols]
y = df[target_col]

# Hold out 20% of the rows for evaluation; the seed from CFG keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=CFG["RSEED"],
)

In [None]:
# Directory passed to CatBoost as `train_dir` (where it writes its training
# artefacts).
train_dir = os.path.join(CFG["LOGS_DIR"], "catboost")

model = CatBoostRegressor(
    verbose=0,
    train_dir=train_dir,
)

# Parameter grid for GridSearchCV.
# Each entry currently holds a single value, i.e. the search only
# cross-validates one configuration; widen the lists to actually tune.
param_grid = {
    # "iterations" (number of boosting rounds) is not set here, so CatBoost's
    # default is used.

    # Learning rate controls the step size in gradient descent.
    # Note from earlier experiments: best learning rate around 0.1
    # (0.01 was also tried).
    "learning_rate": [0.1],

    # Maximum depth of each decision tree.
    # Note from earlier experiments: best depth between 8 and 10
    # (depths 4-8 were also tried).
    "depth": [10],

    # Not a tuned hyperparameter: tells CatBoost which columns are categorical.
    "cat_features": [cat_cols],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=0,
)

# Fit on the training split ONLY. Fitting on the full (X, y) would let the
# refitted best estimator see the rows of X_test, so the metrics computed on
# the hold-out set below would be contaminated by data leakage.
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters:", best_params)

# With the default refit=True, this estimator has been retrained on all of
# (X_train, y_train) using the best parameter combination.
best_model = grid_search.best_estimator_

# Predictions for the untouched hold-out set, evaluated in the next cells.
y_pred = best_model.predict(X_test)

In [None]:
# Hold-out metrics for the best model.
# RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R_2: {r2}")
print(f"MAP Error: {mape}")

# Upper end of the red "perfect prediction" diagonal (also reused by the
# trimmed plot in the following cell).
t = 1000

# Scatter plot of actual vs. predicted values; points on the dashed diagonal
# are perfect predictions.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([0, t], [0, t], color="red", linestyle="--")
plt.xlabel("Reality")
plt.ylabel("Prediction")
plt.xlim(0, 3000)
plt.title("Real vs. Predicted Values")
plt_savefig("catboost_actual-vs-predicted_dist")
plt.show()

In [None]:
# Repeat the evaluation on the subset of the hold-out set whose true target is
# at most 750, so that the largest target values do not dominate the metrics.
is_y_test_below = y_test <= 750  # the comparison already yields a boolean mask
y_test_trim = y_test[is_y_test_below]
# y_pred is positional (model output), so index it with the plain boolean
# array rather than the index-carrying pandas mask.
y_pred_trim = y_pred[is_y_test_below.to_numpy()]

# RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = mean_squared_error(y_test_trim, y_pred_trim) ** 0.5
r2 = r2_score(y_test_trim, y_pred_trim)
mape = mean_absolute_percentage_error(y_test_trim, y_pred_trim)

print(f"RMSE: {rmse}")
print(f"R_2: {r2}")
print(f"MAP Error: {mape}")

# Scatter plot of actual vs. predicted values for the trimmed subset; `t`
# (end of the dashed diagonal) is defined in the previous evaluation cell.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test_trim, y=y_pred_trim)
plt.plot([0, t], [0, t], color="red", linestyle="--")
plt.xlabel("Reality")
plt.ylabel("Prediction")
plt.xlim(0, 3000)
plt.title("Real vs. Predicted Values")
plt_savefig("catboost_actual-vs-predicted_trim_dist")
plt.show()