# Delay Modelling with Linear Regression

We aim to predict the flight delay from a single feature such as the weekday or the aircraft code.

The following libraries are required:

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

import matplotlib.pyplot as plt
import seaborn as sns

from local_nbutils import CFG, plt_savefig

We load the processed data from disk:

In [None]:
# Read the processed data set from the pickle file whose path is configured
# under CFG["PROCESSED_DATA_PATH"].
processed_data_path = CFG["PROCESSED_DATA_PATH"]
df = pd.read_pickle(processed_data_path)

We set up and fit a **linear regression** model that uses the weekday as its sole covariate.

In [None]:
# Categorical column used as the only covariate of the model. Swap in the
# commented-out alternative below to model the delay by aircraft code instead.
col = "DATOP_day"
# col = "AC"

# One-hot encode only the chosen column; every indicator column is named
# f"{col}_<level>".
df_encoded = pd.get_dummies(df, columns=[col], prefix=col)

y = df["target"]
X = df_encoded.drop("target", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=CFG["RSEED"]
)

# The indicator columns are exactly the columns that get_dummies added.
# Selecting them this way (rather than by name prefix) cannot accidentally
# pick up a pre-existing column whose name merely starts with f"{col}_".
original_columns = set(df.columns)
cols = [col_ for col_ in df_encoded.columns if col_ not in original_columns]

# Reduce the design matrices to the indicator columns of the single covariate.
X_train_red = X_train[cols]
X_test_red = X_test[cols]

# NOTE(review): all levels are kept (no drop_first) while LinearRegression
# also fits an intercept by default, so the indicator columns are collinear
# with the intercept and the individual coefficients are not uniquely
# identified. Consider drop_first=True if the coefficients are to be
# interpreted.
model = LinearRegression()
model.fit(X_train_red, y_train)

print("Fitted parameters:")
print("  Coefficients:", model.coef_)
print("  Intercept:   ", model.intercept_)

To analyse the prediction errors of this model, we compute the RMSE, R² and MAPE on the test set and plot the predicted values against the actual values:

In [None]:
# Predict on the held-out test set and summarise the prediction errors.
y_pred = model.predict(X_test_red)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# NOTE(review): MAPE divides each error by the actual value, so it becomes
# extremely large if y_test contains zeros — confirm the metric is meaningful
# for this target.
mape = mean_absolute_percentage_error(y_test, y_pred)

print("RMSE      :", rmse)
print("R_2       :", r2)
print("MAP Error :", mape)

# Upper end of the dashed identity line; perfect predictions would lie on it.
t = 1000

# Plot actual vs. predicted values
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([0, t], [0, t], color="red", linestyle="--")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.xlim(0, 3000)
# The figure shows predictions against actual values, not residuals, so the
# title is labelled accordingly (matching the axis labels and the file name).
plt.title("Actual vs. Predicted Values")
plt_savefig("lin-reg_actual-vs-predicted_dist")
plt.show()