# DS ML Project ⟡ Flight Delay Prediction Challenge

## Synopsis

**TODO** Write this paragraph

## Requirements

In [None]:
# Data Science
import pandas as pd
import missingno as msno

# Scientific Computation
import numpy as np
import scipy.stats as stats

# Scikit-Learn Tools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Predictive Power Score (PPS)
import ppscore as pps

import warnings

# Configs

Next, let us specify all required configurations to have them in one place.

In [None]:
# Central configuration cell: warning filter, data path, random seed,
# plot resolution and matplotlib style.

# Action passed to warnings.filterwarnings (applies to the whole kernel)
WARNINGS_LEVEL = "ignore"
warnings.filterwarnings(WARNINGS_LEVEL)

# Path to train data
PATH_DATA_TRAIN = "./data/train.csv"

# Random seed
RSEED = 42

# Resolution when storing plots in files
DPI = 600

# Matplotlib style
PLT_STYLE = "seaborn"
# matplotlib >= 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*";
# the old name was removed later, so try the new name as a fallback.
PLT_STYLE_FALLBACK = "seaborn-v0_8"

for style_name in (PLT_STYLE, PLT_STYLE_FALLBACK):
    try:
        plt.style.use(style_name)
    except OSError:
        # plt.style.use raises OSError for an unknown style name or path.
        continue
    break
else:
    # The global filter above is "ignore", which would hide this message;
    # re-enable warnings locally so the failure is actually reported.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        warnings.warn(f"Could not load matplotlib style {PLT_STYLE}", UserWarning)

We read the data from file into a pandas data frame and create a copy that will incorporate our manipulations.

In [None]:
# df_0 keeps the data exactly as read from disk; df is an independent
# (deep) copy that all later manipulations are applied to.
df_0 = pd.read_csv(filepath_or_buffer=PATH_DATA_TRAIN)
df = df_0.copy(deep=True)

In [None]:
# Print a quick textual overview: first rows, (rows, columns) shape,
# missing values per column and the column dtypes.
for overview in (df.head(), df.shape, df.isna().sum(), df.dtypes):
    print(overview)

Variable definitions (according to <https://zindi.africa/competitions/flight-delay-prediction-challenge/data>):

- **DATOP**: Date of flight
- **FLTID**: Flight number
- **DEPSTN**: Departure point
- **ARRSTN**: Arrival point
- **STD**: Scheduled time of departure
- **STA**: Scheduled time of arrival
- **STATUS**: Flight status
- **ETD**: Expected time of departure
- **ETA**: Expected time of arrival
- **ATD**: Actual time of departure
- **ATA**: Actual time of arrival
- **DELAY1**: Delay code 1
- **DUR1**: Delay time 1
- **DELAY2**: Delay code 2
- **DUR2**: Delay time 2
- **DELAY3**: Delay code 3
- **DUR3**: Delay time 3
- **DELAY4**: Delay code 4
- **DUR4**: Delay time 4
- **AC**: Aircraft Code

In [None]:
# Per flight status: number of rows plus mean and median of the target.
# ("statuses" is used as the plural of "status".)
statuses = df["STATUS"].unique()

print("All Statuses:")
for status in statuses:
    # Filter once per status instead of re-evaluating the mask for each print.
    status_rows = df[df["STATUS"] == status]
    print(f"  Number of entries of {status}: {status_rows.shape[0]}")
    print(f"  Mean: {status_rows['target'].mean()}")
    print(f"  Median: {status_rows['target'].median()}")

**TODO** Find out the meaning of statuses

<!-- TODO Find out the meaning of statuses -->

In [None]:
# Replace the "." separators in STA with ":" so the column can be parsed as a
# datetime. regex=False forces a literal match: on pandas < 2.0 the default is
# regex=True, where "." matches ANY character and would wipe the whole string.
df["STA"] = df["STA"].str.replace(".", ":", regex=False)

In [None]:
# Preview the first rows of the working frame after the STA separator replacement.
df.head()

In [None]:
# Expected textual format of each date/time column; every column is
# converted in place to a pandas datetime using its explicit format.
datetime_formats = {
    "DATOP": "%Y-%m-%d",
    "STD": "%Y-%m-%d %H:%M:%S",
    "STA": "%Y-%m-%d %H:%M:%S",
}
for column_name, datetime_format in datetime_formats.items():
    df[column_name] = pd.to_datetime(df[column_name], format=datetime_format)

In [None]:
# Derive calendar features from the flight date column DATOP, one new column each.
df["DATOP_year"] = df["DATOP"].dt.year
df["DATOP_month"] = df["DATOP"].dt.month
# NOTE: despite its name, DATOP_day holds the day of the WEEK shifted to 1..7
# (pandas dayofweek is 0 = Monday .. 6 = Sunday), not the day of the month.
df["DATOP_day"] = df["DATOP"].dt.dayofweek + 1

In [None]:
# Scheduled flight duration in minutes: scheduled arrival (STA) minus
# scheduled departure (STD), converted from seconds.
scheduled_duration = df["STA"] - df["STD"]
df["flight_time"] = scheduled_duration.dt.total_seconds().div(60)

In [164]:
# Preview the first rows of the working frame.
# NOTE(review): the saved output below still shows STA as "12.55.00" and lacks the
# derived columns, so it appears to predate the cells above - re-run to refresh.
df.head()


Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


A PP-score matrix:

In [None]:
# Compute the PP score for the column pairs of df and reshape the long result
# (columns "x", "y", "ppscore") into a matrix: features x as columns, targets y as rows.
pp_scores = pps.matrix(df)[["x", "y", "ppscore"]].pivot(
    columns="x", index="y", values="ppscore"
)

# Two decimals keep the cell annotations readable.
pp_scores = pp_scores.round(2)

plt.figure(figsize=(12, 8))

# Colour scale fixed to [0, 1] via vmin/vmax.
sns.heatmap(
    pp_scores,
    vmin=0,
    vmax=1,
    cmap="Reds",
    linewidths=0.5,
    annot=True,
)

# Save before showing; NOTE: the ./img directory must already exist.
plt.savefig("./img/pp-scores.png", dpi=DPI, bbox_inches="tight")

# Render the figure. The previous bare plt.plot() drew nothing and only
# left an empty list as the cell output.
plt.show()

Pairwise scatterplots (pair plot) of the numeric columns:

In [None]:
# Pair plot of the working frame.
sns.pairplot(data=df)

# Export is left disabled because it takes really long:
# plt.savefig("./img/pairplot.png", dpi=DPI, bbox_inches="tight")

plt.show()

In [None]:
# Years present in the data, sorted so the figures appear chronologically.
DATOP_years = np.sort(df["DATOP_year"].unique())

# Half-integer bin edges (0.5, 1.5, ..., 12.5) give exactly one bar per
# calendar month, centred on the integer month ticks, even for years that
# do not cover all twelve months.
month_bin_edges = np.arange(0.5, 13.5, 1.0)

# One histogram of flights per month for every year.
for year in DATOP_years:
    fig, ax = plt.subplots(figsize=(8, 4))
    df_year = df[df["DATOP_year"] == year]
    df_year["DATOP_month"].hist(bins=month_bin_edges, ax=ax)
    ax.set_title(f"Flight Distribution per Month – {year}")
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of Flights")
    ax.set_xticks(range(1, 13))
    fig.tight_layout()
    plt.show()

In [None]:
# One line plot per year: sum of the target, grouped by month.
for year in DATOP_years:
    fig, ax = plt.subplots(figsize=(8, 4))
    df_year = df[df["DATOP_year"] == year]
    df_year.groupby("DATOP_month")["target"].sum().plot(
        kind="line",
        ax=ax,  # draw on this iteration's axes explicitly
        title=f"Monthly Sum of Target for {year}",
        xlabel="Month",
        ylabel="Sum of Target",
    )
    # Finish each figure inside the loop, like the histogram cell does.
    fig.tight_layout()
    plt.show()

# Baseline Model (a.k.a. A Feeble Try), Version I

**Hypothesis**: The flight delay can be predicted from the Aircraft Code.

In [None]:
# One-hot encode the aircraft code; the indicator columns get the prefix "AC".
df_encoded = pd.get_dummies(data=df, prefix="AC", columns=["AC"])

# Features: every encoded column except the target. Label: the target column.
X = df_encoded.drop(columns="target")
y = df["target"]

# Hold out 25 % of the rows for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RSEED,
    test_size=0.25,
)

Fit -- Predict -- Evaluate

In [None]:
# Use only the one-hot aircraft-code columns (names starting with "AC_").
cols = df_encoded.columns[df_encoded.columns.str.startswith("AC_")].tolist()

X_0, y_0 = X_train[cols], y_train
X_1, y_1 = X_test[cols], y_test

# Baseline: 3-nearest-neighbours regressor
# (alternative that was tried: LinearRegression()).
model = KNeighborsRegressor(n_neighbors=3).fit(X_0, y_0)

y_pred = model.predict(X_1)

# Evaluate on the held-out split; only the RMSE is reported,
# R² is computed but not printed.
mse = mean_squared_error(y_1, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_1, y_pred)

print("Root Mean Squared Error:", rmse)
In [None]:
# X_test.to_csv("data/X_test.csv")
# y_test.to_csv("data/y_test.csv")