# EDA Experiments Moritz

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

from ipynb_utils import CFG, plt_savefig

In [None]:
# Load the raw training CSV; the path is taken from CFG (imported from ipynb_utils).
# Alternative source: pd.read_pickle(CFG["PROCESSED_DATA_PATH"]).
train_csv_path = CFG["TRAIN_DATA_PATH"]
df = pd.read_csv(train_csv_path)

In [None]:
# Quick structural overview: sample rows, dimensions, missing values per column, dtypes.
for overview in (df.head(), df.shape, df.isna().sum(), df.dtypes):
    print(overview)
# Kept as the final expression so the notebook renders the distinct-value counts.
df.nunique()

In [None]:
# Build a "<departure>-<arrival>" label per row from the two station columns.
departure, arrival = df["DEPSTN"], df["ARRSTN"]
df["route"] = departure + "-" + arrival

In [None]:
# Scatter of the target against the route label.
df.plot.scatter(x="route", y="target")

In [None]:
print(df["STATUS"].unique())

# Per-status row count plus mean and median of the target.
for status in ("ATA", "DEP", "RTR", "SCH", "DEL"):
    # Filter once and reuse the selection for all three statistics.
    target_for_status = df.loc[df["STATUS"] == status, "target"]
    print(f"Number of entries of {status}: {len(target_for_status)}")
    print(f"Mean: {target_for_status.mean()}")
    print(f"Median: {target_for_status.median()}")

In [None]:
# Replace "." with ":" in STA so it can later be parsed with the same
# "%Y-%m-%d %H:%M:%S" format as STD (presumably STA arrives as "HH.MM.SS").
# regex=False is required: when the pattern is treated as a regular expression
# (the default before pandas 2.0) "." matches ANY character, which would turn
# the whole timestamp into a run of ":" characters.
df["STA"] = df["STA"].str.replace(".", ":", regex=False)

In [None]:
# Histogram of the target with 50 bins; log=True puts the count axis on a log scale.
df["target"].hist(bins=50, log=True)

In [None]:
# Show the first 20 rows of the working frame.
df.head(20)

In [None]:
# Convert the date/time columns from strings to datetimes using explicit formats.
datetime_formats = {
    "DATOP": "%Y-%m-%d",
    "STD": "%Y-%m-%d %H:%M:%S",
    "STA": "%Y-%m-%d %H:%M:%S",
}
for column, fmt in datetime_formats.items():
    df[column] = pd.to_datetime(df[column], format=fmt)

In [None]:
# Derive year, month and weekday columns from the DATOP datetime column.
datop = df["DATOP"].dt
df["DATOP_year"] = datop.year
df["DATOP_month"] = datop.month
# dayofweek is zero-based; add 1 to get a 1-based weekday number.
df["DATOP_day"] = datop.dayofweek + 1

In [None]:
# Difference between STA and STD, expressed in minutes.
sta_minus_std = df["STA"] - df["STD"]
df["flight_time"] = sta_minus_std.dt.total_seconds().div(60)

In [None]:
# Show the first rows, now including the derived date and flight_time columns.
df.head()

In [None]:
print(df["DATOP_year"].unique())

# One histogram of flights per month for each year, saved and then shown.
for year in (2016, 2017, 2018):
    months_in_year = df.loc[df["DATOP_year"] == year, "DATOP_month"]

    plt.figure(figsize=(8, 4))
    months_in_year.hist(bins=12)
    plt.xticks(range(1, 13))
    plt.xlabel("Month")
    plt.ylabel("Number of Flights")
    plt.title(f"Flight Distribution per Month – {year}")
    plt.tight_layout()
    plt_savefig(f"month-to-sum-flight-on-{year}_hist")
    plt.show()

In [None]:
print(df["DATOP_year"].unique())

# Line plot of the monthly target sum, one figure per year.
for year in (2016, 2017, 2018):
    plt.figure(figsize=(8, 4))
    rows_in_year = df[df["DATOP_year"] == year]
    monthly_target_sum = rows_in_year.groupby("DATOP_month")["target"].sum()
    monthly_target_sum.plot.line(
        title=f"Monthly Sum of Target for {year}",
        xlabel="Month",
        ylabel="Sum of Target",
    )

In [None]:
# One-hot encode the categorical columns one after another (same order as before,
# so the resulting column order is unchanged). drop_first=True drops the first
# level of each column; the dummies are stored as ints.
dummy_prefix_by_column = (
    ("DATOP_day", "day"),
    ("DATOP_year", "yr"),
    ("DATOP_month", "mon"),
    ("DEPSTN", "dep"),
    ("ARRSTN", "arr"),
    ("AC", "ac"),
)

df2 = df
for column, prefix in dummy_prefix_by_column:
    df2 = pd.get_dummies(
        df2, columns=[column], prefix=prefix, drop_first=True, dtype=int
    )

In [None]:
# Show the first rows of the one-hot encoded frame.
df2.head()

In [None]:
# Separate the predictors from the target, then hold out 25% of the rows for
# testing; the seed comes from CFG so the split is reproducible.
X = df2.drop(columns="target")
y = df2["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=CFG["RSEED"], test_size=0.25
)

In [None]:
# Target histograms for the train and test splits (50 bins, log-scaled counts).
# NOTE(review): both calls appear to draw onto the same axes without labels —
# confirm that overlaying the two distributions is intended.
y_train.hist(bins=50, log=True)
y_test.hist(bins=50, log=True)

In [None]:
# Show the first rows of the encoded frame again.
df2.head()

Moritz - Add weekday to model

- Weekday vs delay

In [None]:
# Correlation between the per-weekday target sum and the per-weekday flight count.
by_weekday = df.groupby("DATOP_day")
target_sum_per_weekday = by_weekday["target"].sum()
flights_per_weekday = by_weekday["ID"].count()
target_sum_per_weekday.corr(flights_per_weekday)

In [None]:
# Target sum per weekday, one line per year (years become the columns).
target_sum_by_year = (
    df.groupby(["DATOP_year", "DATOP_day"])["target"].sum().unstack(level=0)
)

ax = target_sum_by_year.plot.line(figsize=(10, 6))
ax.set_title("Hours delay per weekday by year")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Sum of hours delay")
ax.legend(title="Year")
ax.grid(True)
plt.show()

In [None]:
# Number of flights per weekday, one line per year (years become the columns).
flight_count_by_year = (
    df.groupby(["DATOP_year", "DATOP_day"])["ID"].count().unstack(level=0)
)

ax = flight_count_by_year.plot.line(figsize=(10, 6))
ax.set_title("Number of flights per weekday by Year")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Number of Flights")
ax.legend(title="Year")
ax.grid(True)
plt_savefig("weekday-to-delay-by-year_graph")
plt.show()

In [None]:
# Show the first rows of the encoded frame before selecting model features.
df2.head()

In [None]:
# Prefixes of the dummy columns produced by the pd.get_dummies calls. They must
# match the `prefix=` arguments used there: months are encoded with "mon", so the
# previous "mt_" entry matched nothing and silently dropped every month feature.
prefixes = ("day_", "yr_", "mon_", "ac_", "dep_", "arr_")

# All dummy columns plus the numeric flight_time column.
# str.startswith accepts a tuple, so one call checks every prefix.
feature_cols = [col for col in df2.columns if col.startswith(prefixes)] + [
    "flight_time"
]

x0 = X_train[feature_cols]
x1 = X_test[feature_cols]

# Baseline model. (Alternative that was tried: KNeighborsRegressor(n_neighbors=5).)
model = LinearRegression()

model.fit(x0, y_train)
y_pred_test = model.predict(x1)

# Test-set RMSE followed by R^2.
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(r2_score(y_test, y_pred_test))

In [None]:
# Base XGBoost regressor; the seed comes from CFG, library logging is silenced.
xgb_model = XGBRegressor(verbosity=0, random_state=CFG["RSEED"])

# Hyperparameter candidates to evaluate (16 combinations in total).
param_grid = dict(
    n_estimators=[100],
    max_depth=[10, 30],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 5-fold cross-validated grid search, scored by negative MSE, using all cores.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x0, y_train)

# Score the best estimator on the held-out test set.
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(x1)

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)

print("Best Parameters:", grid_search.best_params_)
print("RMSE:", test_rmse)
print("R_2:", test_r2)

In [None]:
# List the columns currently present in the (non-encoded) working frame.
df.columns

In [None]:
# Scatter of the target against flight_time over the full range; saved, then shown.
df.plot.scatter(x="flight_time", y="target")
plt_savefig("flight-time-to-delay_scatterplot")
plt.show()

In [None]:
# Same scatter, with the x-axis restricted to flight_time values between 0 and 1000.
ax = df.plot.scatter(x="flight_time", y="target")
ax.set_xlim(0, 1000)

plt_savefig("flight-time-to-delay_trimmed_scatterplot")
plt.show()