In [219]:
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier


### Sette basen for de ulike dataene.

In [220]:
# Root folder that holds the competition CSV files (absolute, machine-specific path).
base = Path("/Users/hodanielkhuu/Downloads/202509_Datasett")

### Importere de nødvendige filene

In [221]:
# Read the three input files from `base`: the training rows (these carry `target`),
# the rows to score, and preds_mal.csv (presumably the submission template — confirm).
df_train = pd.read_csv(base / "training_data.csv")
df_infer = pd.read_csv(base / "inference_data_oct2025.csv")
df_mal = pd.read_csv(base / "preds_mal.csv")


In [222]:
# Show the row/column counts and the column names of each frame.
named_frames = {"Train": df_train, "Infer": df_infer, "Mal": df_mal}

for label, frame in named_frames.items():
    print(f"{label} Shape:", frame.shape)

print()  # blank separator line between the two listings
for label, frame in named_frames.items():
    print(f"{label} Columns:", frame.columns.tolist())

Train Shape: (465031, 7)
Infer Shape: (5208, 6)
Mal Shape: (5208, 4)

Train Columns: ['airport_group', 'date', 'hour', 'target', 'feat_season', 'feat_sched_flights_cnt', 'feat_sched_concurrence']
Infer Columns: ['airport_group', 'date', 'hour', 'feat_season', 'feat_sched_flights_cnt', 'feat_sched_concurrence']
Mal Columns: ['airport_group', 'date', 'hour', 'pred']


In [223]:
# Look for duplicated rows in the training data. Only the key columns are
# compared, because they are what ensures the load is correct.
key_cols = ['airport_group', 'date', 'hour']
n_rows = len(df_train)
n_unique_rows = len(df_train[key_cols].drop_duplicates())

has_duplicates = n_rows != n_unique_rows
if has_duplicates:
    print(f"Funnet {n_rows - n_unique_rows} duplikater i treningsdata.")
else:
    print("Ingen duplikater funnet i treningsdata.")

Ingen duplikater funnet i treningsdata.


In [224]:
# Compare the train and inference columns to see what has to be predicted.
cols_train, cols_infer = set(df_train.columns), set(df_infer.columns)

extra_in_train = cols_train.difference(cols_infer)
extra_in_infer = cols_infer.difference(cols_train)

print("\nKolonner i train, men ikke i infer:", extra_in_train)
print("\nKolonner i infer, men ikke i train:", extra_in_infer)
# Finding `target` only in train is the expected result: it is what we predict.


Kolonner i train, men ikke i infer: {'target'}

Kolonner i infer, men ikke i train: set()


## Kopiere baseline over hit


In [225]:
#Importere pakker
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, brier_score_loss

In [226]:
# Data folder and the column names used throughout the baseline.
base = Path("/Users/hodanielkhuu/Downloads/202509_Datasett")
KEY = ["airport_group", "date", "hour"]
GROUP, DATE, HOUR = KEY
DOW = "dow"

# Reload the inputs, this time parsing the date column of train/infer while reading.
date_columns = [DATE]
df_train = pd.read_csv(base / "training_data.csv", parse_dates=date_columns)
df_infer = pd.read_csv(base / "inference_data_oct2025.csv", parse_dates=date_columns)
df_mal = pd.read_csv(base / "preds_mal.csv")

In [227]:
# Give the key columns the same dtypes in all three frames so that keys
# compare equal across them.
for df in (df_train, df_infer, df_mal):
    df[DATE] = pd.to_datetime(df[DATE])
    df[HOUR] = df[HOUR].astype(int)


# Every frame must have unique keys. df_infer is checked as well: the set
# comparison further down collapses duplicates and would not notice them,
# yet a duplicated inference key would yield a duplicated prediction row.
for name, df in (("train", df_train), ("infer", df_infer), ("mal", df_mal)):
    dupes = df.duplicated(KEY).sum()
    assert dupes == 0, f"{name} has {dupes} duplicate keys"

# The rows to score and the mal frame must cover exactly the same keys.
infer_keys = set(map(tuple, df_infer[KEY].to_numpy()))
mal_keys = set(map(tuple, df_mal[KEY].to_numpy()))
assert infer_keys == mal_keys, "Mismatch between inference and mal keys"



## Calendar features and time split

In [228]:
# Day-of-week column on every frame.
for df in (df_train, df_infer, df_mal):
    df[DOW] = df[DATE].dt.dayofweek

# Hold out the most recent calendar month of the training data for validation.
train_month = df_train[DATE].dt.to_period("M")
last_month = train_month.max()
mask_val = train_month == last_month
valid = df_train.loc[mask_val].copy()
train_hist = df_train.loc[~mask_val].copy()

if train_hist.empty:
    raise ValueError("Training history is empty; need at least two months.")

for label, part in (("Train range:", train_hist), ("Valid range:", valid)):
    print(label, part[DATE].min(), "→", part[DATE].max())
print("Last month:", last_month)



Train range: 2018-01-01 00:00:00 → 2025-06-30 00:00:00
Valid range: 2025-07-01 00:00:00 → 2025-07-31 00:00:00
Last month: 2025-07


## Historical Rates Tables from train_hist only

In [229]:
# Mean of `target` over the training history. Used below as the smoothing
# prior and as the fill value for keys that have no entry in the rate tables.
rate_global = train_hist["target"].mean()
# Smoothing strength: the prior enters the smoothed rate with the weight of
# `alpha` observations.
alpha = 20.0


def build_rate_table(df: pd.DataFrame, cols: list[str], prior=None, smoothing=None) -> pd.DataFrame:
    """Aggregate `target` per key and return raw and smoothed rates.

    Parameters
    ----------
    df : frame with a `target` column; must not be empty.
    cols : columns to group by (missing key values form their own group).
    prior : rate the smoothed estimate is pulled towards. Defaults to the
        notebook-level `rate_global`.
    smoothing : weight of the prior, expressed as a number of observations.
        Defaults to the notebook-level `alpha`.

    Returns
    -------
    One row per key with the columns `count`, `sum1`, `rate`
    (sum1 / count) and `rate_smoothed`
    ((sum1 + smoothing * prior) / (count + smoothing)).

    Raises
    ------
    AssertionError if `target` is missing or `df` is empty.
    """
    assert "target" in df.columns and len(df) > 0

    # Fall back to the notebook settings so that existing two-argument calls
    # behave exactly as before; explicit values remove the dependency on
    # whatever the globals happen to hold when the function is called.
    prior = rate_global if prior is None else prior
    smoothing = alpha if smoothing is None else smoothing

    agg = (
        df.groupby(cols, dropna=False)
        .agg(count=("target", "size"), sum1=("target", "sum"))
        .reset_index()
    )
    # Groups without rows (count == 0) get the prior instead of 0/0.
    agg["rate"] = np.where(agg["count"] > 0, agg["sum1"] / agg["count"], prior)
    agg["rate_smoothed"] = (agg["sum1"] + smoothing * prior) / (agg["count"] + smoothing)
    return agg

# Rate tables built from train_hist only: one keyed by (group, hour) and one
# keyed by (group, day-of-week, hour).
tbl_A = build_rate_table(train_hist, [GROUP, HOUR])
tbl_B = build_rate_table(train_hist, [GROUP, DOW, HOUR])

# Keep only smoothed rates and rename to feature names
tbl_A_feat = tbl_A[[GROUP, HOUR, "rate_smoothed"]].rename(
    columns={"rate_smoothed":"rate_group_hour"}
)
tbl_B_feat = tbl_B[[GROUP, DOW, HOUR, "rate_smoothed"]].rename(
    columns={"rate_smoothed":"rate_group_dow_hour"}
)
print("Rate tables built. Sizes:", len(tbl_A_feat), len(tbl_B_feat), "rate_global=", round(rate_global,4))


# Key columns and labels of the validation split, copied before any features are joined.
valid_X = valid[[GROUP, DATE, HOUR, DOW]].copy()
valid_y = valid["target"]



Rate tables built. Sizes: 168 1176 rate_global= 0.2219


### Join rate features and simple calendar flags



In [230]:
def join_rates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with the two smoothed-rate features attached.

    `rate_group_hour` is looked up in `tbl_A_feat` by (group, hour) and
    `rate_group_dow_hour` in `tbl_B_feat` by (group, day-of-week, hour).
    Keys that are absent from a table receive `rate_global`.

    The function is safe to call on a frame that was already joined: stale
    rate columns are dropped first, so re-running the cell does not produce
    `_x`/`_y` suffixed columns followed by a KeyError.

    Raises
    ------
    pandas.errors.MergeError if a rate table holds duplicate keys, which
    would otherwise silently multiply the rows of `df`.
    """
    rate_cols = ["rate_group_hour", "rate_group_dow_hour"]
    # Discard results of a previous join so the merges below cannot collide.
    out = df.drop(columns=rate_cols, errors="ignore")
    out = out.merge(tbl_A_feat, on=[GROUP, HOUR], how="left", validate="many_to_one")
    out = out.merge(tbl_B_feat, on=[GROUP, DOW, HOUR], how="left", validate="many_to_one")
    # Keys unseen in the training history fall back to the global rate.
    out["rate_group_hour"] = out["rate_group_hour"].fillna(rate_global)
    out["rate_group_dow_hour"] = out["rate_group_dow_hour"].fillna(rate_global)
    return out

# Attach the rate features to all three frames; each name is rebound to its
# joined version.
train_hist = join_rates(train_hist)
valid = join_rates(valid)
df_infer = join_rates(df_infer)

# Weekend flag: 1 where the day-of-week value is 5 or 6, else 0.
for df in (train_hist,valid,df_infer):
    df["is_weekend"] = (df[DOW] >= 5).astype(int)

print("Rate features joined and calendar flags added.")

Rate features joined and calendar flags added.


### Lag features

In [231]:
# --- Part 6: Lag features (strictly past within group) ---
def add_lags(df: pd.DataFrame, group_col: str, cols: list[str], lags=(1,)) -> pd.DataFrame:
    """Return a sorted copy of `df` with row-shifted lag columns added.

    Rows are ordered by (group_col, DATE, HOUR). For every column in `cols`
    and every step in `lags`, a column `<col>_lag<step>` holds the value
    found `step` rows earlier within the same group; the first `step` rows
    of each group are NaN. The lag is counted in rows, not in hours.
    """
    ordered = df.sort_values([group_col, DATE, HOUR]).copy()
    for col in cols:
        # Group once per source column; every lag step reuses the grouping.
        per_group = ordered.groupby(group_col)[col]
        for step in lags:
            ordered[f"{col}_lag{step}"] = per_group.shift(step)
    return ordered

lag_cols = ["feat_sched_flights_cnt", "feat_sched_concurrence"]

# Lags are computed separately inside each frame.
train_hist = add_lags(train_hist, GROUP, lag_cols, lags=(1,))
valid      = add_lags(valid,      GROUP, lag_cols, lags=(1,))
df_infer   = add_lags(df_infer,   GROUP, lag_cols, lags=(1,))

# The first row of every group has no predecessor, so its lag is NaN → use 0.
lag1_cols = [f"{name}_lag1" for name in lag_cols]
for df in (train_hist, valid, df_infer):
    df[lag1_cols] = df[lag1_cols].fillna(0.0)

print("Lag features created.")


Lag features created.


# --- Part 7: Freeze feature list & build X/y ---


In [232]:
# Columns fed to both models, in a fixed order shared by train, validation
# and inference.
feature_cols = [
    "hour",
    DOW,
    "is_weekend",
    "feat_sched_flights_cnt",
    "feat_sched_concurrence",
    "feat_sched_flights_cnt_lag1",
    "feat_sched_concurrence_lag1",
    "rate_group_hour",
    "rate_group_dow_hour",
    # Optional cyclics later: "hour_sin", "hour_cos",
]

# Training matrix and integer labels from the history split.
X_train = train_hist[feature_cols].copy()
y_train = train_hist["target"].astype(int)

# Same for the held-out validation month.
X_val = valid[feature_cols].copy()
y_val = valid["target"].astype(int)

# True when validation holds fewer than two label values; the model cells
# then report AUC as NaN instead of calling roc_auc_score.
single_class_val = (y_val.nunique() < 2)
print("Feature matrix shapes:", X_train.shape, X_val.shape)


Feature matrix shapes: (459984, 9) (5047, 9)


In [233]:
# --- Part 8: Logistic Regression (scaled) ---
# The scaler is fitted on the training split only and reused for validation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

lr = LogisticRegression(max_iter=500, solver="lbfgs").fit(X_train_scaled, y_train)
pred_lr = lr.predict_proba(X_val_scaled)[:, 1]

# AUC is only computed when validation holds both classes.
if single_class_val:
    auc_lr = np.nan
else:
    auc_lr = roc_auc_score(y_val, pred_lr)
brier_lr = brier_score_loss(y_val, pred_lr)

if np.isnan(auc_lr):
    print("[LR]   AUC=NA (single-class val)")
else:
    print(f"[LR]   AUC={auc_lr:.4f}")
print(f"[LR]   Brier={brier_lr:.4f}")


[LR]   AUC=0.9640
[LR]   Brier=0.0582


## HistGradientBoosting classifier


In [234]:
# Gradient-boosted trees on the unscaled features.
hgb_params = dict(
    learning_rate=0.05,
    max_iter=400,
    min_samples_leaf=40,
    random_state=42,
)
hgb = HistGradientBoostingClassifier(**hgb_params).fit(X_train, y_train)
pred_hgb = hgb.predict_proba(X_val)[:, 1]

# AUC is only computed when validation holds both classes.
if single_class_val:
    auc_hgb = np.nan
else:
    auc_hgb = roc_auc_score(y_val, pred_hgb)
brier_hgb = brier_score_loss(y_val, pred_hgb)

if np.isnan(auc_hgb):
    print("[HGB]  AUC=NA (single-class val)")
else:
    print(f"[HGB]  AUC={auc_hgb:.4f}")
print(f"[HGB] Brier={brier_hgb:.4f}")



[HGB]  AUC=0.9731
[HGB] Brier=0.0503


## Model selection: prefer AUC, else Brier

In [235]:
def pick_best(auc_lr, brier_lr, auc_hgb, brier_hgb):
    """Choose between the fitted `lr` and `hgb` models.

    When both AUC values are available the higher AUC wins; otherwise the
    lower Brier score wins. Ties go to HGB in both cases.

    Returns a `(name, model)` tuple, with name either "hgb" or "lr".
    """
    auc_unavailable = np.isnan(auc_lr) or np.isnan(auc_hgb)
    if auc_unavailable:
        hgb_wins = brier_hgb <= brier_lr
    else:
        hgb_wins = auc_hgb >= auc_lr

    if hgb_wins:
        return ("hgb", hgb)
    return ("lr", lr)

# Pick the model used for the submission from the validation metrics above.
best_name, best_model = pick_best(auc_lr, brier_lr, auc_hgb, brier_hgb)
print(f"Selected best model: {best_name.upper()}")

Selected best model: HGB


In [236]:
# --- Part 11: Inference build → predict → save submission ---
X_infer = df_infer[feature_cols].copy()

# Only the logistic regression was trained on standardised inputs.
if best_name == "lr":
    X_infer_scaled = scaler.transform(X_infer)
    p_infer = best_model.predict_proba(X_infer_scaled)[:, 1]
else:
    p_infer = best_model.predict_proba(X_infer)[:, 1]

p_infer = np.clip(p_infer, 0, 1)
p_infer = np.round(p_infer, 3)

# Pair every prediction with its key. df_infer was re-sorted when the lag
# features were added, so its row order is not necessarily that of df_mal.
scored = df_infer[KEY].copy()
scored["pred"] = p_infer

# Re-align to the keys and row order of df_mal. `validate` makes pandas raise
# if either side holds duplicate keys instead of silently multiplying rows.
submission = df_mal[KEY].merge(scored, on=KEY, how="left", validate="one_to_one")

# Basic sanity checks
assert len(submission) == len(df_mal), "Submission and mal row counts differ"
assert submission["pred"].notna().all(), "Some mal keys received no prediction"
assert submission["pred"].between(0, 1).all()

# Keep only the competition schema (order matters)
submission = submission[KEY + ["pred"]]

# Final assert to be safe
assert list(submission.columns) == KEY + ["pred"]

# Save to disk. The original expression joined an undefined `save_dir` with
# this absolute path; joining with an absolute path discards the left-hand
# side, so the effective target was always this file. The folder is created
# first so a fresh checkout does not fail on the write.
out_path = Path("/Users/hodanielkhuu/vscode/avinor/outputs/preds_ml.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(out_path, index=False)
print(f"Saved {out_path}")



Saved /Users/hodanielkhuu/vscode/avinor/outputs/preds_ml.csv


In [242]:
# Give the HGB validation predictions the same index as y_val so that both
# can be sliced with the same labels via .loc.
pred_hgb_series = pd.Series(pred_hgb, index=y_val.index)


In [243]:
# Validation AUC and Brier score of the HGB model, one row per airport group.
group_metrics = []
for g, df_g in valid.groupby("airport_group"):
    if df_g["target"].nunique() >= 2:
        idx = df_g.index
        y_true_g, y_prob_g = y_val.loc[idx], pred_hgb_series.loc[idx]
        auc = roc_auc_score(y_true_g, y_prob_g)
        brier = brier_score_loss(y_true_g, y_prob_g)
        row = (g, len(df_g), auc, brier)
    else:
        # A single-class group gets NaN for both metrics.
        row = (g, len(df_g), np.nan, np.nan)
    group_metrics.append(row)

group_df = pd.DataFrame(group_metrics, columns=["airport_group","n","AUC","Brier"])
print(group_df.sort_values("n", ascending=False))


  airport_group    n       AUC     Brier
0             A  721  0.940967  0.050347
1             B  721  0.974890  0.058303
2             C  721  0.994713  0.014529
3             D  721  0.979583  0.048810
4             E  721  0.970005  0.077572
5             F  721  0.973726  0.029333
6             G  721  0.942403  0.073485
