In [193]:
#Importere pakker
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, brier_score_loss

In [214]:
# Dataset directory. Set the AVINOR_DATA_DIR environment variable to run the
# notebook on another machine; the default is the original local path, so
# existing runs are unaffected.
base = Path(os.environ.get("AVINOR_DATA_DIR", "/Users/hodanielkhuu/Downloads/202509_Datasett"))

# Every row is identified by (airport group, date, hour).
KEY = ["airport_group", "date", "hour"]
GROUP, DATE, HOUR, DOW = KEY[0], KEY[1], KEY[2], "dow"

# Training and inference frames get their date column parsed at load time.
df_train = pd.read_csv(base / "training_data.csv", parse_dates=[DATE])
df_infer = pd.read_csv(base / "inference_data_oct2025.csv", parse_dates=[DATE])
# preds_mal.csv supplies the keys and row order that the final submission is
# merged onto; its date column is converted to datetime in a later cell.
df_mal = pd.read_csv(base / "preds_mal.csv")

In [215]:
# Convert the template's date column to datetime so its keys are comparable
# with the frames that were loaded with parse_dates.
df_mal[DATE] = pd.to_datetime(df_mal[DATE])

# Each (group, date, hour) key must occur at most once per frame. The inference
# frame is checked too: the set comparison below cannot detect duplicates, and
# a duplicated inference key would duplicate rows when the predictions are
# merged onto the template.
for name, frame in (("train", df_train), ("infer", df_infer), ("mal", df_mal)):
    dupes = frame.duplicated(KEY).sum()
    assert dupes == 0, f"{name} has {dupes} duplicate keys"

# The template and the inference data must cover exactly the same keys.
infer_keys = set(map(tuple, df_infer[KEY].to_numpy()))
mal_keys = set(map(tuple, df_mal[KEY].to_numpy()))
assert infer_keys == mal_keys, "Mismatch between inference and mal keys"



In [212]:
# Day of week (0 = Monday .. 6 = Sunday) is one of the lookup keys of the
# (group, dow, hour) rate table, so every frame used for fitting or prediction
# needs the column. df_infer is included because the inference cell selects
# DOW from it; without this line that selection raises KeyError on a fresh
# kernel unless the CSV already contains a "dow" column.
df_train[DOW] = df_train[DATE].dt.dayofweek
df_infer[DOW] = df_infer[DATE].dt.dayofweek
df_mal[DOW]   = df_mal[DATE].dt.dayofweek

In [191]:
# Hold out the most recent calendar month of the training data for validation;
# every other row forms the history used to estimate rates.
train_months = df_train[DATE].dt.to_period("M")
last_month = train_months.max()
mask_val = train_months == last_month

valid = df_train.loc[mask_val]
train_hist = df_train.loc[~mask_val]



In [196]:
# Smoothing strength: the global rate enters each cell's estimate with the
# weight of `alpha` pseudo-observations (see build_rate_table).
alpha = 20.0

# Without any history there is nothing to estimate rates from.
if train_hist.empty:
    raise ValueError("Training history is empty; need at least two months.")

# Overall mean of the target in the history; used as the smoothing prior and
# as the last-resort prediction.
rate_global = train_hist["target"].mean()


In [197]:
def build_rate_table(df: pd.DataFrame, cols: list[str], smoothing=None, prior=None) -> pd.DataFrame:
    """Aggregate the target per group and compute raw and smoothed rates.

    Parameters
    ----------
    df : DataFrame with a "target" column and every column named in ``cols``.
    cols : columns to group by.
    smoothing : optional float, weight of the prior expressed as a number of
        pseudo-observations. Defaults to the notebook-level ``alpha``.
    prior : optional float, rate that sparse groups are shrunk towards.
        Defaults to the notebook-level ``rate_global``.

    Returns
    -------
    DataFrame with one row per group and the columns ``cols`` + ``count``
    (rows in the group), ``sum1`` (sum of target), ``rate`` (sum1 / count) and
    ``rate_smoothed`` ((sum1 + smoothing * prior) / (count + smoothing)).
    """
    # Fall back to the notebook globals only when the caller gives no value,
    # so existing two-argument calls behave exactly as before.
    if smoothing is None:
        smoothing = alpha
    if prior is None:
        prior = rate_global

    agg = (
        df.groupby(cols)
        .agg(count=("target", "size"), sum1=("target", "sum"))
        .reset_index()
    )
    agg["rate"] = agg["sum1"] / agg["count"]
    agg["rate_smoothed"] = (agg["sum1"] + smoothing * prior) / (agg["count"] + smoothing)
    return agg
      

In [198]:
# Baseline A keys its rates by (group, hour); baseline B additionally splits
# by day of week.
keys_A = [GROUP, HOUR]
keys_B = [GROUP, DOW, HOUR]

tbl_A = build_rate_table(train_hist, keys_A)
tbl_B = build_rate_table(train_hist, keys_B)

# Indexed copies so predictions can be looked up with DataFrame.join(on=...).
tbl_A_idx = tbl_A.set_index(keys_A)
tbl_B_idx = tbl_B.set_index(keys_B)



In [199]:
def predict_A(df_feat: pd.DataFrame) -> pd.Series:
    """Predict with baseline A: the (group, hour) rate table.

    Each row of ``df_feat`` is matched against ``tbl_A_idx`` on GROUP and HOUR.
    The smoothed rate is used where present, otherwise the raw rate; rows with
    no match in the table receive ``rate_global``.
    """
    looked_up = df_feat.join(tbl_A_idx, on=[GROUP, HOUR])
    smoothed = looked_up["rate_smoothed"]
    raw = looked_up["rate"]
    return smoothed.combine_first(raw).fillna(rate_global)


In [200]:
def predict_with_fallback(df_feat: pd.DataFrame) -> tuple[pd.Series, pd.Series]:
    """Predict with baseline B, falling back to baseline A and then the global rate.

    Returns a pair ``(preds, source)`` indexed like ``df_feat``: ``preds`` holds
    the float predictions and ``source`` records which level supplied each one
    ("B" for the (group, dow, hour) table, "A" for the (group, hour) table,
    "GLOBAL" for ``rate_global``).
    """

    def _smoothed_or_raw(rows: pd.DataFrame, table: pd.DataFrame, keys: list[str]) -> pd.Series:
        # Look the rows up in a rate table; prefer the smoothed rate.
        joined = rows.join(table, on=keys)
        return joined["rate_smoothed"].combine_first(joined["rate"])

    # Level 1: the most specific table.
    preds = _smoothed_or_raw(df_feat, tbl_B_idx, [GROUP, DOW, HOUR])
    source = pd.Series("B", index=df_feat.index, dtype=object)

    # Level 2: rows that table B could not serve are retried against table A.
    unresolved = preds.isna()
    if unresolved.any():
        preds.loc[unresolved] = _smoothed_or_raw(
            df_feat.loc[unresolved], tbl_A_idx, [GROUP, HOUR]
        )
        source.loc[unresolved] = "A"

    # Level 3: whatever is still missing gets the global rate.
    unresolved = preds.isna()
    if unresolved.any():
        preds.loc[unresolved] = rate_global
        source.loc[unresolved] = "GLOBAL"

    return preds.astype(float), source

In [201]:
# Features and labels of the held-out month.
feature_cols = [GROUP, DATE, HOUR, DOW]
valid_X = valid.loc[:, feature_cols].copy()
valid_y = valid.loc[:, "target"]


In [202]:
# Validation predictions for both baselines, clipped to the probability range.
pA_valid = predict_A(valid_X).clip(lower=0.0, upper=1.0)
# predict_with_fallback also returns the source tags; only the predictions
# are needed here.
pB_valid = predict_with_fallback(valid_X)[0].clip(lower=0.0, upper=1.0)

In [150]:
# Overall ranking quality (ROC AUC) and calibration (Brier score) on the
# validation month, baseline A first, then baseline B.
for label, probs in (("A", pA_valid), ("B", pB_valid)):
    print(f"AUC {label}:", roc_auc_score(valid_y, probs))
    print(f"Brier {label}:", brier_score_loss(valid_y, probs))

AUC A: 0.7878585627007095
Brier A: 0.1537974626394816
AUC B: 0.8325313305187022
Brier B: 0.1413057350677002


In [203]:
# Per-group ROC AUC for both baselines on the validation month. Groups whose
# validation targets contain fewer than two distinct values are reported as NaN.
group_rows = []
for group_name, group_df in valid.groupby(GROUP):
    n_rows = len(group_df)
    if group_df["target"].nunique() >= 2:
        rows = group_df.index
        auc_a = roc_auc_score(valid_y.loc[rows], pA_valid.loc[rows])
        auc_b = roc_auc_score(valid_y.loc[rows], pB_valid.loc[rows])
    else:
        auc_a = auc_b = np.nan
    group_rows.append((group_name, n_rows, auc_a, auc_b))

group_metrics = pd.DataFrame(group_rows, columns=["group", "n", "auc_A", "auc_B"])
print(group_metrics.head())

  group    n     auc_A     auc_B
0     A  721  0.669597  0.715590
1     B  721  0.797966  0.868869
2     C  721  0.691979  0.786030
3     D  721  0.661273  0.777762
4     E  721  0.850581  0.871968


In [237]:
# Predict the inference rows with the B -> A -> global fallback chain.
infer_X = df_infer[[GROUP, DATE, HOUR, DOW]].copy()
raw_infer, source_tags = predict_with_fallback(infer_X)

# Clip to [0, 1] and keep three decimals for the submission.
p_infer = raw_infer.clip(lower=0.0, upper=1.0).round(3)
assert p_infer.between(0.0, 1.0).all()

# Share of predictions supplied by each fallback level.
fallback_share = source_tags.value_counts(normalize=True).sort_index()
print("Fallback usage:", fallback_share.to_dict())


Fallback usage: {'B': 1.0}


In [238]:
# Predictions keyed by (group, date, hour), in inference-row order.
pred_df = df_infer[KEY].copy()
pred_df["pred"] = p_infer.values

# Attach the predictions to the template's keys, keeping the template's row
# order. Only the key columns of df_mal enter the merge, so the result no
# longer depends on the template carrying its own "pred" column (which pandas
# would otherwise suffix to pred_x / pred_y). validate="one_to_one" makes the
# merge fail loudly if either side has duplicate keys.
submission = df_mal[KEY].merge(
    pred_df, on=KEY, how="left", sort=False, validate="one_to_one"
)
assert submission["pred"].notna().all()
assert submission["pred"].between(0.0, 1.0).all()
assert list(submission.columns) == KEY + ["pred"]

# Create the directory that is actually written to. Previously a relative
# "outputs" folder was created while the file went to this absolute path.
out_dir = Path("/Users/hodanielkhuu/vscode/avinor/outputs")
out_dir.mkdir(parents=True, exist_ok=True)
submission.to_csv(out_dir / "preds_baseline.csv", index=False)
print("Saved outputs/preds_baseline.csv")


Saved outputs/preds_baseline.csv


### PSEUDO CODE

# Konstanter og nøkler

In [207]:
# Column names used throughout; KEY identifies one row (group, date, hour).
GROUP, DATE, HOUR = "airport_group", "date", "hour"
KEY = [GROUP, DATE, HOUR]
DOW = "dow"  # 0=Mon..6=Sun

# Laste data og grunnsjekk

In [109]:
df_train  <- read_csv("training_data.csv")
df_infer  <- read_csv("inference_data_oct2025.csv")
df_mal    <- read_csv("preds_mal.csv")

assert columns_exist(df_train,  KEY + ["target"])
assert columns_exist(df_infer,  KEY)
assert columns_exist(df_mal,    KEY + ["pred"])

assert unique_keys(df_train, KEY)     # no dupes
assert unique_keys(df_mal,   KEY)     # no dupes

# parse date
df_train[DATE] <- to_datetime(df_train[DATE])
df_infer[DATE] <- to_datetime(df_infer[DATE])

# derive weekday
df_train[DOW] <- weekday(df_train[DATE])   # 0..6
df_infer[DOW] <- weekday(df_infer[DATE])

# quick shapes to log
log_shape(df_train, df_infer, df_mal)


NameError: name 'read_csv' is not defined

# Definere tids-splitt

In [None]:
last_month_in_train <- max_month(df_train[DATE])       # e.g. "2025-09"
mask_val  <- month(df_train[DATE]) == last_month_in_train
mask_hist <- month(df_train[DATE])  < last_month_in_train

train_hist <- df_train[mask_hist]   # for å beregne rater
valid_mon  <- df_train[mask_val]    # siste måned – for evaluering

assert not_empty(train_hist)
assert not_empty(valid_mon)


# Beregn rater kun fra train-hist

In [None]:
rate_global <- mean(train_hist["target"])

tbl_A <- groupby(train_hist, [GROUP, HOUR]) 
         |> agg(
             count = n(),
             sum1  = sum(target),
             rate  = sum1 / count
         )


In [None]:
# Valgfri justering mot global rate
alpha = 20   # justert etter datastørrelse
tbl_A["rate_smoothed"] = (sum1 + alpha*rate_global) / (count + alpha)


In [None]:
# Lage baseline for B: rate per GROUP, DOW, HOUR
tbl_B <- groupby(train_hist, [GROUP, DOW, HOUR])
         |> agg(
             count = n(),
             sum1  = sum(target),
             rate  = sum1 / count
         )

tbl_B["rate_smoothed"] = (sum1 + alpha*rate_global) / (count + alpha)  # valgfri


# Lage prediksjoner


In [None]:
valid_X <- select(valid_mon, [GROUP, DATE, HOUR, DOW])
valid_y <- valid_mon["target"]


# Slå opp prediksjoner: baseline A

In [None]:
pred_A <- left_join(valid_X, tbl_A, on=[GROUP,HOUR])
pA     <- if_not_null(pred_A["rate_smoothed"], pred_A["rate"], fallback=NULL)

# Fallbacks:
# if pA is NULL → try rate_global
pA <- fillna(pA, rate_global)


In [None]:
# Lage prediksjoner for baseline B
pred_B <- left_join(valid_X, tbl_B, on=[GROUP,DOW,HOUR])
pB     <- choose(
           if_not_null(pred_B["rate_smoothed"], pred_B["rate"]),
           # fallback 1: bruk (g,h)
           lookup(tbl_A, [GROUP,HOUR], valid_X) -> rate_smoothed or rate,
           # fallback 2: global
           rate_global
         )



In [None]:
# klipp og logg
pA <- clip(pA, 0, 1)
pB <- clip(pB, 0, 1)

log_summary_stats(pA, pB)  # min/max/mean


# Evaluere baseline på valideringsmåned

In [None]:
auc_A   <- roc_auc(valid_y, pA)
brier_A <- brier_score(valid_y, pA)

auc_B   <- roc_auc(valid_y, pB)
brier_B <- brier_score(valid_y, pB)

# Per-gruppe AUC – viktig for fairness/robusthet
by_group_metrics <- for each g in unique(valid_X[GROUP]):
    idx = (valid_X[GROUP] == g)
    return {
      group: g,
      n: sum(idx),
      auc_A_g: roc_auc(valid_y[idx], pA[idx]),
      auc_B_g: roc_auc(valid_y[idx], pB[idx]),
      brier_A_g: brier_score(valid_y[idx], pA[idx]),
      brier_B_g: brier_score(valid_y[idx], pB[idx]),
    }

print_table(auc_A, brier_A, auc_B, brier_B)
print_table(by_group_metrics)


# Bruke baseline til inference (fremtid)

In [110]:
infer_X <- select(df_infer, [GROUP, DATE, HOUR, DOW])


NameError: name 'infer_X' is not defined

In [None]:
p_infer <- choose(
  lookup(tbl_B, [GROUP,DOW,HOUR]) -> rate_smoothed or rate,
  lookup(tbl_A, [GROUP,HOUR])     -> rate_smoothed or rate,
  rate_global
)

p_infer <- clip(p_infer, 0, 1)
p_infer <- round(p_infer, 3)


In [None]:
sub <- left_join(df_mal[[GROUP,DATE,HOUR]], 
                 data_frame([GROUP,DATE,HOUR,p_infer]),
                 on=KEY)

rename sub[p_infer] -> sub["pred"]

assert not_null(sub["pred"])
assert all_between(sub["pred"], 0, 1)
assert columns_equal(sub.columns, ["airport_group","date","hour","pred"])


In [None]:
#Output submission
write_csv(sub, "preds_baseline.csv")
