# Konstanter og nøkler

In [112]:
KEY = ["airport_group", "date", "hour"]
GROUP = "airport_group"
DATE = "date"
HOUR = "hour"
DOW  = "dow"   # 0=Mon..6=Sun


In [117]:
#Importere pakker
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, brier_score_loss

In [121]:
base = Path("/Users/hodanielkhuu/Downloads/202509_Datasett")
KEY = ["airport_group", "date", "hour"]

df_train = pd.read_csv(base / "training_data.csv", parse_dates=["date"])
df_infer = pd.read_csv(base / "inference_data_oct2025.csv", parse_dates=["date"])
df_mal = pd.read_csv(base / "preds_mal.csv")

In [122]:
df_train["dow"] = df_train["date"].dt.dayofweek
df_infer["dow"] = df_infer["date"].dt.dayofweek

last_month = df_train["date"].dt.to_period("M").max()
mask_val = df_train["date"].dt.to_period("M") == last_month
train_hist = df_train.loc[~mask_val]
valid = df_train.loc[mask_val]



# Laste data og grunnsjekk

In [109]:
df_train  <- read_csv("training_data.csv")
df_infer  <- read_csv("inference_data_oct2025.csv")
df_mal    <- read_csv("preds_mal.csv")

assert columns_exist(df_train,  KEY + ["target"])
assert columns_exist(df_infer,  KEY)
assert columns_exist(df_mal,    KEY + ["pred"])

assert unique_keys(df_train, KEY)     # no dupes
assert unique_keys(df_mal,   KEY)     # no dupes

# parse date
df_train[DATE] <- to_datetime(df_train[DATE])
df_infer[DATE] <- to_datetime(df_infer[DATE])

# derive weekday
df_train[DOW] <- weekday(df_train[DATE])   # 0..6
df_infer[DOW] <- weekday(df_infer[DATE])

# quick shapes to log
log_shape(df_train, df_infer, df_mal)


NameError: name 'read_csv' is not defined

# Definere tids-splitt

In [None]:
last_month_in_train <- max_month(df_train[DATE])       # e.g. "2025-09"
mask_val  <- month(df_train[DATE]) == last_month_in_train
mask_hist <- month(df_train[DATE])  < last_month_in_train

train_hist <- df_train[mask_hist]   # for å beregne rater
valid_mon  <- df_train[mask_val]    # siste måned – for evaluering

assert not_empty(train_hist)
assert not_empty(valid_mon)


# Beregn rater kun fra train-hist

In [None]:
rate_global <- mean(train_hist["target"])

tbl_A <- groupby(train_hist, [GROUP, HOUR]) 
         |> agg(
             count = n(),
             sum1  = sum(target),
             rate  = sum1 / count
         )


In [None]:
# Valgfri justering mot global rate
alpha = 20   # justert etter datastørrelse
tbl_A["rate_smoothed"] = (sum1 + alpha*rate_global) / (count + alpha)


In [None]:
# Lage baseline for B: rate per GROUP, DOW, HOUR
tbl_B <- groupby(train_hist, [GROUP, DOW, HOUR])
         |> agg(
             count = n(),
             sum1  = sum(target),
             rate  = sum1 / count
         )

tbl_B["rate_smoothed"] = (sum1 + alpha*rate_global) / (count + alpha)  # valgfri


# Lage predikasjoner


In [None]:
valid_X <- select(valid_mon, [GROUP, DATE, HOUR, DOW])
valid_y <- valid_mon["target"]


# Slå opp predikasjoenr: A baseline 

In [None]:
pred_A <- left_join(valid_X, tbl_A, on=[GROUP,HOUR])
pA     <- if_not_null(pred_A["rate_smoothed"], pred_A["rate"], fallback=NULL)

# Fallbacks:
# if pA is NULL → try rate_global
pA <- fillna(pA, rate_global)


In [None]:
# lage predikasjoenr for baseline B
pred_B <- left_join(valid_X, tbl_B, on=[GROUP,DOW,HOUR])
pB     <- choose(
           if_not_null(pred_B["rate_smoothed"], pred_B["rate"]),
           # fallback 1: bruk (g,h)
           lookup(tbl_A, [GROUP,HOUR], valid_X) -> rate_smoothed or rate,
           # fallback 2: global
           rate_global
         )



In [None]:
# klipp og logg
pA <- clip(pA, 0, 1)
pB <- clip(pB, 0, 1)

log_summary_stats(pA, pB)  # min/max/mean


# Evaluere baseline på valideringsmåned

In [None]:
auc_A   <- roc_auc(valid_y, pA)
brier_A <- brier_score(valid_y, pA)

auc_B   <- roc_auc(valid_y, pB)
brier_B <- brier_score(valid_y, pB)

# Per-gruppe AUC – viktig for fairness/robusthet
by_group_metrics <- for each g in unique(valid_X[GROUP]):
    idx = (valid_X[GROUP] == g)
    return {
      group: g,
      n: sum(idx),
      auc_A_g: roc_auc(valid_y[idx], pA[idx]),
      auc_B_g: roc_auc(valid_y[idx], pB[idx]),
      brier_A_g: brier_score(valid_y[idx], pA[idx]),
      brier_B_g: brier_score(valid_y[idx], pB[idx]),
    }

print_table(auc_A, brier_A, auc_B, brier_B)
print_table(by_group_metrics)


# Bruke baseline til inference (fremtid)

In [110]:
infer_X <- select(df_infer, [GROUP, DATE, HOUR, DOW])


NameError: name 'infer_X' is not defined

In [None]:
p_infer <- choose(
  lookup(tbl_B, [GROUP,DOW,HOUR]) -> rate_smoothed or rate,
  lookup(tbl_A, [GROUP,HOUR])     -> rate_smoothed or rate,
  rate_global
)

p_infer <- clip(p_infer, 0, 1)
p_infer <- round(p_infer, 3)


In [None]:
sub <- left_join(df_mal[[GROUP,DATE,HOUR]], 
                 data_frame([GROUP,DATE,HOUR,p_infer]),
                 on=KEY)

rename sub[p_infer] -> sub["pred"]

assert not_null(sub["pred"])
assert all_between(sub["pred"], 0, 1)
assert columns_equal(sub.columns, ["airport_group","date","hour","pred"])


In [None]:
#Output submission
write_csv(sub, "preds_baseline.csv")
