In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from pandas.tseries.offsets import MonthEnd
from statsmodels.tsa.api import VAR
import matplotlib.ticker as mticker
from collections import Counter
from pandas.tseries.offsets import MonthEnd
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor

In [2]:
# --- Last inn data ---
url = "https://raw.githubusercontent.com/jensmorten/onesixtynine/main/data/pollofpolls_master.csv"
df = pd.read_csv(url, index_col="Mnd", parse_dates=True)
df = df.sort_index()
df.index = df.index.to_period('M').to_timestamp('M')  # månadsslutt

In [3]:
df

Unnamed: 0_level_0,Ap,Hoyre,Frp,SV,SP,KrF,Venstre,MDG,Rodt,Andre
Mnd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-01-31,29.3,17.2,23.9,7.4,6.1,6.4,6.5,0.0,1.3,0.0
2008-02-29,29.0,17.3,25.2,6.7,5.9,6.3,6.6,0.0,1.3,0.0
2008-03-31,28.7,18.1,25.2,7.0,5.6,6.1,6.3,0.0,1.1,0.0
2008-04-30,29.0,16.9,25.4,6.5,5.5,7.0,6.8,0.0,1.2,0.0
2008-05-31,28.9,17.8,25.9,6.7,5.7,6.2,6.2,0.0,1.4,0.0
...,...,...,...,...,...,...,...,...,...,...
2025-06-30,28.3,16.2,21.0,6.9,5.6,3.7,4.5,3.0,6.2,4.5
2025-07-31,27.7,14.8,21.5,8.2,6.3,3.2,4.7,3.5,5.9,4.1
2025-08-31,27.3,15.3,21.2,6.3,6.2,4.6,4.2,4.3,6.1,4.5
2025-09-30,27.1,14.4,21.0,6.0,5.9,4.7,4.3,6.2,6.0,4.4


In [4]:
elections = pd.DataFrame(
    {
        "Ap":      [35.4, 30.8, 27.4, 26.3, 28.0],
        "Hoyre":   [17.2, 26.8, 25.0, 20.4, 23.8],
        "Frp":     [22.9, 16.3, 15.2, 11.6, 14.6],
        "SV":      [6.2, 4.1, 6.0, 7.6, 5.6],
        "SP":      [6.2, 5.5, 10.3, 13.5, 5.6],
        "KrF":     [5.5, 5.6, 4.2, 3.8, 4.2],
        "Venstre": [3.9, 5.2, 4.4, 4.6, 3.7],
        "MDG":     [0.3, 2.8, 3.2, 3.9, 4.7],
        "Rodt":    [1.3, 1.1, 2.4, 4.7, 5.3],
        "Andre":   [1.1, 1.8, 1.9, 3.6, 4.5],
    },
    index=pd.to_datetime(
        ["2009-09-30", "2013-09-30", "2017-09-30", "2021-09-30", "2025-09-30"]
    )
)

elections.index.name = "date"

In [5]:
elections

Unnamed: 0_level_0,Ap,Hoyre,Frp,SV,SP,KrF,Venstre,MDG,Rodt,Andre
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2009-09-30,35.4,17.2,22.9,6.2,6.2,5.5,3.9,0.3,1.3,1.1
2013-09-30,30.8,26.8,16.3,4.1,5.5,5.6,5.2,2.8,1.1,1.8
2017-09-30,27.4,25.0,15.2,6.0,10.3,4.2,4.4,3.2,2.4,1.9
2021-09-30,26.3,20.4,11.6,7.6,13.5,3.8,4.6,3.9,4.7,3.6
2025-09-30,28.0,23.8,14.6,5.6,5.6,4.2,3.7,4.7,5.3,4.5


In [6]:
def compute_regime_weight(rs, low=0.8, high=1.2, gamma=1.0):
    """
    rs   = regime_strength
    low  = below this: ML off
    high = above this: ML fully on
    gamma = shape of ramp (1 = linear, >1 = concave, <1 = convex)
    """
    if rs <= low:
        return 0.0
    elif rs >= high:
        return 1.0
    else:
        x = (rs - low) / (high - low)     # in [0,1]
        return x**gamma

In [7]:
def hybrid_var_ml_forecast(
    df,
    n_months,
    var_lags,
    lags_ML,
    tau=3.0,
    vol_window=6,
    min_alpha=0.0,
    max_alpha=1.0,
    ml_params=None,
    rs_low=0.8,
    rs_high=1.2,
    rs_gamma=1.0,
):
    """
    Hybrid VAR + ML with:
      - adaptive α per party
      - regime gating based on volatility ratio
      - horizon decay

    Regime strength = recent_vol / long_run_vol
    """
    model = VAR(df)
    var_res = model.fit(maxlags=var_lags, method="ols", trend="n")

    mean_var, lower_var, upper_var = var_res.forecast_interval(
        var_res.endog, steps=n_months
    )

    # VAR residuals (in-sample)
    fitted = var_res.fittedvalues
    true = df.iloc[var_res.k_ar:]
    resid = true.values - fitted.values

    # Build ML dataset
    X, y = [], []
    for i in range(lags_ML, len(df)):
        if i - var_res.k_ar < 0:
            continue
        X.append(df.iloc[i-lags_ML:i].values.flatten())
        y.append(resid[i - var_res.k_ar])

    X = np.asarray(X)
    y = np.asarray(y)

    n_parties = df.shape[1]
    ml_resid_forecast = np.zeros((n_months, n_parties))

    if len(X) < 100:
        return mean_var, lower_var, upper_var

    # Pre-compute volatility regime indicators
    ddf = df.diff()

    recent_vol = (
        ddf.iloc[-vol_window:]
        .abs()
        .mean(axis=0)
        .values
    )

    long_run_vol = (
        ddf.abs()
        .mean(axis=0)
        .values
        + 1e-8
    )

    regime_strength = recent_vol / long_run_vol  # per party

    for j in range(n_parties):
        yj = y[:, j]

        # --- ML model ---
        if ml_params is None:
            ml_params = {
            "n_estimators": 500,
            "num_leaves": 16,
            "learning_rate": 0.01,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": 123,
            "verbose": -1,
        }

        model_ml = LGBMRegressor(**ml_params)
        model_ml.fit(X, yj)

        # --- adaptive α (in-sample calibration) ---
        y_hat_train = model_ml.predict(X)

        num = np.dot(yj, y_hat_train)
        den = np.dot(y_hat_train, y_hat_train) + 1e-8
        alpha_j = num / den
        alpha_j = float(np.clip(alpha_j, min_alpha, max_alpha))

        # --- regime-gated weight ---
        rs = regime_strength[j]
        regime_weight = compute_regime_weight(rs, low=rs_low, high=rs_high, gamma=rs_gamma)

        # --- forecasting ---
        win = df.values[-lags_ML:].copy()

        for t in range(n_months):
            r_raw = model_ml.predict(win.reshape(1, -1))[0]

            if tau is not None:
                time_decay = np.exp(-t / tau)
            else:
                time_decay = 1.0

            r = alpha_j * regime_weight * time_decay * r_raw

            ml_resid_forecast[t, j] = r

            # IMPORTANT: no feedback of corrected forecast
            win = np.vstack([win[1:], mean_var[t]])

    forecast = mean_var + ml_resid_forecast
    lower = lower_var + ml_resid_forecast
    upper = upper_var + ml_resid_forecast

    return forecast, lower, upper


In [8]:
def evaluate_on_elections_month_end(
    df,
    elections,
    var_lags,
    lags_ML,
    tau,
    vol_window,
    max_alpha,
    months_before_election=3,
    ml_params=None,
    
):
    errors = []

    for election_date, row in elections.iterrows():
        if election_date not in df.index:
            continue

        target_loc = df.index.get_loc(election_date)
        train_end = target_loc - months_before_election

        if train_end <= max(var_lags, lags_ML):
            continue

        train = df.iloc[:train_end]
        n_months = target_loc - train_end + 1

        forecast, _, _ = hybrid_var_ml_forecast(
            train,
            n_months=n_months,
            var_lags=var_lags,
            lags_ML=lags_ML,
            tau=tau,
            vol_window=vol_window,
            min_alpha=0.0,
            max_alpha=max_alpha,
        )

        y_pred = forecast[-1]
        y_true = row[df.columns].values

        errors.append(
            np.mean(np.abs(y_true - y_pred))
        )

    return np.mean(errors) if errors else np.nan


In [9]:
VAR_LAGS_GRID = [2, 3, 4, 5, 6]
ML_LAGS_GRID  = [6, 9, 12, 15]

In [10]:
TAU = 3.0
VOL_WINDOW = 6
MAX_ALPHA = 0.8

In [11]:
results = []

for var_lags in VAR_LAGS_GRID:
    for ml_lags in ML_LAGS_GRID:
        score = evaluate_on_elections_month_end(
            df,
            elections,
            var_lags=var_lags,
            lags_ML=ml_lags,
            tau=TAU,
            vol_window=VOL_WINDOW,
            max_alpha=MAX_ALPHA,
            months_before_election=3,
        )

        results.append({
            "var_lags": var_lags,
            "ml_lags": ml_lags,
            "MAE": score,
        })

structure_tuning = (
    pd.DataFrame(results)
    .dropna()
    .sort_values("MAE")
)

structure_tuning.head(10)


  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_v

Unnamed: 0,var_lags,ml_lags,MAE
6,3,12,1.774737
5,3,9,1.781421
7,3,15,1.785734
4,3,6,1.785862
14,5,12,2.255053
12,5,6,2.259436
10,4,12,2.25982
13,5,9,2.260601
15,5,15,2.260798
11,4,15,2.265051


In [12]:
ML_GRID = [
    # conservative (lower variance)
    dict(num_leaves=8,  learning_rate=0.03, n_estimators=200),

    # baseline (your current setup)
    dict(num_leaves=16, learning_rate=0.01, n_estimators=500),

    # slightly more expressive
    dict(num_leaves=32, learning_rate=0.01, n_estimators=800),

    # smooth but strong learner
    dict(num_leaves=16, learning_rate=0.005, n_estimators=1500),
]


In [13]:
ml_results = []

for i, params in enumerate(ML_GRID):
    score = evaluate_on_elections_month_end(
        df,
        elections,
        var_lags=4,
        lags_ML=12,
        tau=3.0,
        vol_window=6,
        max_alpha=0.8,
        months_before_election=3,
        ml_params=params,                # <-- pass through
    )

    ml_results.append({
        "model": i,
        "params": params,
        "MAE": score,
    })

ml_tuning = (
    pd.DataFrame(ml_results)
    .sort_values("MAE")
)
ml_tuning

  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))


Unnamed: 0,model,params,MAE
0,0,"{'num_leaves': 8, 'learning_rate': 0.03, 'n_es...",2.25982
1,1,"{'num_leaves': 16, 'learning_rate': 0.01, 'n_e...",2.25982
2,2,"{'num_leaves': 32, 'learning_rate': 0.01, 'n_e...",2.25982
3,3,"{'num_leaves': 16, 'learning_rate': 0.005, 'n_...",2.25982


In [14]:
pd.set_option("display.max_colwidth", None)
ml_tuning

Unnamed: 0,model,params,MAE
0,0,"{'num_leaves': 8, 'learning_rate': 0.03, 'n_estimators': 200}",2.25982
1,1,"{'num_leaves': 16, 'learning_rate': 0.01, 'n_estimators': 500}",2.25982
2,2,"{'num_leaves': 32, 'learning_rate': 0.01, 'n_estimators': 800}",2.25982
3,3,"{'num_leaves': 16, 'learning_rate': 0.005, 'n_estimators': 1500}",2.25982


In [15]:

def threshold_weight(y_true, threshold=4.0, sigma=0.75, max_weight=3.0):
    """
    Weight errors more heavily near the threshold.

    sigma controls width of the sensitive zone (~3–5%).
    """
    w = 1.0 + (max_weight - 1.0) * np.exp(
        -0.5 * ((y_true - threshold) / sigma) ** 2
    )
    return min(w, max_weight)

In [16]:
def election_threshold_loss(
    y_true,
    y_pred,
    threshold=4.0,
    sigma=0.75,
    max_weight=3.0,
):
    """
    Threshold-aware MAE for one election.
    """
    loss = 0.0

    for yt, yp in zip(y_true, y_pred):
        w = threshold_weight(
            yt,
            threshold=threshold,
            sigma=sigma,
            max_weight=max_weight
        )
        loss += w * abs(yt - yp)

    return loss / len(y_true)


In [17]:
def evaluate_on_elections_threshold_aware(
    df,
    elections,
    var_lags,
    lags_ML,
    tau,
    vol_window,
    max_alpha,
    months_before_election=3,
    threshold=4.0,
    sigma=0.75,
    max_weight=3.0,
    rs_low=0.2,
    rs_high=0.8,
):
    losses = []

    for election_date, row in elections.iterrows():
        if election_date not in df.index:
            continue

        loc = df.index.get_loc(election_date)
        train_end = loc - months_before_election
        if train_end <= max(var_lags, lags_ML):
            continue

        train = df.iloc[:train_end]
        n_months = loc - train_end + 1

        forecast, _, _ = hybrid_var_ml_forecast(
            train,
            n_months=n_months,
            var_lags=var_lags,
            lags_ML=lags_ML,
            tau=tau,
            vol_window=vol_window,
            min_alpha=0.0,
            max_alpha=max_alpha,
            rs_low=rs_low,
            rs_high=rs_high,
        )

        y_pred = forecast[-1]
        y_true = row[df.columns].values

        loss = election_threshold_loss(
            y_true,
            y_pred,
            threshold=threshold,
            sigma=sigma,
            max_weight=max_weight,
        )

        losses.append(loss)

    return np.mean(losses)


In [18]:
results = []

for tau in [2.0, 3.0, 4.0]:
    for max_alpha in [0.6, 0.8, 1.0]:
        score = evaluate_on_elections_threshold_aware(
            df,
            elections,
            var_lags=4,
            lags_ML=12,
            tau=tau,
            vol_window=6,
            max_alpha=max_alpha,
            months_before_election=3,
            threshold=4.0,
            sigma=0.75,
            max_weight=3.0,
        )
        results.append({
            "tau": tau,
            "max_alpha": max_alpha,
            "threshold_loss": score,
        })

tau_alpha_tuning = (
    pd.DataFrame(results)
    .sort_values("threshold_loss")
)
tau_alpha_tuning


  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))


Unnamed: 0,tau,max_alpha,threshold_loss
8,4.0,1.0,2.729207
7,4.0,0.8,2.733208
5,3.0,1.0,2.733632
4,3.0,0.8,2.736748
6,4.0,0.6,2.737209
2,2.0,1.0,2.739762
3,3.0,0.6,2.739863
1,2.0,0.8,2.741651
0,2.0,0.6,2.743541


In [21]:
RS_LOW_GRID  = [0.2, 0.4, 0.8]
RS_HIGH_GRID = [1.1, 1.4, 1.6]

In [22]:
results = []

for rs_low in RS_LOW_GRID:
    for rs_high in RS_HIGH_GRID:
        if rs_low >= 1.0 or rs_high <= 1.0:
            continue

        score = evaluate_on_elections_threshold_aware(
            df,
            elections,
            var_lags=4,
            lags_ML=12,
            tau=4.0,          # your election-mode tau
            vol_window=6,
            max_alpha=1.0,
            months_before_election=3,
            threshold=4.0,
            sigma=0.75,
            max_weight=3.0,
            rs_low=rs_low,
            rs_high=rs_high,
        )

        results.append({
            "rs_low": rs_low,
            "rs_high": rs_high,
            "threshold_loss": score,
        })

regime_tuning = (
    pd.DataFrame(results)
    .sort_values("threshold_loss")
)
regime_tuning


  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))


Unnamed: 0,rs_low,rs_high,threshold_loss
0,0.2,1.1,2.730926
3,0.4,1.1,2.731765
1,0.2,1.4,2.733087
2,0.2,1.6,2.733351
4,0.4,1.4,2.734107
5,0.4,1.6,2.734245
6,0.8,1.1,2.734363
8,0.8,1.6,2.736459
7,0.8,1.4,2.736967


In [23]:
results = []

for election_window in [2, 3, 4]:
    score = evaluate_on_elections_threshold_aware(
        df,
        elections,
        var_lags=4,
        lags_ML=12,
        tau=4.0,              # election-mode params
        vol_window=6,
        max_alpha=1.0,
        months_before_election=election_window,
        threshold=4.0,
        sigma=0.75,
        max_weight=3.0,
    )

    results.append({
        "election_window": election_window,
        "threshold_loss": score,
    })

pd.DataFrame(results).sort_values("threshold_loss")


  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))
  sigma = np.sqrt(self._forecast_vars(steps))


Unnamed: 0,election_window,threshold_loss
0,2,2.471003
1,3,2.729207
2,4,3.052913
