In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.mlemodel import MLEModel
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor
import matplotlib.pyplot as plt
import statsmodels
from statsmodels.tsa.api import VAR
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.vector_ar.vecm import coint_johansen, VECM
from statsmodels.tsa.statespace.mlemodel import MLEModel
from statsmodels.tsa.stattools import adfuller
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Laste poll-of-polls data
url = "https://raw.githubusercontent.com/jensmorten/onesixtynine/main/data/pollofpolls_master.csv"
df = pd.read_csv(url)

In [3]:
# Convert to datetime and set the date to the end of the month
df["Mnd"] = pd.to_datetime(df["Mnd"])

In [4]:
# Sort values and set index
df = df.sort_values("Mnd")
df.set_index("Mnd", inplace=True)
df.index.to_period('M').to_timestamp('M')

DatetimeIndex(['2008-01-31', '2008-02-29', '2008-03-31', '2008-04-30',
               '2008-05-31', '2008-06-30', '2008-07-31', '2008-08-31',
               '2008-09-30', '2008-10-31',
               ...
               '2025-01-31', '2025-02-28', '2025-03-31', '2025-04-30',
               '2025-05-31', '2025-06-30', '2025-07-31', '2025-08-31',
               '2025-09-30', '2025-10-31'],
              dtype='datetime64[ns]', name='Mnd', length=214, freq='ME')

In [5]:
print(df.index[-5:])  # check last few dates

DatetimeIndex(['2025-06-30', '2025-07-31', '2025-08-31', '2025-09-30',
               '2025-10-31'],
              dtype='datetime64[ns]', name='Mnd', freq=None)


In [6]:
df=df[['Ap', 'Hoyre', 'Frp', 'SV', 'SP', 'KrF', 'Venstre', 'MDG', 'Rodt','Andre']]
df.dropna(inplace=True)

In [13]:
import numpy as np
import pandas as pd
from statsmodels.tsa.api import VAR
from xgboost import XGBRegressor

def hybrid_var_xgb_with_var_intervals(
    df,
    n_months=12,
    var_lags=6,
    lags_ML=12,
    random_state=123
):
    """
    Hybrid VAR + XGBoost-residuals.
    - Uncertainty from VAR forecast_interval
    - ML only adjusts the mean (and shifts the whole interval).

    Returns:
        forecast_df, forecast_lower_df, forecast_upper_df
    """

    # -------------------------------
    # 1. Fit VAR and get intervals
    # -------------------------------
    var_model = VAR(df)
    var_res = var_model.fit(maxlags=6)

    mean_var, lower_var, upper_var = var_res.forecast_interval(
        df.values[-var_res.k_ar:], steps=n_months
    )

    mean_var_df = pd.DataFrame(mean_var, columns=df.columns)
    lower_var_df = pd.DataFrame(lower_var, columns=df.columns)
    upper_var_df = pd.DataFrame(upper_var, columns=df.columns)

    # -------------------------------
    # 2. Build residual dataset for ML
    # -------------------------------
    # VAR fitted values & residuals
    var_fitted = var_res.fittedvalues
    aligned_true = df.iloc[var_res.k_ar:]          # same index as fittedvalues
    resid = aligned_true.values - var_fitted.values
    resid_df = pd.DataFrame(resid, index=aligned_true.index, columns=df.columns)

    X, y = [], []

    for t in range(lags_ML, len(df)):
        # lags_ML months of all parties as features
        X.append(df.iloc[t-lags_ML:t].values.flatten())
        # residual at time t (aligned with residual_df via index)
        y.append(resid_df.iloc[t - var_res.k_ar].values)

    X = np.array(X)
    y = np.array(y)

    # -------------------------------
    # 3. Fit XGBoost on residuals
    # -------------------------------
    split = int(0.8 * len(X))
    X_tr, X_va = X[:split], X[split:]
    y_tr, y_va = y[:split], y[split:]
    xgb = XGBRegressor(
        n_estimators=1000,
        max_depth=5,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=random_state,
        eval_metric="rmse"   # <-- THIS IS REQUIRED
    )
    
    xgb.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    # -------------------------------
    # 4. Predict future residuals recursively
    # -------------------------------
    current_win = df.values[-lags_ML:].copy()
    pred_resid_list = []

    for i in range(n_months):
        Xi = current_win.flatten().reshape(1, -1)
        pred = xgb.predict(Xi)[0]
        pred_resid_list.append(pred)

        # advance window using VAR mean forecast (not true values)
        var_step_mean = mean_var_df.iloc[i].values
        current_win = np.vstack([current_win[1:], var_step_mean])

    ml_resid_df = pd.DataFrame(pred_resid_list, columns=df.columns)

    # -------------------------------
    # 5. Build hybrid forecast + intervals
    # -------------------------------
    # VAR intervals, shifted by ML residual prediction (same width)
    forecast_index = pd.date_range(
        start=df.index[-1] + pd.offsets.MonthEnd(1),
        periods=n_months,
        freq="M"
    )

    forecast_df = mean_var_df + ml_resid_df
    forecast_lower_df = lower_var_df + ml_resid_df
    forecast_upper_df = upper_var_df + ml_resid_df

    forecast_df.index = forecast_index
    forecast_lower_df.index = forecast_index
    forecast_upper_df.index = forecast_index

    return forecast_df, forecast_lower_df, forecast_upper_df


In [14]:
forecast_df, forecast_lower_df, forecast_upper_df = hybrid_var_xgb_with_var_intervals(
    df,
    n_months=12,
    var_lags=6,
    lags_ML=6
)

  self._init_dates(dates, freq)


TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
# --- Define colors for consistency ---
colors = {
    'Ap': '#FF0000',        # Red
    'Hoyre': '#0000FF',     # Blue
    'Frp': '#00008B',       # Dark Blue
    'SV': '#FF6347',        # Light Red (Tomato)
    'SP': '#006400',        # Dark Green
    'KrF': '#FFD700',       # Yellow (Gold)
    'Venstre': '#ADD8E6',   # Light Blue
    'MDG': '#008000',       # Green
    'Rodt': '#8B0000',      # Dark Red
    'Andre': '#808080'      # Gray
}


# --- Plot ---
plt.figure(figsize=(14, 7))

months_back = 12
df_recent = df.iloc[-months_back:]  # last 12 months actual

for party, color in colors.items():

    # Actual observed data
    plt.plot(
        df_recent.index,
        df_recent[party],
        marker="o",
        color=color,
        label=f"{party}"
    )

    # Forecast line (bootstrap mean)
    plt.plot(
        forecast_df.index,
        forecast_df[party],
        linestyle="dashed",
        color=color
    )

    # Connect last actual point to first forecast
    plt.plot(
        [df_recent.index[-1], forecast_df.index[0]],
        [df_recent[party].iloc[-1], forecast_df[party].iloc[0]],
        linestyle="dashed",
        color=color
    )

    # Confidence interval (bootstrap)
    plt.fill_between(
        forecast_df.index,
        forecast_lower_df[party],
        forecast_upper_df[party],
        color=color,
        alpha=0.15
    )

# --- Vertical monthly guides ---
dates = pd.date_range(start=df_recent.index[0], end=forecast_df.index[-1], freq="MS")
for date in dates:
    plt.axvline(date, color="gray", linestyle="dotted", alpha=0.3)

# --- Horizontal percentage lines ---
for percent in range(0, 45, 5):
    plt.axhline(percent, color="gray", linestyle="dotted", alpha=0.3)

# --- Final formatting ---
plt.xlim(df_recent.index[0], forecast_df.index[-1])
plt.ylim(0, 40)
plt.xlabel("Tid")
plt.ylabel("Prosent oppslutning")
plt.title("OneSixtyNine")
plt.legend(loc="upper left", ncol=2)
plt.grid(alpha=0.2)
plt.tight_layout()
plt.show()


In [None]:
from itertools import product
import numpy as np
import pandas as pd

def tune_hybrid_params(
    df,
    horizon=6,
    lags_var_list=[5,6,7],
    lags_ml_list=[10,12,14],
    depth_list=[4,5,6],
    lr_list=[0.03, 0.05],
    n_estimators_list=[250,750],
):

    results = []

    for l_var, l_ml, depth, lr, n_est in product(
        lags_var_list, lags_ml_list, depth_list, lr_list,  n_estimators_list
    ):
        err = []
        step = max(1, horizon * 6)
        for t in range(60, len(df) - horizon, step):
            train = df.iloc[:t]

            # --- VAR ---
            var_res = VAR(train).fit(maxlags=l_var)

            mean_var, _, _ = var_res.forecast_interval(
                train.values[-var_res.k_ar:], steps=horizon
            )

            var_pred = mean_var[-1]

            # --- Residual ML ---
            resid = train.iloc[var_res.k_ar:].values - var_res.fittedvalues.values

            X, y = [], []
            for i in range(l_ml, len(train)):
                if i - var_res.k_ar < 0:
                    continue
                X.append(train.iloc[i-l_ml:i].values.flatten())
                y.append(resid[i - var_res.k_ar])

            if len(X) < 50:
                continue

            X = np.array(X)
            y = np.array(y)

            xgb = XGBRegressor(
                n_estimators=n_est,
                max_depth=depth,
                learning_rate=lr,
                subsample=0.9,
                colsample_bytree=0.9,
                objective="reg:squarederror",
                verbosity=0
            )
            xgb.fit(X, y)

            # predict residual
            win = train.values[-l_ml:].copy()
            pred_resid = xgb.predict(win.flatten().reshape(1, -1))[0]

            hybrid_pred = var_pred + pred_resid
            true = df.iloc[t + horizon - 1].values

            err.append(np.mean(np.abs(true - hybrid_pred)))

        if err:
            results.append({
                "lags_var": l_var,
                "lags_ml": l_ml,
                "max_depth": depth,
                "learning_rate": lr,
                "estimators": lr,
                "MAE": np.mean(err)
            })

    return pd.DataFrame(results).sort_values("MAE")


In [None]:
tuning_results = tune_hybrid_params(df)
tuning_results.head(10)