In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, median_absolute_error
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.api import VAR
from scipy.stats import norm
from sklearn.metrics import r2_score

In [44]:
# Load the poll-of-polls data (comma-separated, which is the pandas default).
df = pd.read_csv("../data/v2_data_juli.csv")

In [45]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [46]:
# Parse the month column and snap every date to the last day of its month, so
# the index lines up with the month-end ("ME") cutoffs and forecast dates used
# further down. MonthEnd(0) leaves dates that already are month-ends untouched.
df["Mnd"] = pd.to_datetime(df["Mnd"]) + pd.offsets.MonthEnd(0)

In [47]:
# Order the rows chronologically and use the month column as the index.
df = df.sort_values("Mnd").set_index("Mnd")

In [48]:
# Lag orders to evaluate: 1 and 2 (the upper bound of range() is exclusive).
candidate_lags = range(1, 3)
# Alternative grid kept for reference:
#candidate_lags= [2,3,4]

In [49]:
cutoffs = pd.date_range(
    start="2020-09-30",   # first end-of-month
    end="2021-08-31",     # last end-of-month
    freq="ME"              # month-end
).to_list()

In [50]:
# Display the generated list of month-end cutoff dates.
cutoffs

[Timestamp('2020-09-30 00:00:00'),
 Timestamp('2020-10-31 00:00:00'),
 Timestamp('2020-11-30 00:00:00'),
 Timestamp('2020-12-31 00:00:00'),
 Timestamp('2021-01-31 00:00:00'),
 Timestamp('2021-02-28 00:00:00'),
 Timestamp('2021-03-31 00:00:00'),
 Timestamp('2021-04-30 00:00:00'),
 Timestamp('2021-05-31 00:00:00'),
 Timestamp('2021-06-30 00:00:00'),
 Timestamp('2021-07-31 00:00:00'),
 Timestamp('2021-08-31 00:00:00')]

In [51]:
# Election result 2021, one single-element list per party so that the dict
# converts directly into a one-row DataFrame.
valg = {
    "Ap": [26.3],
    "Høyre": [20.4],
    "Frp": [11.6],
    "SV": [7.6],
    "Sp": [13.5],
    "KrF": [3.8],
    "Venstre": [4.6],
    "MDG": [3.9],
    "Rødt": [4.7],
    "Andre": [3.6],
}

# Election result 2017 (alternative target, kept for reference)
#valg = {
#    "Ap": [27.4],
#    "Høyre": [25.0],
#    "Frp": [15.2],
#    "SV": [6.0],
#    "Sp": [10.3],
#    "KrF": [4.2],
#    "Venstre": [4.4],
#    "MDG": [3.2],
#    "Rødt": [2.4],
#    "Andre": [1.8],
#}

# One-row frame holding the election result, plus the empty accumulators
# that the evaluation loop appends to.
valg_df = pd.DataFrame.from_dict(valg)
results, finals = [], []

In [52]:
# Flatten the one-row election result frame into a 1-D array of party shares.
y_true = valg_df.to_numpy().ravel()

In [55]:
# Walk-forward evaluation.
# For every cutoff month and every candidate lag order: fit a VAR model with
# exogenous regressors on the data available up to (and including) the cutoff,
# forecast up to the election month, and compare the election-month forecast
# with the actual election result. The last observed poll values at the cutoff
# serve as a naive baseline.
#
# Each entry appended to `results` is
#   (cutoff, lag, steps, model_error, baseline_error)
# where both errors are *median absolute errors* against `y_true`.

# Month-end date whose forecast is compared against the election result.
ELECTION_MONTH = pd.Timestamp("2021-09-30")

party_cols = ['Ap', 'Hoyre', 'Frp', 'SV', 'SP', 'KrF', 'Venstre', 'MDG', 'Rodt', 'Andre']
exog_cols = [
    'Ap_skandale_lag6',
    'Hoyre_skandale_lag6',
    'Frp_skandale_lag6',
    'Ap_reg_lag6',
    'Hoyre_reg_lag6',
    'ledighet', 'ledig_trend3', 'ledig_trend6', 'styringsrente',
    'styringsrente_trend3', 'styringsrente_trend6',
]

# Column selection does not depend on the cutoff, so do it once.
endog_all = df[party_cols]
exog_all = df[exog_cols]

# Start from empty accumulators so that re-running this cell does not append
# duplicate rows to an earlier run.
results = []
finals = []

for cutoff in cutoffs:
    # The first out-of-sample period is the month AFTER the cutoff; the last
    # one is the election month.
    forecast_dates = pd.date_range(
        start=cutoff + pd.offsets.MonthEnd(1), end=ELECTION_MONTH, freq='ME'
    )
    steps = len(forecast_dates)
    if steps == 0:
        raise ValueError(f"cutoff {cutoff} is not before the election month {ELECTION_MONTH}")

    # Training window: everything observed up to and including the cutoff.
    in_sample = endog_all.index <= cutoff
    endog_train = endog_all.loc[in_sample]
    exog_train = exog_all.loc[in_sample]

    # Future regressors must start strictly after the cutoff; the cutoff row
    # itself is already part of the training window.
    exog_future = exog_all.loc[exog_all.index > cutoff].iloc[:steps]
    if len(exog_future) != steps:
        raise ValueError(
            f"need {steps} rows of future exogenous data after {cutoff}, "
            f"found {len(exog_future)}"
        )

    # Naive baseline: the last poll values known at the cutoff.
    last_actual_value = endog_train.iloc[-1]
    baseline_error = median_absolute_error(y_true, last_actual_value.to_numpy())

    for lag in candidate_lags:
        model = VARMAX(endog=endog_train, exog=exog_train, order=(lag, 0), trend='n')

        # cov_type must be the lowercase string 'none' to skip the
        # covariance computation; 'None' is not a recognised option.
        model_fitted = model.fit(disp=True, method='powell', cov_type='none', maxiter=1000)

        forecast = model_fitted.get_forecast(steps=steps, exog=exog_future).predicted_mean

        # Re-label by position. Passing the forecast DataFrame itself together
        # with index= would reindex (align on labels) instead of relabelling
        # and could silently produce NaN rows.
        forecast_df = pd.DataFrame(
            np.asarray(forecast), index=forecast_dates, columns=party_cols
        )

        estimate_row = forecast_df.loc[ELECTION_MONTH]
        finals.append(estimate_row)

        model_error = median_absolute_error(y_true, estimate_row.to_numpy())
        results.append((cutoff, lag, steps, model_error, baseline_error))


  self._init_dates(dates, freq)


KeyboardInterrupt: 

In [None]:
# Inspect the baseline from the most recent loop iteration as a flat array.
last_actual_value.to_numpy().ravel()

In [None]:
# Display the election result vector that the forecasts are scored against.
y_true

In [None]:
# Display the election-month forecast rows collected by the evaluation loop.
finals

In [None]:
# Tabulate the evaluation tuples collected in `results`.
# Note: the "MSE" and "mse_pp" columns hold median absolute errors.
results_df = pd.DataFrame(
    data=results,
    columns=["cutoff", "lag", "steps", "MSE", "mse_pp"],
)

In [None]:
# Display the evaluation table.
results_df

In [None]:
# For each cutoff, keep the row with the smallest value in the "MSE" column.
best_row_labels = results_df.groupby("cutoff")["MSE"].idxmin()
best_lags = results_df.loc[best_row_labels]

In [None]:
# Display the lowest-error row per cutoff.
best_lags

In [None]:
# Error of the best model per cutoff versus forecast horizon, next to the
# naive baseline (last poll values known at the cutoff).
# The "MSE" / "mse_pp" columns contain median absolute errors (that is what the
# evaluation loop stores), so the figure is labelled accordingly.
df_sorted = best_lags.sort_values("steps", ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_sorted["steps"], df_sorted["MSE"], marker='o', label="Model")
ax.plot(df_sorted["steps"], df_sorted["mse_pp"], marker='x', label="Poll-of-polls at cutoff")

ax.set_xlabel("Forecast steps (reversed)")
ax.set_ylabel("Median absolute error")
ax.set_title("Median absolute error vs forecast steps (best lag per cutoff)")
ax.set_xticks(df_sorted["steps"])  # one tick per forecast horizon
ax.invert_xaxis()                  # longest horizon on the left
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Keep only the results for one fixed lag order.
# NOTE(review): 5 is outside candidate_lags = range(1, 3); if `lag` in the
# results only takes values from candidate_lags this selects no rows.
# Confirm the intended lag.
filtered = results_df[results_df["lag"] == 5]

In [None]:
# Display the filtered results.
filtered

In [None]:
# Error of the selected-lag model versus forecast horizon, next to the naive
# baseline. The x-axis shows forecast steps (not lag), and the "MSE" / "mse_pp"
# columns contain median absolute errors, so the figure is labelled accordingly.
df_sorted = filtered.sort_values("steps", ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_sorted["steps"], df_sorted["MSE"], marker='o', label="Model")
ax.plot(df_sorted["steps"], df_sorted["mse_pp"], marker='x', label="Poll-of-polls at cutoff")

ax.set_xlabel("Forecast steps (reversed)")
ax.set_ylabel("Median absolute error")
ax.set_title("Median absolute error vs forecast steps (fixed lag)")
ax.set_xticks(df_sorted["steps"])  # one tick per forecast horizon
ax.invert_xaxis()                  # longest horizon on the left
ax.legend()
ax.grid(True)
plt.show()