# SARIMA model

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAXResults, SARIMAX


from SARIMAX import *
%load_ext autoreload
%autoreload 2

## Load data

In [2]:
# Relative location of the input CSV (resolved against the kernel's working directory)
file_path = "../../data/fulldata.csv"

In [3]:

# Feature switches handed to the project's DataLoader; only the Fourier option is on
loader_options = dict(
    use_temp_pca=False,
    use_wind_pca=False,
    use_fourier=True,
    use_oil=False,
    use_gas=False,
)
loader = DataLoader(file_path, **loader_options)

# preprocess_data() yields the target series and the exogenous feature frame
# that the rest of the notebook works with
loader.load_data()
spot_price, exog_data = loader.preprocess_data()

## Running LASSO

In [4]:
# Standardize the exogenous features; exog_scaled stays None when there are none
exog_scaled = None
if exog_data.empty:
    print("No exogenous variables selected. Skipping scaling step.")
else:
    scaler = StandardScaler()
    exog_scaled = scaler.fit_transform(exog_data)


In [5]:
# Step 3: Fit LASSO regression (alpha is chosen by 5-fold cross-validation)
# The scaling step leaves exog_scaled as None when no exogenous variables are
# selected; stop with a clear message instead of an opaque scikit-learn error.
if exog_scaled is None:
    raise ValueError(
        "LASSO feature selection needs at least one exogenous variable, "
        "but exog_data is empty."
    )

print("Fitting LASSO regression for feature selection...")
lasso = LassoCV(cv=5, max_iter=5000)
lasso.fit(exog_scaled, spot_price)

Fitting LASSO regression for feature selection...


## SARIMA model

### Preparing data for SARIMA

In [6]:
# Step 4: keep only the features whose LASSO coefficient is non-zero
nonzero_coef_mask = lasso.coef_ != 0
selected_features = exog_data.columns[nonzero_coef_mask]
print("Selected Features:", selected_features)

# Restrict the (unscaled) exogenous frame to the surviving columns
exog_selected = exog_data[selected_features]

Selected Features: Index(['day_sin', 'month_sin'], dtype='object')


In [7]:
# Training window; label-based slicing includes both endpoints
train_start = '2017-10-27'
train_end = '2024-07-31'
training_window = slice(train_start, train_end)

# Target series for the window, placed on a regular hourly grid
train_data = spot_price.loc[training_window].asfreq('h')
# Selected exogenous features at exactly the same timestamps
train_exog = exog_selected.loc[train_data.index].asfreq('h')

# Quick check of how many observations go into training
print("Training data shape:", train_data.shape)
#print("Training exogenous features shape:", train_exog.shape)


Training data shape: (59280,)


### Running AUTO ARIMA on subset of the data

In [8]:
# Define the date range for training
#train_start = '2023-10-27'
#train_end = '2024-07-31'

# Filter the spot_price and exog_features based on the date range
#train_data = spot_price.loc[train_start:train_end]
#train_exog = exog_selected.loc[train_start:train_end]

# Split into train and test (optional)
#train_y, test_y = train_test_split(train_data, test_size=0.1)


# Define exogenous variables and ensure alignment with the target variable
#exog_features = exog_data.loc[train_data.index]

# Run auto_arima with exogenous variables
#auto_model = pm.auto_arima(
#    train_data,                 # Target variable (spot_price)
#    exogenous=exog_features,    # Exogenous predictors
#    seasonal=True,              # Allows seasonal terms
#    m=12,                       # Seasonal period in observations (12 = yearly cycle on monthly data; a daily cycle on hourly data would be 24)
#    stepwise=True,              # Stepwise search for efficiency
#    trace=True,                 # Print progress
#    suppress_warnings=True,     # Ignore warnings
#    error_action='ignore',      # Ignore invalid models
#    max_order=(5, 2)            # Limit AR, MA, and seasonal orders for faster search
#)

# Print the best model summary
#print(auto_model.summary())



### Training SARIMA model

In [9]:
# Model orders fixed after the auto_arima search
order = (5, 1, 0)               # non-seasonal part: (p, d, q)
seasonal_order = (2, 0, 0, 12)  # seasonal part: (P, D, Q, m)
# NOTE(review): the training series is on an hourly grid, so m=12 is a 12-hour cycle — confirm this is intended rather than 24

In [10]:
# Specify the SARIMAX model on the training window with the selected exogenous features
sarimax_settings = dict(
    exog=train_exog,
    order=order,
    seasonal_order=seasonal_order,
    enforce_stationarity=True,
    enforce_invertibility=True,
)
sarimax_spec = SARIMAX(train_data, **sarimax_settings)

# Estimate the parameters; from here on `model` is the fitted results object
# that the saving and forecasting cells operate on
model = sarimax_spec.fit(disp=False)

### Saving model for future use

In [11]:
# Persist the fitted results so they can be reloaded without re-estimating
# NOTE(review): absolute, machine-specific path — consider a project-relative location
sarima_model_path = '/Users/johan/Documents/04 Uni/09 Asset Pricing Data/SARMIA(X)/sarima_model.pkl'
model.save(sarima_model_path)

In [12]:
# Restore the fitted results from disk
# NOTE(review): absolute, machine-specific path; loading unpickles the file, so only load files you trust
sarima_model_path = '/Users/johan/Documents/04 Uni/09 Asset Pricing Data/SARMIA(X)/sarima_model.pkl'
model = SARIMAXResults.load(sarima_model_path)

### Forecasting

In [13]:
# Rolling day-ahead forecast: for each day of the test window, extend the
# fitted model with the observations that have become available (parameters
# are not re-estimated) and predict the following 24 hours.
#
# This cell rebinds `model` to the extended results object, so it is not
# idempotent: reload the fitted model before running it a second time.

# define the forecast start and end
forecast_start = pd.Timestamp('2024-08-01 00:00:00')
forecast_end = pd.Timestamp('2024-11-29 23:00:00')  # Shorter period for demonstration

# one timestamp (midnight) per day we want to forecast
daily_forecast_start_times = pd.date_range(start=forecast_start, end=forecast_end, freq='D', inclusive='both')

# observed prices up to the end of the test window; used to extend the model day by day
actual_data = spot_price.loc[:forecast_end]

# exogenous data for the full test period plus the 24 hours before it,
# so the 24-hour shift below has values for the first forecast day
full_exog_index = pd.date_range(start=forecast_start - pd.Timedelta(hours=24),
                                end=forecast_end, freq='h')

exog_features_reduced = exog_selected.reindex(full_exog_index)
# row t of the shifted frame holds the exogenous values observed at t - 24h
exog_features_shifted = exog_features_reduced.shift(24)

# exogenous columns the loaded model was fitted with
orig_exog_columns = model.data.orig_exog.columns
all_forecasts = []
# timestamp of the last observation currently contained in the model
dat_start = model.data.orig_endog.index[-1]

for start_time in daily_forecast_start_times:
    print(f"Forecasting for {start_time}")

    # forecast 24 hours from start_time
    end_time = start_time + pd.Timedelta(hours=23)

    # last hour before the forecast day begins (end_time - 24h == start_time - 1h)
    dat_end = end_time - pd.Timedelta(hours=24)
    new_dat = actual_data.loc[(dat_start + pd.Timedelta(hours=1)):dat_end]

    # if any new data is available, append it together with its exogenous values
    if not new_dat.empty:
        new_exog = exog_features_reduced.loc[new_dat.index, orig_exog_columns]
        model = model.append(new_dat, exog=new_exog, refit=False)
        # track what was actually appended rather than assuming it reached dat_end
        dat_start = new_dat.index[-1]

    # get_forecast() always continues from the model's last observation. If that is
    # not the hour before start_time (cell re-run on an already extended model, or a
    # gap in actual_data) the forecast would silently cover the wrong dates.
    if dat_start != dat_end:
        raise ValueError(
            f"Model data ends at {dat_start}, but forecasting {start_time} requires "
            f"data through {dat_end}. Reload the fitted model before re-running this "
            f"cell and check actual_data for missing hours."
        )

    # exogenous values from 24 hours earlier are used as predictors for the forecast day
    # NOTE(review): the model is fitted and extended with contemporaneous exogenous
    # values but forecast with 24h-lagged ones — confirm this mismatch is intended
    exog_forecast = exog_features_shifted.loc[start_time:end_time, orig_exog_columns]

    # Get a 24-step forecast
    forecast_result = model.get_forecast(steps=24, exog=exog_forecast)
    day_forecasts = forecast_result.predicted_mean
    all_forecasts.append(day_forecasts)

# combine all daily forecasts into one series
forecasts = pd.concat(all_forecasts)

Forecasting for 2024-08-01 00:00:00
Forecasting for 2024-08-02 00:00:00
Forecasting for 2024-08-03 00:00:00
Forecasting for 2024-08-04 00:00:00
Forecasting for 2024-08-05 00:00:00
Forecasting for 2024-08-06 00:00:00
Forecasting for 2024-08-07 00:00:00
Forecasting for 2024-08-08 00:00:00
Forecasting for 2024-08-09 00:00:00
Forecasting for 2024-08-10 00:00:00
Forecasting for 2024-08-11 00:00:00
Forecasting for 2024-08-12 00:00:00
Forecasting for 2024-08-13 00:00:00
Forecasting for 2024-08-14 00:00:00
Forecasting for 2024-08-15 00:00:00
Forecasting for 2024-08-16 00:00:00
Forecasting for 2024-08-17 00:00:00
Forecasting for 2024-08-18 00:00:00
Forecasting for 2024-08-19 00:00:00
Forecasting for 2024-08-20 00:00:00
Forecasting for 2024-08-21 00:00:00
Forecasting for 2024-08-22 00:00:00
Forecasting for 2024-08-23 00:00:00
Forecasting for 2024-08-24 00:00:00
Forecasting for 2024-08-25 00:00:00
Forecasting for 2024-08-26 00:00:00
Forecasting for 2024-08-27 00:00:00
Forecasting for 2024-08-28 0

In [14]:
# Evaluate the forecasts against the observed prices of the forecast window.
# The slice is bounded on both sides so that observations after forecast_end
# cannot make the two series differ in length.
actuals = spot_price.loc[forecast_start:forecast_end]

if len(actuals) != len(forecasts):
    raise ValueError(
        f"Cannot evaluate: {len(actuals)} actual values vs {len(forecasts)} forecasts "
        f"between {forecast_start} and {forecast_end}."
    )

mae = mean_absolute_error(actuals, forecasts)
mse = mean_squared_error(actuals, forecasts)
rmse = np.sqrt(mse)  # RMSE is reported in the same units as the target
print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}")


MAE: 222.4029, RMSE: 318.5006


In [17]:
# Write the forecast values to disk as a NumPy array
forecast_path = Path('output/forecast_sarima.npy')
# np.save does not create missing directories, so make sure the target folder exists
forecast_path.parent.mkdir(parents=True, exist_ok=True)

forecast_array = forecasts.values
np.save(forecast_path, forecast_array)