In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../../../4 - Data/04_WorkingDatasets/Top50CombLagged/TargetOutliersTreated.csv')

# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-02 01:00:00+00:00,14.547155
1,2022-01-02 02:00:00+00:00,13.151571
2,2022-01-02 03:00:00+00:00,12.394376
3,2022-01-02 04:00:00+00:00,11.062399
4,2022-01-02 05:00:00+00:00,11.068747


AR Model


In [3]:
from joblib import Parallel, delayed

# Define results DataFrame to store MSE and MAE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for test set
results_test.columns = np.arange(1, 25)
results_test.index = ["Test MSE", "Test MAE"]

results_validate = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for validation set
results_validate.columns = np.arange(1, 25)
results_validate.index = ["Validation MSE", "Validation MAE"]

# Function to fit ARIMA model and compute MSE and MAE
def fit_arima(p, training_data, testing, validation):
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 0, 0))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    # Calculate Mean Absolute Error for the test set
    mae_test = np.abs(testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values).mean()
    
    # Calculate Mean Absolute Error for the validation set
    mae_validate = np.abs(validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values).mean()
    
    return mse_test, mae_test, mse_validate, mae_validate

# Define testing and validation sets
testing = y.iloc[-30:]  # Assuming the last 30 samples are for testing
validation = y.iloc[-60:-30]  # Assuming the samples before the last 30 are for validation

# Use joblib's Parallel to run the fit_arima function in parallel for different p values
results = Parallel(n_jobs=-1)(delayed(fit_arima)(p, y, testing, validation) for p in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mae_test, mse_validate, mae_validate) in enumerate(results):
    p_value = idx + 1
    results_test.loc["Test MSE", p_value] = mse_test
    results_test.loc["Test MAE", p_value] = mae_test
    results_validate.loc["Validation MSE", p_value] = mse_validate
    results_validate.loc["Validation MAE", p_value] = mae_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=False)

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


#runtime 6m 30s


                       1          2           3           4           5           6          7          8          9          10         11         12         13         14         15         16         17         18         19         20         21         22         23         24
Test MSE        23.610150  25.344963   27.608961   27.484515   27.397555   26.039529  24.528414  23.892048  23.273386  22.678452  22.038736  22.704180  23.110757  23.797401  22.333779  20.417598  19.651002  21.343744  20.413527  21.331782  22.228794  22.602560  22.836933  23.574719
Test MAE         4.859028   5.034378    5.254423    5.242568    5.234267    5.102894   4.952617   4.887949   4.824250   4.762190   4.694543   4.764890   4.807365   4.878258   4.725863   4.518584   4.432945   4.619929   4.518133   4.618634   4.714742   4.754215   4.778800   4.855380
Validation MSE  95.752443  99.214910  103.646918  103.405667  103.236927  100.584537  97.592803  96.319294  95.073027  93.866642  92.560432  93.918977 

In [4]:
# Define results DataFrame to store MSE and MAE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for test set
results_test.columns = [f"q={q}" for q in range(1, 25)]  # Column names for MA order
results_test.index = ["Test MSE", "Test MAE"]

results_validate = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for validation set
results_validate.columns = [f"q={q}" for q in range(1, 25)]  # Column names for MA order
results_validate.index = ["Validation MSE", "Validation MAE"]

# Function to fit MA model and compute MSE and MAE
def fit_ma(q, training_data, testing, validation):
    # Fit MA model with (0, 0, q) order (AR=0, differencing=0, MA=q)
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(0, 0, q))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    # Calculate Mean Absolute Error for the test set
    mae_test = np.abs(testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values).mean()
    
    # Calculate Mean Absolute Error for the validation set
    mae_validate = np.abs(validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values).mean()
    
    return mse_test, mae_test, mse_validate, mae_validate

# Use joblib's Parallel to run the fit_ma function in parallel for different q values
results = Parallel(n_jobs=-1)(delayed(fit_ma)(q, y, testing, validation) for q in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mae_test, mse_validate, mae_validate) in enumerate(results):
    q_value = idx + 1
    results_test.loc["Test MSE", f"q={q_value}"] = mse_test
    results_test.loc["Test MAE", f"q={q_value}"] = mae_test
    results_validate.loc["Validation MSE", f"q={q_value}"] = mse_validate
    results_validate.loc["Validation MAE", f"q={q_value}"] = mae_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=False)

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


#runtime 3m 20s

                      q=1        q=2         q=3        q=4        q=5        q=6         q=7         q=8         q=9        q=10        q=11        q=12        q=13        q=14        q=15        q=16        q=17        q=18        q=19        q=20        q=21        q=22        q=23        q=24
Test MSE        16.968083  19.673985   27.166265  25.410833  21.151858  25.445134   25.745085   28.331804   29.454524   31.688697   31.997400   32.252652   30.705523   28.730880   27.957452   31.367804   30.927625   30.305285   29.971586   29.459931   28.061412   28.258334   27.682260   26.342156
Test MAE         4.119233   4.435537    5.212127   5.040916   4.599115   5.044317    5.073962    5.322763    5.427202    5.629271    5.656624    5.679142    5.541256    5.360119    5.287481    5.600697    5.561261    5.505024    5.474631    5.427700    5.297302    5.315857    5.261393    5.132461
Validation MSE  81.821492  87.643799  102.787494  99.345195  90.733340  99.413006  100.005031  105.043085 