In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../Detrended DAta/targetforlagged.csv')

# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-02 00:00:00+00:00,15.083582
1,2022-01-02 01:00:00+00:00,14.547155
2,2022-01-02 02:00:00+00:00,13.151571
3,2022-01-02 03:00:00+00:00,12.394376
4,2022-01-02 04:00:00+00:00,11.062399


AR Model


In [None]:
from joblib import Parallel, delayed

# Define results DataFrame to store MSE and MAE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for test set
results_test.columns = np.arange(1, 25)
results_test.index = ["Test MSE", "Test MAE"]

results_validate = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for validation set
results_validate.columns = np.arange(1, 25)
results_validate.index = ["Validation MSE", "Validation MAE"]

# Function to fit ARIMA model and compute MSE and MAE
def fit_arima(p, training_data, testing, validation):
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 0, 0))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    # Calculate Mean Absolute Error for the test set
    mae_test = np.abs(testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values).mean()
    
    # Calculate Mean Absolute Error for the validation set
    mae_validate = np.abs(validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values).mean()
    
    return mse_test, mae_test, mse_validate, mae_validate

# Define testing and validation sets
testing = y.iloc[-30:]  # Assuming the last 30 samples are for testing
validation = y.iloc[-60:-30]  # Assuming the samples before the last 30 are for validation

# Use joblib's Parallel to run the fit_arima function in parallel for different p values
results = Parallel(n_jobs=-1)(delayed(fit_arima)(p, y, testing, validation) for p in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mae_test, mse_validate, mae_validate) in enumerate(results):
    p_value = idx + 1
    results_test.loc["Test MSE", p_value] = mse_test
    results_test.loc["Test MAE", p_value] = mae_test
    results_validate.loc["Validation MSE", p_value] = mse_validate
    results_validate.loc["Validation MAE", p_value] = mae_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=False)

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


#runtime 6m 30s


                       1          2          3          4          5          6          7          8          9          10         11         12         13         14         15         16         17         18         19         20         21         22         23         24
Test MSE        22.847826  22.475001  23.528166  23.390813  23.245194  21.775633  20.280387  19.680660  19.129398  18.580836  18.093211  18.685466  19.180346  19.736472  18.557692  16.546761  15.906076  17.521215  17.071323  17.704342  18.700912  19.063445  19.260794  19.772443
Test MAE         4.779940   4.740781   4.850584   4.836405   4.821327   4.666437   4.503375   4.436289   4.373717   4.310549   4.253612   4.322669   4.379537   4.442575   4.307864   4.067771   3.988242   4.185835   4.131746   4.207653   4.324455   4.366170   4.388712   4.446621
Validation MSE  94.210900  93.452257  95.587269  95.310215  95.016038  92.020410  88.918584  87.657887  86.490125  85.319203  84.270594  85.543242  86.598422  87.7

In [6]:
# Define results DataFrame to store MSE and MAE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for test set
results_test.columns = [f"q={q}" for q in range(1, 25)]  # Column names for MA order
results_test.index = ["Test MSE", "Test MAE"]

results_validate = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for validation set
results_validate.columns = [f"q={q}" for q in range(1, 25)]  # Column names for MA order
results_validate.index = ["Validation MSE", "Validation MAE"]

# Function to fit MA model and compute MSE and MAE
def fit_ma(q, training_data, testing, validation):
    # Fit MA model with (0, 0, q) order (AR=0, differencing=0, MA=q)
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(0, 0, q))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    # Calculate Mean Absolute Error for the test set
    mae_test = np.abs(testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values).mean()
    
    # Calculate Mean Absolute Error for the validation set
    mae_validate = np.abs(validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values).mean()
    
    return mse_test, mae_test, mse_validate, mae_validate

# Use joblib's Parallel to run the fit_ma function in parallel for different q values
results = Parallel(n_jobs=-1)(delayed(fit_ma)(q, y, testing, validation) for q in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mae_test, mse_validate, mae_validate) in enumerate(results):
    q_value = idx + 1
    results_test.loc["Test MSE", f"q={q_value}"] = mse_test
    results_test.loc["Test MAE", f"q={q_value}"] = mae_test
    results_validate.loc["Validation MSE", f"q={q_value}"] = mse_validate
    results_validate.loc["Validation MAE", f"q={q_value}"] = mae_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=False)

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


#runtime 3m 20s

KeyboardInterrupt: 