In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../../../4 - Data/04_WorkingDatasets/NormalData/Target_Additive.csv')



# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [4]:
y_filtered = y[['Datum', 'PM10_Combined_Trend_Residual']]
y_filtered.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [5]:
# Print the column names to check the exact name of the 'Datum/ Zeit' column
print(y.columns)


Index(['Datum', 'PM10_Combined_Trend_Residual'], dtype='object')


In [6]:
# Print the column names to check for the correct timestamp column name
print(y_filtered.columns)


Index(['Datum', 'PM10_Combined_Trend_Residual'], dtype='object')


In [9]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import Parallel, delayed

# Assuming 'y_filtered' is your DataFrame
# Ensure the 'PM10 (Stundenmittelwerte)' column is numeric
y_filtered['PM10_Combined_Trend_Residual'] = pd.to_numeric(y_filtered['PM10_Combined_Trend_Residual'], errors='coerce')

# Optionally, handle missing values (e.g., by filling or dropping)
y_filtered = y_filtered.dropna(subset=['PM10_Combined_Trend_Residual'])  # Drop rows with NaN values in the relevant column

# Convert the 'Datum/Zeit' column to datetime format for proper time series handling
y_filtered['Datum'] = pd.to_datetime(y_filtered['Datum'], errors='coerce')

# Define the length of the dataset and split sizes
n = len(y_filtered)  # Length of the DataFrame y_filtered
train_size_initial = int(n * 0.7)  # 70% of the data for training
test_size = int(n * 0.1)   # 10% of the data for testing
validate_size = int(n * 0.1)  # 10% of the data for validation

# Adjusting the number of combinations to avoid the mismatch
# Generate a manageable number of columns (e.g., based on a subset of combinations)
columns = [f"p={p}_q={q}_P={P}_Q={Q}_s={s}" 
           for p in range(1, 4)  # Limit to 3 values for p
           for q in range(1, 4)  # Limit to 3 values for q
           for P in range(0, 2)  # Limit to 2 values for P
           for Q in range(0, 2)  # Limit to 2 values for Q
           for s in [24]]        # Limit to 1 seasonal period (just 24)

# Now assign the reduced list to the results DataFrame
results_test = pd.DataFrame(np.zeros((1, len(columns))))  # Adjust to match the number of columns
results_test.columns = columns

results_validate = pd.DataFrame(np.zeros((1, len(columns))))  # Adjust to match the number of columns
results_validate.columns = columns

# Use the first 50% of the data for the initial training set (50% of the train_size)
train_size_current = int(train_size_initial * 0.5)  # Initial 50% of the train set
testing_start = train_size_current
testing_end = testing_start + test_size

# Expanding the training set by adding the next 10% in each iteration
training = y_filtered[:train_size_current]  # Initial training set
testing = y_filtered[testing_start:testing_end]  # Testing set
validation_start = testing_end
validation_end = validation_start + validate_size
validation = y_filtered[validation_start:validation_end]  # Validation set

# Function to fit SARIMA model and compute MSE
def fit_sarima(p, q, P, Q, s, training_data, testing, validation):
    # Fit SARIMA model with seasonal order (P, D, Q, s)
    seasonal_order = (P, 1, Q, s)  # D=1 for seasonal differencing
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 1, q), seasonal_order=seasonal_order)
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate) ** 2).mean()
    
    return mse_test, mse_validate

# Use joblib's Parallel to run the fit_sarima function in parallel for different p, q, P, Q, and s combinations
results = Parallel(n_jobs=-1)(delayed(fit_sarima)(p, q, P, Q, s, y_filtered, testing, validation) 
                              for p in range(1, 4)  # Limit p to 3
                              for q in range(1, 4)  # Limit q to 3
                              for P in range(0, 2)  # Limit P to 2
                              for Q in range(0, 2)  # Limit Q to 2
                              for s in [24])       # Limit to one seasonal period (24)

# Store the results in the DataFrame
for idx, (mse_test, mse_validate) in enumerate(results):
    p_value = idx // 36 + 1  # Calculate p from the index
    q_value = (idx % 36) // 12 + 1  # Calculate q from the index
    P_value = (idx % 12) // 4  # Calculate P from the index
    Q_value = (idx % 4) // 2  # Calculate Q from the index
    s_value = [24][idx % 1]  # Seasonal period (just 24)
    results_test.loc[0, f"p={p_value}_q={q_value}_P={P_value}_Q={Q_value}_s={s_value}"] = mse_test
    results_validate.loc[0, f"p={p_value}_q={q_value}_P={P_value}_Q={Q_value}_s={s_value}"] = mse_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=True)
final_results.index = ["Test MSE", "Validation MSE"]

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


                p=1_q=1_P=0_Q=0_s=24  p=1_q=1_P=0_Q=1_s=24  p=1_q=1_P=1_Q=0_s=24  p=1_q=1_P=1_Q=1_s=24  p=1_q=2_P=0_Q=0_s=24  p=1_q=2_P=0_Q=1_s=24  p=1_q=2_P=1_Q=0_s=24  p=1_q=2_P=1_Q=1_s=24  p=1_q=3_P=0_Q=0_s=24  p=1_q=3_P=0_Q=1_s=24  p=1_q=3_P=1_Q=0_s=24  p=1_q=3_P=1_Q=1_s=24  p=2_q=1_P=0_Q=0_s=24  p=2_q=1_P=0_Q=1_s=24  p=2_q=1_P=1_Q=0_s=24  p=2_q=1_P=1_Q=1_s=24  p=2_q=2_P=0_Q=0_s=24  p=2_q=2_P=0_Q=1_s=24  p=2_q=2_P=1_Q=0_s=24  p=2_q=2_P=1_Q=1_s=24  p=2_q=3_P=0_Q=0_s=24  p=2_q=3_P=0_Q=1_s=24  p=2_q=3_P=1_Q=0_s=24  p=2_q=3_P=1_Q=1_s=24  p=3_q=1_P=0_Q=0_s=24  p=3_q=1_P=0_Q=1_s=24  p=3_q=1_P=1_Q=0_s=24  p=3_q=1_P=1_Q=1_s=24  p=3_q=2_P=0_Q=0_s=24  p=3_q=2_P=0_Q=1_s=24  p=3_q=2_P=1_Q=0_s=24  p=3_q=2_P=1_Q=1_s=24  p=3_q=3_P=0_Q=0_s=24  p=3_q=3_P=0_Q=1_s=24  p=3_q=3_P=1_Q=0_s=24  p=3_q=3_P=1_Q=1_s=24  p=1_q=1_P=2_Q=0_s=24  p=1_q=1_P=2_Q=1_s=24  p=1_q=2_P=2_Q=0_s=24  p=1_q=2_P=2_Q=1_s=24  p=1_q=3_P=2_Q=0_s=24  p=1_q=3_P=2_Q=1_s=24
Test MSE                   36.172646             35.687380   