In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Read the target series from its CSV file into the DataFrame `y`
target_csv_path = '../Detrended DAta/TargetCutto50MostImpFeatures_DF.csv'
y = pd.read_csv(target_csv_path)

# Preview the first rows as the cell output
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import Parallel, delayed
import multiprocessing

# Force the 'spawn' start method for multiprocessing (set for Windows)
multiprocessing.set_start_method('spawn', force=True)

# `y` is the DataFrame loaded in the previous cell.
# Coerce the target column to numeric; entries that cannot be parsed become NaN.
y['PM10_Combined_Trend_Residual'] = pd.to_numeric(
    y['PM10_Combined_Trend_Residual'],
    errors='coerce',
)

# Discard the rows whose target value is missing
# (forward-filling the gaps would be the alternative to dropping them).
y = y.dropna(subset=['PM10_Combined_Trend_Residual'])

# Dataset length and the size of each split
n = len(y)
train_size_initial = int(n * 0.7)  # 70% of the rows reserved for training
test_size = int(n * 0.1)           # 10% of the rows for testing
validate_size = int(n * 0.1)       # 10% of the rows for validation

# One-row tables holding the MSE of every ARMA order with 1 <= p, q <= 10,
# kept separately for the test window and the validation window.
arma_order_labels = [f"p={p}_q={q}" for p in range(1, 11) for q in range(1, 11)]
results_test = pd.DataFrame(np.zeros((1, 100)), columns=arma_order_labels)
results_validate = pd.DataFrame(np.zeros((1, 100)), columns=arma_order_labels)

# Window boundaries. The initial training window is half of the 70% training
# share (about 35% of all rows); the test window follows it directly and the
# validation window follows the test window.
train_size_current = int(train_size_initial * 0.5)
testing_start = train_size_current
testing_end = testing_start + test_size
validation_start = testing_end
validation_end = validation_start + validate_size

# Slice the three consecutive windows out of `y`
training = y[:train_size_current]
testing = y[testing_start:testing_end]
validation = y[validation_start:validation_end]

# Function to fit ARMA model and compute MSE
def fit_arma(p, q, training_data, testing, validation):
    """Fit an ARMA(p, q) model and score it on the test and validation windows.

    The model is estimated once on ``training_data``. Its parameters are then
    held fixed while the observed test values, and afterwards the observed
    validation values, are appended to it (``refit=False``). Every prediction
    is therefore a one-step-ahead forecast made from all observations that
    precede it, and each score is the mean squared error over the whole window.

    The three frames must be consecutive slices of the same series:
    ``testing`` directly follows ``training_data`` and ``validation`` directly
    follows ``testing``.

    Parameters
    ----------
    p, q : int
        Autoregressive and moving-average orders; the model order is (p, 0, q).
    training_data, testing, validation : pandas.DataFrame
        Frames containing the column ``'PM10_Combined_Trend_Residual'``.

    Returns
    -------
    tuple of (float, float)
        ``(mse_test, mse_validate)``. A score is NaN when its window is empty,
        and both are NaN when the fit fails with ``numpy.linalg.LinAlgError``.
    """
    target = 'PM10_Combined_Trend_Residual'

    # Plain float arrays keep the positions 0..n-1 unambiguous for
    # append()/predict(), whatever index the frames carry after row drops.
    train_values = training_data[target].to_numpy(dtype=float)
    test_values = testing[target].to_numpy(dtype=float)
    validate_values = validation[target].to_numpy(dtype=float)

    try:
        # Fit ARMA model with (p, 0, q) order (AR=p, differencing=0, MA=q)
        fitted = sm.tsa.ARIMA(train_values, order=(p, 0, q)).fit()

        test_start = len(train_values)
        mse_test, fitted = _one_step_mse(fitted, test_values, test_start)

        # The validation window starts right after the (already appended)
        # test window, so its forecasts also use the observed test values.
        validate_start = test_start + len(test_values)
        mse_validate, _ = _one_step_mse(fitted, validate_values, validate_start)

    except np.linalg.LinAlgError:
        # Best-effort grid search: an order that cannot be estimated is
        # reported as NaN instead of aborting the remaining orders.
        mse_test = np.nan
        mse_validate = np.nan

    return mse_test, mse_validate


def _one_step_mse(fitted, new_values, start):
    """Append ``new_values`` without refitting and score one-step-ahead predictions.

    ``start`` is the position of the first new observation in the extended
    series. Returns ``(mse, extended_results)``; for an empty window the MSE is
    NaN and the results object is returned unchanged.
    """
    if len(new_values) == 0:
        return np.nan, fitted

    extended = fitted.append(new_values, refit=False)
    # In-sample, non-dynamic prediction over the appended span: each value is
    # predicted from the observations before it.
    predicted = np.asarray(
        extended.predict(start=start, end=start + len(new_values) - 1)
    )
    return float(np.mean((new_values - predicted) ** 2)), extended

# Evaluate every (p, q) order with 1 <= p, q <= 10 through joblib's threading backend
arma_orders = [(p, q) for p in range(1, 11) for q in range(1, 11)]
results = Parallel(n_jobs=-1, backend='threading')(
    delayed(fit_arma)(p, q, training, testing, validation) for p, q in arma_orders
)

# Parallel returns the scores in submission order, so they pair up with the grid
for (p_value, q_value), (mse_test, mse_validate) in zip(arma_orders, results):
    order_label = f"p={p_value}_q={q_value}"
    results_test.loc[0, order_label] = mse_test
    results_validate.loc[0, order_label] = mse_validate

# Stack both score rows into a single labelled table
final_results = pd.concat(
    [results_test, results_validate],
    axis=0,
    ignore_index=True,
)
final_results.index = ["Test MSE", "Validation MSE"]

# Print the complete table as plain text
print(final_results.to_string(index=True))


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA pa

KeyboardInterrupt: 

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
