In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../Detrended DAta/TargetCutto50MostImpFeatures_DF.csv')

# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


AR model

In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import Parallel, delayed

# Assuming 'y' is your DataFrame
# Ensure the 'PM10_Combined_Trend_Residual' column is numeric
y['PM10_Combined_Trend_Residual'] = pd.to_numeric(y['PM10_Combined_Trend_Residual'], errors='coerce')

# Optionally, handle missing values (e.g., by filling or dropping)
y = y.dropna(subset=['PM10_Combined_Trend_Residual'])  # Drop rows with NaN values in the relevant column
# Or, if you prefer filling NaN values with the forward fill method:
# y['PM10_Combined_Trend_Residual'] = y['PM10_Combined_Trend_Residual'].fillna(method='ffill')

# Define the length of the dataset and split sizes
n = len(y)  # Length of the DataFrame y
train_size_initial = int(n * 0.7)  # 70% of the data for training
test_size = int(n * 0.1)   # 10% of the data for testing
validate_size = int(n * 0.1)  # 10% of the data for validation

# Define results DataFrame to store MSE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((1, 10)))  # MSE for test set
results_test.columns = np.arange(1, 11)

results_validate = pd.DataFrame(np.zeros((1, 10)))  # MSE for validation set
results_validate.columns = np.arange(1, 11)

# Use the first 50% of the data for the initial training set (50% of the train_size)
train_size_current = int(train_size_initial * 0.5)  # Initial 50% of the train set
testing_start = train_size_current
testing_end = testing_start + test_size

# Expanding the training set by adding the next 10% in each iteration
training = y[:train_size_current]  # Initial training set
testing = y[testing_start:testing_end]  # Testing set
validation_start = testing_end
validation_end = validation_start + validate_size
validation = y[validation_start:validation_end]  # Validation set

# Function to fit ARIMA model and compute MSE
def fit_arima(p, training_data, testing, validation):
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 0, 0))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    return mse_test, mse_validate

# Use joblib's Parallel to run the fit_arima function in parallel for different p values
results = Parallel(n_jobs=-1)(delayed(fit_arima)(p, y, testing, validation) for p in range(1, 11))

# Store the results in the DataFrame
for idx, (mse_test, mse_validate) in enumerate(results):
    p_value = idx + 1
    results_test.loc[0, p_value] = mse_test
    results_validate.loc[0, p_value] = mse_validate

# Print the results
print("Test Set MSE:")
print(results_test)

print("\nValidation Set MSE:")
print(results_validate)


Test Set MSE:
          1          2          3          4          5          6   \
0  83.520059  80.194799  82.802817  82.484601  82.148029  78.614943   

          7          8          9          10  
0  75.363332  73.478572  72.069796  70.744455  

Validation Set MSE:
          1          2          3          4          5          6   \
0  42.906973  40.533155  42.393327  42.165722  41.925179  39.412246   

          7          8          9          10  
0  37.119972  35.800774  34.819484  33.900142  


In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import Parallel, delayed

# Assuming 'y' is your DataFrame
# Ensure the 'PM10_Combined_Trend_Residual' column is numeric
y['PM10_Combined_Trend_Residual'] = pd.to_numeric(y['PM10_Combined_Trend_Residual'], errors='coerce')

# Optionally, handle missing values (e.g., by filling or dropping)
y = y.dropna(subset=['PM10_Combined_Trend_Residual'])  # Drop rows with NaN values in the relevant column
# Or, if you prefer filling NaN values with the forward fill method:
# y['PM10_Combined_Trend_Residual'] = y['PM10_Combined_Trend_Residual'].fillna(method='ffill')

# Define the length of the dataset and split sizes
n = len(y)  # Length of the DataFrame y
train_size_initial = int(n * 0.7)  # 70% of the data for training
test_size = int(n * 0.1)   # 10% of the data for testing
validate_size = int(n * 0.1)  # 10% of the data for validation

# Define results DataFrame to store MSE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((1, 10)))  # MSE for test set
results_test.columns = [f"q={q}" for q in range(1, 11)]  # Column names for MA order

results_validate = pd.DataFrame(np.zeros((1, 10)))  # MSE for validation set
results_validate.columns = [f"q={q}" for q in range(1, 11)]  # Column names for MA order

# Use the first 50% of the data for the initial training set (50% of the train_size)
train_size_current = int(train_size_initial * 0.5)  # Initial 50% of the train set
testing_start = train_size_current
testing_end = testing_start + test_size

# Expanding the training set by adding the next 10% in each iteration
training = y[:train_size_current]  # Initial training set
testing = y[testing_start:testing_end]  # Testing set
validation_start = testing_end
validation_end = validation_start + validate_size
validation = y[validation_start:validation_end]  # Validation set

# Function to fit MA model and compute MSE
def fit_ma(q, training_data, testing, validation):
    # Fit MA model with (0, 0, q) order (AR=0, differencing=0, MA=q)
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(0, 0, q))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    return mse_test, mse_validate

# Use joblib's Parallel to run the fit_ma function in parallel for different q values
results = Parallel(n_jobs=-1)(delayed(fit_ma)(q, y, testing, validation) for q in range(1, 11))

# Store the results in the DataFrame
for idx, (mse_test, mse_validate) in enumerate(results):
    q_value = idx + 1
    results_test.loc[0, f"q={q_value}"] = mse_test
    results_validate.loc[0, f"q={q_value}"] = mse_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=True)
final_results.index = ["Test MSE", "Validation MSE"]

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


                      q=1        q=2        q=3        q=4        q=5        q=6        q=7        q=8        q=9       q=10
Test MSE        65.985091  67.075365  75.850070  68.614055  72.051681  77.651255  80.237003  85.623280  88.091847  90.037087
Validation MSE  30.631036  31.375297  37.461807  32.430413  34.806893  38.730776  40.563161  44.418164  46.201061  47.612732


ARMA model

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import Parallel, delayed

# Assuming 'y' is your DataFrame
# Ensure the 'PM10_Combined_Trend_Residual' column is numeric
y['PM10_Combined_Trend_Residual'] = pd.to_numeric(y['PM10_Combined_Trend_Residual'], errors='coerce')

# Optionally, handle missing values (e.g., by filling or dropping)
y = y.dropna(subset=['PM10_Combined_Trend_Residual'])  # Drop rows with NaN values in the relevant column
# Or, if you prefer filling NaN values with the forward fill method:
# y['PM10_Combined_Trend_Residual'] = y['PM10_Combined_Trend_Residual'].fillna(method='ffill')

# Define the length of the dataset and split sizes
n = len(y)  # Length of the DataFrame y
train_size_initial = int(n * 0.7)  # 70% of the data for training
test_size = int(n * 0.1)   # 10% of the data for testing
validate_size = int(n * 0.1)  # 10% of the data for validation

# Define results DataFrame to store MSE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((1, 100)))  # MSE for test set
results_test.columns = [f"p={p}_q={q}" for p in range(1, 11) for q in range(1, 11)]  # Column names for ARMA orders

results_validate = pd.DataFrame(np.zeros((1, 100)))  # MSE for validation set
results_validate.columns = [f"p={p}_q={q}" for p in range(1, 11) for q in range(1, 11)]  # Column names for ARMA orders

# Use the first 50% of the data for the initial training set (50% of the train_size)
train_size_current = int(train_size_initial * 0.5)  # Initial 50% of the train set
testing_start = train_size_current
testing_end = testing_start + test_size

# Expanding the training set by adding the next 10% in each iteration
training = y[:train_size_current]  # Initial training set
testing = y[testing_start:testing_end]  # Testing set
validation_start = testing_end
validation_end = validation_start + validate_size
validation = y[validation_start:validation_end]  # Validation set

# Function to fit ARMA model and compute MSE
def fit_arma(p, q, training_data, testing, validation):
    # Fit ARMA model with (p, 0, q) order (AR=p, differencing=0, MA=q)
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 0, q))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    return mse_test, mse_validate

# Use joblib's Parallel to run the fit_arma function in parallel for different p, q combinations
results = Parallel(n_jobs=-1)(delayed(fit_arma)(p, q, y, testing, validation) for p in range(1, 11) for q in range(1, 11))

# Store the results in the DataFrame
for idx, (mse_test, mse_validate) in enumerate(results):
    p_value = (idx // 10) + 1  # Calculate p from the index
    q_value = (idx % 10) + 1   # Calculate q from the index
    results_test.loc[0, f"p={p_value}_q={q_value}"] = mse_test
    results_validate.loc[0, f"p={p_value}_q={q_value}"] = mse_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=True)
final_results.index = ["Test MSE", "Validation MSE"]

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


SARIMA

In [None]:

# Assuming 'y' is your DataFrame
# Ensure the 'PM10_Combined_Trend_Residual' column is numeric
y['PM10_Combined_Trend_Residual'] = pd.to_numeric(y['PM10_Combined_Trend_Residual'], errors='coerce')

# Optionally, handle missing values (e.g., by filling or dropping)
y = y.dropna(subset=['PM10_Combined_Trend_Residual'])  # Drop rows with NaN values in the relevant column
# Or, if you prefer filling NaN values with the forward fill method:
# y['PM10_Combined_Trend_Residual'] = y['PM10_Combined_Trend_Residual'].fillna(method='ffill')

# Define the length of the dataset and split sizes
n = len(y)  # Length of the DataFrame y
train_size_initial = int(n * 0.7)  # 70% of the data for training
test_size = int(n * 0.1)   # 10% of the data for testing
validate_size = int(n * 0.1)  # 10% of the data for validation

# Define results DataFrame to store MSE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((1, 100)))  # MSE for test set
results_test.columns = [f"p={p}_q={q}_P={P}_Q={Q}_s={s}" for p in range(1, 11) for q in range(1, 11) for P in range(0, 3) for Q in range(0, 3) for s in [24, 720]]

results_validate = pd.DataFrame(np.zeros((1, 100)))  # MSE for validation set
results_validate.columns = [f"p={p}_q={q}_P={P}_Q={Q}_s={s}" for p in range(1, 11) for q in range(1, 11) for P in range(0, 3) for Q in range(0, 3) for s in [24, 720]]

# Use the first 50% of the data for the initial training set (50% of the train_size)
train_size_current = int(train_size_initial * 0.5)  # Initial 50% of the train set
testing_start = train_size_current
testing_end = testing_start + test_size

# Expanding the training set by adding the next 10% in each iteration
training = y[:train_size_current]  # Initial training set
testing = y[testing_start:testing_end]  # Testing set
validation_start = testing_end
validation_end = validation_start + validate_size
validation = y[validation_start:validation_end]  # Validation set

# Function to fit SARIMA model and compute MSE
def fit_sarima(p, q, P, Q, s, training_data, testing, validation):
    # Fit SARIMA model with seasonal order (P, D, Q, s)
    seasonal_order = (P, 1, Q, s)  # D=1 for seasonal differencing
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 1, q), seasonal_order=seasonal_order)
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    return mse_test, mse_validate

# Use joblib's Parallel to run the fit_sarima function in parallel for different p, q, P, Q, and s combinations
results = Parallel(n_jobs=-1)(delayed(fit_sarima)(p, q, P, Q, s, y, testing, validation) 
                              for p in range(1, 11) 
                              for q in range(1, 11) 
                              for P in range(0, 3)  # Seasonal AR order
                              for Q in range(0, 3)  # Seasonal MA order
                              for s in [24, 720])   # Seasonal period (daily and monthly seasonality)

# Store the results in the DataFrame
for idx, (mse_test, mse_validate) in enumerate(results):
    p_value = idx // 72 + 1  # Calculate p from the index
    q_value = (idx % 72) // 24 + 1  # Calculate q from the index
    P_value = (idx % 24) // 8  # Calculate P from the index
    Q_value = (idx % 8) // 4  # Calculate Q from the index
    s_value = [24, 720][idx % 2]  # Seasonal period (either 24 or 720)
    results_test.loc[0, f"p={p_value}_q={q_value}_P={P_value}_Q={Q_value}_s={s_value}"] = mse_test
    results_validate.loc[0, f"p={p_value}_q={q_value}_P={P_value}_Q={Q_value}_s={s_value}"] = mse_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=True)
final_results.index = ["Test MSE", "Validation MSE"]

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format
