In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../Detrended DAta/TargetCutto50MostImpFeatures_DF.csv')

# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [12]:
from statsmodels.tsa.stattools import adfuller

# Perform the Augmented Dickey-Fuller test
adf_test = adfuller(y['PM10_Combined_Trend_Residual'])

# Extract and print the test statistic and p-value
adf_statistic = adf_test[0]
p_value = adf_test[1]

print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')

# Check if the p-value is less than the significance level (e.g., 0.05)
if p_value < 0.05:
    print("The series is stationary.")
else:
    print("The series is not stationary.")

ADF Statistic: -4.604578602914901
p-value: 0.00012668277472821596
The series is stationary.


AR model

In [None]:
# Define results DataFrame to store MSE and MAE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for test set
results_test.columns = np.arange(1, 25)
results_test.index = ["Test MSE", "Test MAE"]

results_validate = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for validation set
results_validate.columns = np.arange(1, 25)
results_validate.index = ["Validation MSE", "Validation MAE"]

# Function to fit ARIMA model and compute MSE and MAE
def fit_arima(p, training_data, testing, validation):
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 0, 0))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    # Calculate Mean Absolute Error for the test set
    mae_test = np.abs(testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values).mean()
    
    # Calculate Mean Absolute Error for the validation set
    mae_validate = np.abs(validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values).mean()
    
    return mse_test, mae_test, mse_validate, mae_validate

# Use joblib's Parallel to run the fit_arima function in parallel for different p values
results = Parallel(n_jobs=-1)(delayed(fit_arima)(p, y, testing, validation) for p in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mae_test, mse_validate, mae_validate) in enumerate(results):
    p_value = idx + 1
    results_test.loc["Test MSE", p_value] = mse_test
    results_test.loc["Test MAE", p_value] = mae_test
    results_validate.loc["Validation MSE", p_value] = mse_validate
    results_validate.loc["Validation MAE", p_value] = mae_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=False)

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


#runtime 2m 30s


                       1          2          3          4          5          6          7          8          9          10         11         12         13         14         15         16         17         18         19         20         21         22         23         24
Test MSE        83.520059  80.194799  82.802817  82.484601  82.148029  78.614943  75.363332  73.478573  72.069796  70.744455  69.440469  70.296847  71.530207  73.199264  70.980494  66.884957  64.702587  67.133300  66.330506  67.377807  69.670836  70.746059  71.658436  73.719455
Test MAE         9.138931   8.955155   9.099605   9.082103   9.063555   8.866507   8.681206   8.571964   8.489393   8.410972   8.333095   8.384321   8.457553   8.555657   8.424992   8.178322   8.043792   8.193491   8.144354   8.208399   8.346906   8.411068   8.465131   8.586003
Validation MSE  42.906973  40.533155  42.393327  42.165722  41.925179  39.412246  37.119972  35.800775  34.819484  33.900142  32.999342  33.590509  34.444736  35.6

MA Model 


In [None]:
# Define results DataFrame to store MSE and MAE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for test set
results_test.columns = [f"q={q}" for q in range(1, 25)]  # Column names for MA order
results_test.index = ["Test MSE", "Test MAE"]

results_validate = pd.DataFrame(np.zeros((2, 24)))  # MSE and MAE for validation set
results_validate.columns = [f"q={q}" for q in range(1, 25)]  # Column names for MA order
results_validate.index = ["Validation MSE", "Validation MAE"]

# Function to fit MA model and compute MSE and MAE
def fit_ma(q, training_data, testing, validation):
    # Fit MA model with (0, 0, q) order (AR=0, differencing=0, MA=q)
    mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(0, 0, q))
    res = mod.fit()
    
    # One-step ahead forecast for the testing set
    forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # One-step ahead forecast for the validation set
    forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
    
    # Calculate Mean Squared Error for the test set
    mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
    
    # Calculate Mean Squared Error for the validation set
    mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
    
    # Calculate Mean Absolute Error for the test set
    mae_test = np.abs(testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values).mean()
    
    # Calculate Mean Absolute Error for the validation set
    mae_validate = np.abs(validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values).mean()
    
    return mse_test, mae_test, mse_validate, mae_validate

# Use joblib's Parallel to run the fit_ma function in parallel for different q values
results = Parallel(n_jobs=-1)(delayed(fit_ma)(q, y, testing, validation) for q in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mae_test, mse_validate, mae_validate) in enumerate(results):
    q_value = idx + 1
    results_test.loc["Test MSE", f"q={q_value}"] = mse_test
    results_test.loc["Test MAE", f"q={q_value}"] = mae_test
    results_validate.loc["Validation MSE", f"q={q_value}"] = mse_validate
    results_validate.loc["Validation MAE", f"q={q_value}"] = mae_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=False)

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


#runtime 3m 20s

                      q=1        q=2        q=3        q=4        q=5        q=6        q=7        q=8        q=9       q=10       q=11       q=12       q=13       q=14       q=15       q=16       q=17       q=18       q=19       q=20       q=21       q=22       q=23       q=24
Test MSE        65.985091  67.075365  75.850070  68.614055  72.051681  77.651255  80.237003  85.623280  88.091847  90.037087  89.543566  88.571507  84.446137  82.698232  85.086645  89.222751  86.498848  85.654038  84.528418  82.907342  80.484712  79.632375  77.503295  78.043470
Test MAE         8.123121   8.189955   8.709195   8.283360   8.488326   8.811995   8.957511   9.253285   9.385726   9.488787   9.462746   9.411244   9.189458   9.093857   9.224242   9.445780   9.300476   9.254947   9.193934   9.105347   8.971327   8.923697   8.803596   8.834222
Validation MSE  30.631036  31.375297  37.461806  32.430413  34.806893  38.730776  40.563161  44.418164  46.201061  47.612732  47.254031  46.548610  43.571465  42.3

ARMA model

In [24]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import Parallel, delayed

# Assuming 'y' is your DataFrame
# Ensure the 'PM10_Combined_Trend_Residual' column is numeric
y['PM10_Combined_Trend_Residual'] = pd.to_numeric(y['PM10_Combined_Trend_Residual'], errors='coerce')

# Optionally, handle missing values (e.g., by filling or dropping)
y = y.dropna(subset=['PM10_Combined_Trend_Residual'])  # Drop rows with NaN values in the relevant column
# Or, if you prefer filling NaN values with the forward fill method:
# y['PM10_Combined_Trend_Residual'] = y['PM10_Combined_Trend_Residual'].fillna(method='ffill')

# Define the length of the dataset and split sizes
n = len(y)  # Length of the DataFrame y
train_size_initial = int(n * 0.7)  # 70% of the data for training
test_size = int(n * 0.1)   # 10% of the data for testing
validate_size = int(n * 0.1)  # 10% of the data for validation

# Define results DataFrame to store MSE for test and validation sets separately
results_test = pd.DataFrame(np.zeros((1, 576)))  # MSE for test set
results_test.columns = [f"p={p}_q={q}" for p in range(1, 25) for q in range(1, 25)]  # Column names for ARMA orders

results_validate = pd.DataFrame(np.zeros((1, 576)))  # MSE for validation set
results_validate.columns = [f"p={p}_q={q}" for p in range(1, 25) for q in range(1, 25)]  # Column names for ARMA orders

# Use the first 50% of the data for the initial training set (50% of the train_size)
train_size_current = int(train_size_initial * 0.5)  # Initial 50% of the train set
testing_start = train_size_current
testing_end = testing_start + test_size

# Expanding the training set by adding the next 10% in each iteration
training = y[:train_size_current]  # Initial training set
testing = y[testing_start:testing_end]  # Testing set
validation_start = testing_end
validation_end = validation_start + validate_size
validation = y[validation_start:validation_end]  # Validation set

# Function to fit ARMA model and compute MSE
def fit_arma(p, q, training_data, testing, validation):
    try:
        # Fit ARMA model with (p, 0, q) order (AR=p, differencing=0, MA=q)
        mod = sm.tsa.ARIMA(training_data['PM10_Combined_Trend_Residual'], order=(p, 0, q))
        res = mod.fit()
        
        # One-step ahead forecast for the testing set
        forecast_test = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
        
        # One-step ahead forecast for the validation set
        forecast_validate = res.forecast(steps=1, signal_only=False)  # Forecasting just 1 step ahead
        
        # Calculate Mean Squared Error for the test set
        mse_test = ((testing['PM10_Combined_Trend_Residual'].values[:1] - forecast_test.values) ** 2).mean()
        
        # Calculate Mean Squared Error for the validation set
        mse_validate = ((validation['PM10_Combined_Trend_Residual'].values[:1] - forecast_validate.values) ** 2).mean()
        
    except np.linalg.LinAlgError:
        mse_test = np.nan
        mse_validate = np.nan
    
    return mse_test, mse_validate

# Use joblib's Parallel to run the fit_arma function in parallel for different p, q combinations
# Use joblib's Parallel to run the fit_arma function in parallel for different p, q combinations
results = Parallel(n_jobs=-1)(delayed(fit_arma)(p, q, training, testing, validation) for p in range(1, 25) for q in range(1, 25))

# Store the results in the DataFrame
for idx, (mse_test, mse_validate) in enumerate(results):
    p_value = (idx // 24) + 1  # Calculate p from the index
    q_value = (idx % 24) + 1   # Calculate q from the index
    results_test.loc[0, f"p={p_value}_q={q_value}"] = mse_test
    results_validate.loc[0, f"p={p_value}_q={q_value}"] = mse_validate

# Combine the results into one DataFrame for better display
final_results = pd.concat([results_test, results_validate], axis=0, ignore_index=True)
final_results.index = ["Test MSE", "Validation MSE"]

# Print the results in a more readable format
print(final_results.to_string(index=True))  # Display the results in a tabular format


KeyboardInterrupt: 