In [6]:
import pandas as pd

# Load the CSV file into a DataFrame
x = pd.read_csv('../AR/50MostImpFeatures_DF.csv')
y = pd.read_csv('../AR/TargetCutto50MostImpFeatures_DF.csv')

y.head()


Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [11]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Define the order for ARIMA and SARIMA models
arima_order = (1, 1, 1)
sarima_order = (1, 1, 1)
seasonal_order = (1, 1, 1, 12)  # Example seasonal order

# Fit ARIMAX model
arimax_model = ARIMA(endog=y['PM10_Combined_Trend_Residual'], exog=x, order=arima_order)
arimax_fit = arimax_model.fit()

# Fit SARIMAX model
sarimax_model = SARIMAX(endog=y['PM10_Combined_Trend_Residual'], exog=x, order=sarima_order, seasonal_order=seasonal_order)
sarimax_fit = sarimax_model.fit(disp=False)

# Make predictions
arimax_predictions = arimax_fit.forecast(steps=len(y_validate), exog=x_validate)
sarimax_predictions = sarimax_fit.forecast(steps=len(y_validate), exog=x_validate)

# Calculate MSE
arimax_mse = mean_squared_error(y_validate['PM10_Combined_Trend_Residual'], arimax_predictions)
sarimax_mse = mean_squared_error(y_validate['PM10_Combined_Trend_Residual'], sarimax_predictions)

print(f"ARIMAX MSE: {arimax_mse}")
print(f"SARIMAX MSE: {sarimax_mse}")



ARIMAX MSE: 28.737498819413833
SARIMAX MSE: 33.32387913169761


In [10]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np

# Ensure data is numeric and handle missing values
x = x.apply(pd.to_numeric, errors='coerce').fillna(0)
y = y.apply(pd.to_numeric, errors='coerce').fillna(0)

# Train-test-validation split for x (features) and y (target variable)
train_size = int(len(y) * 0.7)
test_size = int(len(y) * 0.2)
validate_size = len(y) - train_size - test_size

x_train, x_test, x_validate = np.split(x, [train_size, train_size + test_size])
y_train, y_test, y_validate = np.split(y, [train_size, train_size + test_size])

# Function to fit and predict using ARIMA model
def fit_arima(train, order):
    model = ARIMA(train, order=order)
    model_fit = model.fit()
    return model_fit

# Function to fit and predict using SARIMA model
def fit_sarima(train, order, seasonal_order):
    model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
    model_fit = model.fit(disp=False)
    return model_fit

# Expanding cross-validation
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(y_train):
    # Use .iloc for position-based indexing
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]
    x_train_cv, x_test_cv = x_train.iloc[train_index], x_train.iloc[test_index]

    # Fit ARIMA model
    ar_model = fit_arima(y_train_cv, order=(1, 0, 0))
    ar_predictions = ar_model.forecast(steps=len(y_test_cv))
    
    # Fit MA model
    ma_model = fit_arima(y_train_cv, order=(0, 0, 1))
    ma_predictions = ma_model.forecast(steps=len(y_test_cv))
    
    # Fit ARMA model
    arma_model = fit_arima(y_train_cv, order=(1, 0, 1))
    arma_predictions = arma_model.forecast(steps=len(y_test_cv))
    
    # Fit SARIMA model (ensure univariate time series)
    sarima_model = fit_sarima(y_train_cv['PM10_Combined_Trend_Residual'], order=(1, 0, 1), seasonal_order=(1, 1, 1, 12))
    sarima_predictions = sarima_model.forecast(steps=len(y_test_cv))
    
    # Evaluate models on the test set (y_test_cv)
    ar_mse_test = mean_squared_error(y_test_cv, ar_predictions)
    ma_mse_test = mean_squared_error(y_test_cv, ma_predictions)
    arma_mse_test = mean_squared_error(y_test_cv, arma_predictions)
    sarima_mse_test = mean_squared_error(y_test_cv, sarima_predictions)

    # Evaluate models on the validation set (y_validate)
    ar_mse_validate = mean_squared_error(y_validate, ar_model.forecast(steps=len(y_validate)))
    ma_mse_validate = mean_squared_error(y_validate, ma_model.forecast(steps=len(y_validate)))
    arma_mse_validate = mean_squared_error(y_validate, arma_model.forecast(steps=len(y_validate)))
    sarima_mse_validate = mean_squared_error(y_validate, sarima_model.forecast(steps=len(y_validate)))

    # Print results
    print("AR MSE Test:", ar_mse_test)
    print("MA MSE Test:", ma_mse_test)
    print("ARMA MSE Test:", arma_mse_test)
    print("SARIMA MSE Test:", sarima_mse_test)
    print("AR MSE Validate:", ar_mse_validate)
    print("MA MSE Validate:", ma_mse_validate)
    print("ARMA MSE Validate:", arma_mse_validate)
    print("SARIMA MSE Validate:", sarima_mse_validate)


ValueError: SARIMAX models require univariate `endog`. Got shape (2871, 2).

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Train-test-validation split
train_size = int(len(y) * 0.7)
test_size = int(len(y) * 0.2)
validate_size = len(y) - train_size - test_size

train, test, validate = np.split(y['PM10_Combined_Trend_Residual'], [train_size, train_size + test_size])

# Function to fit and predict using ARIMA model
def fit_arima(train, order):
    model = ARIMA(train, order=order)
    model_fit = model.fit()
    return model_fit

# Function to fit and predict using SARIMA model
def fit_sarima(train, order, seasonal_order):
    model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
    model_fit = model.fit()
    return model_fit

# Expanding cross-validation
tscv = TimeSeriesSplit(n_splits=5)
fold = 1

print(f"{'Fold':<6} {'Model':<8} {'MSE Test':<10} {'MSE Validate':<15}")
print("=" * 40)

for train_index, test_index in tscv.split(train):
    train_cv, test_cv = train[train_index], train[test_index]
    
    models = {
        "AR": fit_arima(train_cv, order=(1, 0, 0)),
        "MA": fit_arima(train_cv, order=(0, 0, 1)),
        "ARMA": fit_arima(train_cv, order=(1, 0, 1)),
        "SARIMA": fit_sarima(train_cv, order=(1, 0, 1), seasonal_order=(1, 1, 1, 12))
    }

    for model_name, model_fit in models.items():
        predictions_test = model_fit.forecast(steps=len(test_cv))
        predictions_validate = model_fit.forecast(steps=len(validate))
        
        # Calculate MSE
        mse_test = mean_squared_error(test_cv, predictions_test)
        mse_validate = mean_squared_error(validate, predictions_validate)
        
        print(f"{fold:<6} {model_name:<8} {mse_test:<10.4f} {mse_validate:<15.4f}")
    fold += 1


  return bound(*args, **kwds)


Fold   Model    MSE Test   MSE Validate   
1      AR       40.5705    89.5497        
1      MA       38.2466    85.3100        
1      ARMA     40.7036    89.8140        
1      SARIMA   38.6565    84.9547        
2      AR       27.9202    58.7961        
2      MA       27.6984    57.7342        
2      ARMA     28.0045    59.1488        
2      SARIMA   27.1645    39.7247        
3      AR       72.1517    53.5400        
3      MA       72.4485    53.3734        
3      ARMA     72.0366    53.5554        
3      SARIMA   73.1516    56.2175        
4      AR       52.0502    47.0926        
4      MA       52.2094    46.9250        
4      ARMA     51.8040    46.4819        
4      SARIMA   60.0484    20.0369        
5      AR       48.0416    44.4847        
5      MA       48.1294    43.3521        
5      ARMA     48.0965    45.4195        
5      SARIMA   36.2654    30.0752        


```python
mean_target = y['PM10_Combined_Trend_Residual'].mean()
mean_target
```

In [None]:

mean_target = y['PM10_Combined_Trend_Residual'].mean()
mean_target

14.481885093511329

In [None]:
std_target = y['PM10_Combined_Trend_Residual'].std()
std_target

7.294967954185212

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import numpy as np

# Train-test-validation split
train_size = int(len(y) * 0.7)
test_size = int(len(y) * 0.2)
validate_size = len(y) - train_size - test_size

train, test, validate = np.split(y['PM10_Combined_Trend_Residual'], [train_size, train_size + test_size])

# Function to fit and predict using ARIMA model
def fit_arima(train, order):
    model = ARIMA(train, order=order)
    model_fit = model.fit()
    return model_fit

# Function to fit and predict using SARIMA model
def fit_sarima(train, order, seasonal_order):
    model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
    model_fit = model.fit()
    return model_fit

# Expanding cross-validation parameters
initial_train_size = int(len(train) * 0.5)  # Use 50% of training data as the initial training size
test_size = int(len(train) * 0.1)  # Test size (adjust as needed)
n_splits = (len(train) - initial_train_size) // test_size

fold = 1
print(f"{'Fold':<6} {'Model':<8} {'MSE Test':<10} {'MSE Validate':<15}")
print("=" * 40)

# Expanding Cross-Validation
for fold in range(1, n_splits + 1):
    # Define training and test sets for each fold
    train_end = initial_train_size + (fold - 1) * test_size
    test_start = train_end
    test_end = train_end + test_size

    train_cv = train[:train_end]
    test_cv = train[test_start:test_end]

    # Fit models for this fold
    models = {
        "AR": fit_arima(train_cv, order=(1, 0, 0)),
        "MA": fit_arima(train_cv, order=(0, 0, 1)),
        "ARMA": fit_arima(train_cv, order=(1, 0, 1)),
        "SARIMA": fit_sarima(train_cv, order=(1, 0, 1), seasonal_order=(1, 1, 1, 24))
    }

    for model_name, model_fit in models.items():
        predictions_test = model_fit.forecast(steps=len(test_cv))
        predictions_validate = model_fit.forecast(steps=len(validate))
        
        # Calculate MSE
        mse_test = mean_squared_error(test_cv, predictions_test)
        mse_validate = mean_squared_error(validate, predictions_validate)
        
        print(f"{fold:<6} {model_name:<8} {mse_test:<10.4f} {mse_validate:<15.4f}")


  return bound(*args, **kwds)


Fold   Model    MSE Test   MSE Validate   
1      AR       84.2168    53.7141        
1      MA       84.9976    53.4333        
1      ARMA     83.9558    53.7641        
1      SARIMA   83.6112    54.8880        
2      AR       53.6145    55.5490        
2      MA       52.6968    54.2958        
2      ARMA     53.7630    55.7384        
2      SARIMA   112.9097   108.8282       


KeyboardInterrupt: 

In [None]:


from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import numpy as np
import itertools

# Example dataset (replace 'y' with your actual dataset)
n_points = 100
data = np.arange(n_points)

# Train-test-validation split
train_size = int(len(data) * 0.7)
test_size = int(len(data) * 0.2)
validate_size = len(data) - train_size - test_size

train, test, validate = np.split(data, [train_size, train_size + test_size])

# Define hyperparameter ranges
p = d = q = range(0, 2)  # Smaller ranges for ARIMA(p, d, q)
P = D = Q = range(0, 1)  # Smaller ranges for SARIMA(P, D, Q)
m = [24]  # Seasonal period (hourly data with daily seasonality)

# Generate all ARIMA combinations
arima_combinations = list(itertools.product(p, d, q))
sarima_combinations = list(itertools.product(p, d, q, P, D, Q, m))

# Expanding cross-validation parameters
initial_train_size = int(len(train) * 0.5)  # 50% initial train size
test_size = int(len(train) * 0.1)  # Test size
n_splits = (len(train) - initial_train_size) // test_size

# Function to fit ARIMA model
def fit_arima(train, order):
    try:
        model = ARIMA(train, order=order)
        model_fit = model.fit()
        return model_fit
    except Exception as e:
        print(f"ARIMA failed for order={order}: {e}")
        return None

# Function to fit SARIMA model
def fit_sarima(train, order, seasonal_order):
    try:
        model = SARIMAX(train, order=order, seasonal_order=seasonal_order)
        model_fit = model.fit(disp=False, maxiter=500)
        return model_fit
    except Exception as e:
        print(f"SARIMA failed for order={order}, seasonal_order={seasonal_order}: {e}")
        return None

# Hyperparameter tuning
best_arima = None
best_arima_mse = float("inf")
best_arima_params = None

best_sarima = None
best_sarima_mse = float("inf")
best_sarima_params = None

# Loop through ARIMA hyperparameter combinations
for order in arima_combinations:
    fold_mse = []
    for fold in range(1, n_splits + 1):
        # Expanding window logic
        train_end = initial_train_size + (fold - 1) * test_size
        test_start = train_end
        test_end = train_end + test_size

        train_cv = train[:train_end]
        test_cv = train[test_start:test_end]

        # Fit ARIMA model
        arima_model = fit_arima(train_cv, order=order)
        if arima_model is not None:
            predictions = arima_model.forecast(steps=len(test_cv))
            mse = mean_squared_error(test_cv, predictions)
            fold_mse.append(mse)

    # Average MSE across folds
    avg_mse = np.mean(fold_mse) if fold_mse else float("inf")
    if avg_mse < best_arima_mse:
        best_arima_mse = avg_mse
        best_arima_params = order

# Loop through SARIMA hyperparameter combinations
for order in itertools.product(p, d, q):
    for seasonal_order in itertools.product(P, D, Q, m):
        fold_mse = []
        for fold in range(1, n_splits + 1):
            train_end = initial_train_size + (fold - 1) * test_size
            test_start = train_end
            test_end = train_end + test_size

            train_cv = train[:train_end]
            test_cv = train[test_start:test_end]

            # Fit SARIMA model
            sarima_model = fit_sarima(train_cv, order=order, seasonal_order=seasonal_order)
            if sarima_model is not None:
                predictions = sarima_model.forecast(steps=len(test_cv))
                mse = mean_squared_error(test_cv, predictions)
                fold_mse.append(mse)

        # Average MSE across folds
        avg_mse = np.mean(fold_mse) if fold_mse else float("inf")
        if avg_mse < best_sarima_mse:
            best_sarima_mse = avg_mse
            best_sarima_params = (order, seasonal_order)

# Final evaluation on validation set using best parameters
print(f"Best ARIMA Params: {best_arima_params} with MSE {best_arima_mse}")
print(f"Best SARIMA Params: {best_sarima_params} with MSE {best_sarima_mse}")

final_arima_model = fit_arima(train, order=best_arima_params)
final_arima_predictions = final_arima_model.forecast(steps=len(validate)) if final_arima_model else []
final_arima_mse = mean_squared_error(validate, final_arima_predictions) if final_arima_model else float("inf")

final_sarima_model = fit_sarima(train, order=best_sarima_params[0], seasonal_order=best_sarima_params[1])
final_sarima_predictions = final_sarima_model.forecast(steps=len(validate)) if final_sarima_model else []
final_sarima_mse = mean_squared_error(validate, final_sarima_predictions) if final_sarima_model else float("inf")

print(f"Validation Results: ARIMA MSE = {final_arima_mse}, SARIMA MSE = {final_sarima_mse}")


  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregres

Best ARIMA Params: (1, 1, 1) with MSE 1.0001175376862573e-05
Best SARIMA Params: ((1, 1, 1), (0, 0, 0, 24)) with MSE 1.0001175376862573e-05


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'


Validation Results: ARIMA MSE = 400.09975129206083, SARIMA MSE = 400.09975129206083


