In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../Detrended DAta/TargetCutto50MostImpFeatures_DF.csv')

# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [None]:
# Spliting the data
def train_test_val_split(data, train_ratio=0.7, test_ratio=0.2):
    """
    Splits the dataset into Train, Test, and Validation sets.
    Parameters:
        data: DataFrame - The full dataset to split.
        train_ratio: float - Proportion of data to use for training.
        test_ratio: float - Proportion of data to use for testing.
    Returns:
        train_data, test_data, val_data - DataFrames for train, test, and validation splits.
    """
    train_size = int(len(data) * train_ratio)
    test_size = int(len(data) * test_ratio)
    train_data = data[:train_size]
    test_data = data[train_size:train_size + test_size]
    val_data = data[train_size + test_size:]
    return train_data, test_data, val_data

# Call the function to split the data
train_data, test_data, val_data = train_test_val_split(y)


AR model

In [None]:
# Initialize Results DataFrames
results_test = pd.DataFrame(index=["Test MSE", "Test MAE"], columns=np.arange(1, 25))
results_validate = pd.DataFrame(index=["Validation MSE", "Validation MAE"], columns=np.arange(1, 25))

# AR Model Fitting and Error Calculation
def fit_ar_model(p, train_data, test_data, val_data):
    try:
        # Fit AR model
        model = sm.tsa.ARIMA(train_data["PM10_Combined_Trend_Residual"], order=(p, 0, 0))
        res = model.fit()

        # Forecast for test set
        test_forecast = res.forecast(steps=len(test_data))
        test_mse = ((test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values) ** 2).mean()
        test_mae = np.abs(test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values).mean()

        # Forecast for validation set
        val_forecast = res.forecast(steps=len(val_data))
        val_mse = ((val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values) ** 2).mean()
        val_mae = np.abs(val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values).mean()

        return test_mse, test_mae, val_mse, val_mae
    except Exception as e:
        print(f"Error for p={p}: {e}")
        return np.nan, np.nan, np.nan, np.nan

# Run in parallel for AR lags (p) from 1 to 24
results = Parallel(n_jobs=-1)(delayed(fit_ar_model)(p, train_data, test_data, val_data) for p in range(1, 25))

# Store results in DataFrames
for idx, (test_mse, test_mae, val_mse, val_mae) in enumerate(results):
    p_value = idx + 1
    results_test.loc["Test MSE", p_value] = test_mse
    results_test.loc["Test MAE", p_value] = test_mae
    results_validate.loc["Validation MSE", p_value] = val_mse
    results_validate.loc["Validation MAE", p_value] = val_mae

# Combine results into a single DataFrame for display
final_results = pd.concat([results_test, results_validate], axis=0)
print(final_results.to_string(index=True))


MA Model 


In [None]:
# Initialize Results DataFrames
results_test = pd.DataFrame(index=["Test MSE", "Test MAE"], columns=np.arange(1, 25))
results_validate = pd.DataFrame(index=["Validation MSE", "Validation MAE"], columns=np.arange(1, 25))

# MA Model Fitting and Error Calculation
def fit_ma_model(q, train_data, test_data, val_data):
    try:
        # Fit MA model
        model = sm.tsa.ARIMA(train_data["PM10_Combined_Trend_Residual"], order=(0, 0, q))
        res = model.fit()

        # Forecast for test set
        test_forecast = res.forecast(steps=len(test_data))
        test_mse = ((test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values) ** 2).mean()
        test_mae = np.abs(test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values).mean()

        # Forecast for validation set
        val_forecast = res.forecast(steps=len(val_data))
        val_mse = ((val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values) ** 2).mean()
        val_mae = np.abs(val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values).mean()

        return test_mse, test_mae, val_mse, val_mae
    except Exception as e:
        print(f"Error for q={q}: {e}")
        return np.nan, np.nan, np.nan, np.nan

# Run in parallel for MA lags (q) from 1 to 24
results = Parallel(n_jobs=-1)(delayed(fit_ma_model)(q, train_data, test_data, val_data) for q in range(1, 25))

# Store results in DataFrames
for idx, (test_mse, test_mae, val_mse, val_mae) in enumerate(results):
    q_value = idx + 1
    results_test.loc["Test MSE", q_value] = test_mse
    results_test.loc["Test MAE", q_value] = test_mae
    results_validate.loc["Validation MSE", q_value] = val_mse
    results_validate.loc["Validation MAE", q_value] = val_mae

# Combine results into a single DataFrame for display
final_results = pd.concat([results_test, results_validate], axis=0)
print(final_results.to_string(index=True))


                      q=1        q=2        q=3        q=4        q=5        q=6        q=7        q=8        q=9       q=10       q=11       q=12       q=13       q=14       q=15       q=16       q=17       q=18       q=19       q=20       q=21       q=22       q=23       q=24
Test MSE        65.985091  67.075365  75.850070  68.614055  72.051681  77.651255  80.237003  85.623280  88.091847  90.037087  89.543566  88.571507  84.446137  82.698232  85.086645  89.222751  86.498848  85.654038  84.528418  82.907342  80.484712  79.632375  77.503295  78.043470
Test MAE         8.123121   8.189955   8.709195   8.283360   8.488326   8.811995   8.957511   9.253285   9.385726   9.488787   9.462746   9.411244   9.189458   9.093857   9.224242   9.445780   9.300476   9.254947   9.193934   9.105347   8.971327   8.923697   8.803596   8.834222
Validation MSE  30.631036  31.375297  37.461806  32.430413  34.806893  38.730776  40.563161  44.418164  46.201061  47.612732  47.254031  46.548610  43.571465  42.3

ARMA model für mein laptop aktuell nur mit p und q bis 5

In [None]:
# Initialize Results DataFrames
results_test = pd.DataFrame(index=["Test MSE", "Test MAE"], columns=[(p, q) for p in range(1, 6) for q in range(1, 6)])
results_validate = pd.DataFrame(index=["Validation MSE", "Validation MAE"], columns=[(p, q) for p in range(1, 6) for q in range(1, 6)])

# ARMA Model Fitting and Error Calculation
def fit_arma_model(p, q, train_data, test_data, val_data):
    try:
        # Fit ARMA model
        model = sm.tsa.ARIMA(train_data["PM10_Combined_Trend_Residual"], order=(p, 0, q))
        res = model.fit()

        # Forecast for test set
        test_forecast = res.forecast(steps=len(test_data))
        test_mse = ((test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values) ** 2).mean()
        test_mae = np.abs(test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values).mean()

        # Forecast for validation set
        val_forecast = res.forecast(steps=len(val_data))
        val_mse = ((val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values) ** 2).mean()
        val_mae = np.abs(val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values).mean()

        return test_mse, test_mae, val_mse, val_mae
    except Exception as e:
        print(f"Error for (p, q)=({p}, {q}): {e}")
        return np.nan, np.nan, np.nan, np.nan

# Run in parallel for ARMA combinations (p, q) from (1, 1) to (5, 5)
results = Parallel(n_jobs=-1)(
    delayed(fit_arma_model)(p, q, train_data, test_data, val_data) 
    for p in range(1, 6) for q in range(1, 6)
)

# Store results in DataFrames
for idx, (test_mse, test_mae, val_mse, val_mae) in enumerate(results):
    p, q = results_test.columns[idx]
    results_test.loc["Test MSE", (p, q)] = test_mse
    results_test.loc["Test MAE", (p, q)] = test_mae
    results_validate.loc["Validation MSE", (p, q)] = val_mse
    results_validate.loc["Validation MAE", (p, q)] = val_mae

# Combine results into a single DataFrame for display
final_results = pd.concat([results_test, results_validate], axis=0)
print(final_results.to_string(index=True))


KeyboardInterrupt: 