In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm

# Load the CSV file into a DataFrame
y = pd.read_csv('../Detrended DAta/TargetCutto50MostImpFeatures_DF.csv')

# Display the DataFrame
y.head()

Unnamed: 0,Datum,PM10_Combined_Trend_Residual
0,2022-01-01 00:00:00+00:00,75.197962
1,2022-01-01 01:00:00+00:00,51.472071
2,2022-01-01 02:00:00+00:00,32.710483
3,2022-01-01 03:00:00+00:00,24.801767
4,2022-01-01 04:00:00+00:00,9.68366


In [None]:
# Spliting the data
def train_test_val_split(data, train_ratio=0.7, test_ratio=0.2):
    """
    Splits the dataset into Train, Test, and Validation sets.
    Parameters:
        data: DataFrame - The full dataset to split.
        train_ratio: float - Proportion of data to use for training.
        test_ratio: float - Proportion of data to use for testing.
    Returns:
        train_data, test_data, val_data - DataFrames for train, test, and validation splits.
    """
    train_size = int(len(data) * train_ratio)
    test_size = int(len(data) * test_ratio)
    train_data = data[:train_size]
    test_data = data[train_size:train_size + test_size]
    val_data = data[train_size + test_size:]
    return train_data, test_data, val_data

# Call the function to split the data
train_data, test_data, val_data = train_test_val_split(y)


AR model

In [5]:
# Initialize Results DataFrames
results_test = pd.DataFrame(index=["Test MSE", "Test MAE"], columns=np.arange(1, 25))
results_validate = pd.DataFrame(index=["Validation MSE", "Validation MAE"], columns=np.arange(1, 25))

# AR Model Fitting and Error Calculation
def fit_ar_model(p, train_data, test_data, val_data):
    try:
        # Fit AR model
        model = sm.tsa.ARIMA(train_data["PM10_Combined_Trend_Residual"], order=(p, 0, 0))
        res = model.fit()

        # Forecast for test set
        test_forecast = res.forecast(steps=len(test_data))
        test_mse = ((test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values) ** 2).mean()
        test_mae = np.abs(test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values).mean()

        # Forecast for validation set
        val_forecast = res.forecast(steps=len(val_data))
        val_mse = ((val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values) ** 2).mean()
        val_mae = np.abs(val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values).mean()

        return test_mse, test_mae, val_mse, val_mae
    except Exception as e:
        print(f"Error for p={p}: {e}")
        return np.nan, np.nan, np.nan, np.nan

# Run in parallel for AR lags (p) from 1 to 24
results = Parallel(n_jobs=-1)(delayed(fit_ar_model)(p, train_data, test_data, val_data) for p in range(1, 25))

# Store results in DataFrames
for idx, (test_mse, test_mae, val_mse, val_mae) in enumerate(results):
    p_value = idx + 1
    results_test.loc["Test MSE", p_value] = test_mse
    results_test.loc["Test MAE", p_value] = test_mae
    results_validate.loc["Validation MSE", p_value] = val_mse
    results_validate.loc["Validation MAE", p_value] = val_mae

# Combine results into a single DataFrame for display
final_results = pd.concat([results_test, results_validate], axis=0)
print(final_results.to_string(index=True))


                       1          2          3          4          5          6          7          8          9          10         11         12         13         14         15         16         17         18         19         20         21         22         23         24
Test MSE        75.799678  75.661107  75.658828  75.655298  75.642856  75.630336   75.61196   75.58474  75.552603  75.511302  75.443446  75.325239  75.187644  75.089311  75.000972  74.880268  74.773601  74.662847  74.555648  74.453701  74.369358   74.32279  74.298822  74.310638
Test MAE         6.208841   6.196197   6.195977   6.195576   6.194229   6.192906   6.191131   6.188715   6.186147   6.183305   6.178492    6.17036   6.161023   6.154307   6.148161   6.139595   6.131692   6.122664   6.113185   6.102667    6.09146   6.082506   6.071415   6.062875
Validation MSE  37.612108  37.414803  37.409042  37.398266  37.364463  37.330864  37.286039  37.225124  37.161295  37.092591  36.983614  36.812227   36.63508   36.

MA Model 


In [6]:
# Initialize Results DataFrames
results_test = pd.DataFrame(index=["Test MSE", "Test MAE"], columns=np.arange(1, 25))
results_validate = pd.DataFrame(index=["Validation MSE", "Validation MAE"], columns=np.arange(1, 25))

# MA Model Fitting and Error Calculation
def fit_ma_model(q, train_data, test_data, val_data):
    try:
        # Fit MA model
        model = sm.tsa.ARIMA(train_data["PM10_Combined_Trend_Residual"], order=(0, 0, q))
        res = model.fit()

        # Forecast for test set
        test_forecast = res.forecast(steps=len(test_data))
        test_mse = ((test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values) ** 2).mean()
        test_mae = np.abs(test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values).mean()

        # Forecast for validation set
        val_forecast = res.forecast(steps=len(val_data))
        val_mse = ((val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values) ** 2).mean()
        val_mae = np.abs(val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values).mean()

        return test_mse, test_mae, val_mse, val_mae
    except Exception as e:
        print(f"Error for q={q}: {e}")
        return np.nan, np.nan, np.nan, np.nan

# Run in parallel for MA lags (q) from 1 to 24
results = Parallel(n_jobs=-1)(delayed(fit_ma_model)(q, train_data, test_data, val_data) for q in range(1, 25))

# Store results in DataFrames
for idx, (test_mse, test_mae, val_mse, val_mae) in enumerate(results):
    q_value = idx + 1
    results_test.loc["Test MSE", q_value] = test_mse
    results_test.loc["Test MAE", q_value] = test_mae
    results_validate.loc["Validation MSE", q_value] = val_mse
    results_validate.loc["Validation MAE", q_value] = val_mae

# Combine results into a single DataFrame for display
final_results = pd.concat([results_test, results_validate], axis=0)
print(final_results.to_string(index=True))


KeyboardInterrupt: 

ARMA model für mein laptop aktuell nur mit p und q bis 5

In [None]:
# Initialize Results DataFrames
results_test = pd.DataFrame(index=["Test MSE", "Test MAE"], columns=[(p, q) for p in range(1, 6) for q in range(1, 6)])
results_validate = pd.DataFrame(index=["Validation MSE", "Validation MAE"], columns=[(p, q) for p in range(1, 6) for q in range(1, 6)])

# ARMA Model Fitting and Error Calculation
def fit_arma_model(p, q, train_data, test_data, val_data):
    try:
        # Fit ARMA model
        model = sm.tsa.ARIMA(train_data["PM10_Combined_Trend_Residual"], order=(p, 0, q))
        res = model.fit()

        # Forecast for test set
        test_forecast = res.forecast(steps=len(test_data))
        test_mse = ((test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values) ** 2).mean()
        test_mae = np.abs(test_data["PM10_Combined_Trend_Residual"].values - test_forecast.values).mean()

        # Forecast for validation set
        val_forecast = res.forecast(steps=len(val_data))
        val_mse = ((val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values) ** 2).mean()
        val_mae = np.abs(val_data["PM10_Combined_Trend_Residual"].values - val_forecast.values).mean()

        return test_mse, test_mae, val_mse, val_mae
    except Exception as e:
        print(f"Error for (p, q)=({p}, {q}): {e}")
        return np.nan, np.nan, np.nan, np.nan

# Run in parallel for ARMA combinations (p, q) from (1, 1) to (5, 5)
results = Parallel(n_jobs=-1)(
    delayed(fit_arma_model)(p, q, train_data, test_data, val_data) 
    for p in range(1, 6) for q in range(1, 6)
)

# Store results in DataFrames
for idx, (test_mse, test_mae, val_mse, val_mae) in enumerate(results):
    p, q = results_test.columns[idx]
    results_test.loc["Test MSE", (p, q)] = test_mse
    results_test.loc["Test MAE", (p, q)] = test_mae
    results_validate.loc["Validation MSE", (p, q)] = val_mse
    results_validate.loc["Validation MAE", (p, q)] = val_mae

# Combine results into a single DataFrame for display
final_results = pd.concat([results_test, results_validate], axis=0)
print(final_results.to_string(index=True))


KeyboardInterrupt: 