In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings("ignore")

Загрузка и подготовка данных

In [None]:
df = pd.read_csv("daily_accidents_hol_dw_week.csv", parse_dates=["CRASH DATE"])
df.sort_values("CRASH DATE", inplace=True)
df.set_index("CRASH DATE", inplace=True)
df.index.freq = "D"

target = df["CRASH_COUNT"]
exog_vars = ["is_weekend", "month", "is_holiday"]
exog = df[exog_vars]

train_size = int(len(df) * 0.8)
y_train, y_test = target[:train_size], target[train_size:]
exog_train, exog_test = exog[:train_size], exog[train_size:]

Функция подбора гиперпараметров

In [None]:
def objective(trial):
    p = trial.suggest_int("p", 0, 3)
    q = trial.suggest_int("q", 0, 3)
    d = 1  
    P = trial.suggest_int("P", 0, 2)
    Q = trial.suggest_int("Q", 0, 2)
    D = trial.suggest_int("D", 0, 1)
    m = 7 

    try:
        model = SARIMAX(y_train,
                        exog=exog_train,
                        order=(p, d, q),
                        seasonal_order=(P, D, Q, m),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        model_fit = model.fit(disp=False)

        if not model_fit.mle_retvals['converged']:
            return float("inf")

        preds = model_fit.predict(start=len(y_train), end=len(target)-1, exog=exog_test)
        return mean_squared_error(y_test, preds)

    except:
        return float("inf")

Подбор гиперпараметров

In [6]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[I 2025-05-06 13:59:23,647] A new study created in memory with name: no-name-d0ea56ee-620f-4f1b-9417-27079ca5d86e
[I 2025-05-06 13:59:53,956] Trial 0 finished with value: inf and parameters: {'p': 2, 'q': 1, 'P': 1, 'Q': 2, 'D': 0}. Best is trial 0 with value: inf.
[I 2025-05-06 14:00:05,647] Trial 1 finished with value: inf and parameters: {'p': 3, 'q': 0, 'P': 1, 'Q': 2, 'D': 0}. Best is trial 0 with value: inf.
[I 2025-05-06 14:00:12,260] Trial 2 finished with value: 3036.033330457284 and parameters: {'p': 3, 'q': 3, 'P': 0, 'Q': 0, 'D': 0}. Best is trial 2 with value: 3036.033330457284.
[I 2025-05-06 14:00:27,178] Trial 3 finished with value: inf and parameters: {'p': 0, 'q': 3, 'P': 2, 'Q': 1, 'D': 0}. Best is trial 2 with value: 3036.033330457284.
[I 2025-05-06 14:00:59,597] Trial 4 finished with value: 1279.9920991899896 and parameters: {'p': 2, 'q': 1, 'P': 2, 'Q': 0, 'D': 1}. Best is trial 4 with value: 1279.9920991899896.
[I 2025-05-06 14:01:15,981] Trial 5 finished with valu

Обучение

In [7]:
best_params = study.best_params
d = 1
m = 7

final_model = SARIMAX(target,
                      exog=exog,
                      order=(best_params["p"], d, best_params["q"]),
                      seasonal_order=(best_params["P"], best_params["D"], best_params["Q"], m),
                      enforce_stationarity=False,
                      enforce_invertibility=False)
model_fit = final_model.fit(disp=False)

Предсказание

In [8]:
forecast = model_fit.predict(start=len(y_train), end=len(target)-1, exog=exog_test)

Метрики

In [9]:
rmse = np.sqrt(mean_squared_error(y_test, forecast))
mae = mean_absolute_error(y_test, forecast)
r2 = r2_score(y_test, forecast)
corr, _ = pearsonr(y_test.values, forecast.values)

In [None]:
print(f"LSTM RMSE: {rmse}, MAE: {mae}, R²: {r2}, Correlation: {corr}")

LSTM RMSE: 31.23807833100531, MAE: 24.67303706067422, R²: 0.22309559252346722, Correlation: 0.5452030794547537
