In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

Загрузка  и подготовка данных

In [2]:
df = pd.read_csv("daily_accidents.csv", parse_dates=["CRASH DATE"])
df = df.sort_values("CRASH DATE")
df.set_index("CRASH DATE", inplace=True)
df.index.freq = 'D'
y = df["ACCIDENT_COUNT"]

In [3]:
train_size = int(len(y) * 0.8)
train, test = y.iloc[:train_size], y.iloc[train_size:]

Функция подбора гиперпараметров

In [None]:
def objective(trial):
    p = trial.suggest_int("p", 0, 3)
    q = trial.suggest_int("q", 0, 3)
    d = 1  # фиксировано по ADF
    P = trial.suggest_int("P", 0, 2)
    Q = trial.suggest_int("Q", 0, 2)
    D = trial.suggest_int("D", 0, 1)
    m = 7  # недельная сезонность

    try:
        model = SARIMAX(train,
                        order=(p, d, q),
                        seasonal_order=(P, D, Q, m),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        model_fit = model.fit(disp=False)

        # Проверка сходимости модели
        if not model_fit.mle_retvals['converged']:
            return float("inf")

        preds = model_fit.predict(start=len(train), end=len(y)-1)
        return mean_squared_error(test, preds)

    except:
        return float("inf")


Подбор гиперпараметров

In [11]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print("Лучшие параметры:", study.best_params)

[I 2025-05-06 02:29:49,375] A new study created in memory with name: no-name-5825b210-fd1c-4df7-bfd6-6066641a88e4
[I 2025-05-06 02:29:56,928] Trial 0 finished with value: 1744.6752533528843 and parameters: {'p': 2, 'q': 1, 'P': 1, 'Q': 0, 'D': 1}. Best is trial 0 with value: 1744.6752533528843.
[I 2025-05-06 02:30:03,340] Trial 1 finished with value: 1379.06151194527 and parameters: {'p': 1, 'q': 0, 'P': 0, 'Q': 2, 'D': 1}. Best is trial 1 with value: 1379.06151194527.
[I 2025-05-06 02:30:06,436] Trial 2 finished with value: 1389.3483825791104 and parameters: {'p': 2, 'q': 0, 'P': 1, 'Q': 1, 'D': 0}. Best is trial 1 with value: 1379.06151194527.
[I 2025-05-06 02:30:07,059] Trial 3 finished with value: 3588.1946245306995 and parameters: {'p': 0, 'q': 1, 'P': 0, 'Q': 1, 'D': 0}. Best is trial 1 with value: 1379.06151194527.
[I 2025-05-06 02:30:23,325] Trial 4 finished with value: inf and parameters: {'p': 2, 'q': 0, 'P': 2, 'Q': 2, 'D': 1}. Best is trial 1 with value: 1379.06151194527.
[

Лучшие параметры: {'p': 1, 'q': 0, 'P': 1, 'Q': 1, 'D': 0}


In [12]:
best_params = study.best_params

Обучение

In [15]:
d = 1  # вручную, т.к. не подбирался
best_model = SARIMAX(y,
                     order=(best_params["p"], d, best_params["q"]),
                     seasonal_order=(best_params["P"], best_params["D"], best_params["Q"], 7),  # m = 7
                     enforce_stationarity=False,
                     enforce_invertibility=False)
model_fit = best_model.fit(disp=False)

Предсказание

In [16]:
preds = model_fit.predict(start=len(train), end=len(y)-1)
preds.index = test.index  # выравниваем индексы

Метрики

In [17]:
rmse = np.sqrt(mean_squared_error(test, preds))
mae = mean_absolute_error(test, preds)
r2 = r2_score(test, preds)
corr, _ = pearsonr(test.values, preds.values)
print(f"RMSE: {rmse}, MAE: {mae}, R²: {r2}, Correlation: {corr}")

RMSE: 30.881626907119408, MAE: 24.401035546429853, R²: 0.2407246342734466, Correlation: 0.5800397424182442
