Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create an issue when database backups fail because the system runs out of resources #109020

Merged
merged 5 commits into from Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion homeassistant/components/recorder/core.py
Expand Up @@ -119,6 +119,7 @@
WaitTask,
)
from .util import (
async_create_backup_failure_issue,
build_mysqldb_conv,
dburl_to_path,
end_incomplete_runs,
Expand Down Expand Up @@ -1006,9 +1007,11 @@ def _lock_database(self, task: DatabaseLockTask) -> None:
def _async_set_database_locked(task: DatabaseLockTask) -> None:
task.database_locked.set()

local_start_time = dt_util.now()
hass = self.hass
with write_lock_db_sqlite(self):
# Notify that lock is being held, wait until database can be used again.
self.hass.add_job(_async_set_database_locked, task)
hass.add_job(_async_set_database_locked, task)
while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
if self._reached_max_backlog_percentage(90):
_LOGGER.warning(
Expand All @@ -1020,6 +1023,9 @@ def _async_set_database_locked(task: DatabaseLockTask) -> None:
self.backlog,
)
task.queue_overflow = True
hass.add_job(
async_create_backup_failure_issue, self.hass, local_start_time
)
break
_LOGGER.info(
"Database queue backlog reached %d entries during backup",
Expand Down
4 changes: 4 additions & 0 deletions homeassistant/components/recorder/strings.json
Expand Up @@ -12,6 +12,10 @@
"maria_db_range_index_regression": {
"title": "Update MariaDB to {min_version} or later to resolve a significant performance issue",
"description": "Older versions of MariaDB suffer from a significant performance regression when retrieving history data or purging the database. Update to MariaDB version {min_version} or later and restart Home Assistant. If you are using the MariaDB core add-on, make sure to update it to the latest version."
},
"backup_failed_out_of_resources": {
"title": "Database backup failed due to lack of resources",
"description": "The database backup started at {start_time} failed due to lack of resources. The backup cannot be trusted and must be restarted. This can happen if the database is too large or if the system is under heavy load. Consider upgrading the system hardware or reducing the size of the database by decreasing the number of history days to keep or creating a filter."
}
},
"services": {
Expand Down
18 changes: 18 additions & 0 deletions homeassistant/components/recorder/util.py
Expand Up @@ -470,6 +470,24 @@ def _async_create_mariadb_range_index_regression_issue(
)


@callback
def async_create_backup_failure_issue(
    hass: HomeAssistant,
    local_start_time: datetime,
) -> None:
    """Register a repair issue for a backup that failed from lack of resources.

    The issue is created under ``DOMAIN`` as a non-fixable, critical issue.
    ``local_start_time`` is stringified and handed to the translation as the
    ``start_time`` placeholder so the message can say when the backup began.
    """
    # The issue id and the translation key are intentionally the same string.
    issue_key = "backup_failed_out_of_resources"
    placeholders = {"start_time": str(local_start_time)}
    ir.async_create_issue(
        hass,
        DOMAIN,
        issue_key,
        is_fixable=False,
        severity=ir.IssueSeverity.CRITICAL,
        learn_more_url="https://www.home-assistant.io/integrations/recorder",
        translation_key=issue_key,
        translation_placeholders=placeholders,
    )


def setup_connection_for_dialect(
instance: Recorder,
dialect_name: str,
Expand Down
17 changes: 17 additions & 0 deletions tests/components/recorder/test_init.py
Expand Up @@ -73,6 +73,7 @@
)
from homeassistant.core import Context, CoreState, Event, HomeAssistant, callback
from homeassistant.helpers import entity_registry as er, recorder as recorder_helper
from homeassistant.helpers.issue_registry import async_get as async_get_issue_registry
from homeassistant.setup import async_setup_component, setup_component
from homeassistant.util import dt as dt_util
from homeassistant.util.json import json_loads
Expand Down Expand Up @@ -1832,6 +1833,14 @@ def _get_db_events():
assert "Database queue backlog reached more than" in caplog.text
assert not instance.unlock_database()

registry = async_get_issue_registry(hass)
issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources")
assert issue is not None
assert "start_time" in issue.translation_placeholders
start_time = issue.translation_placeholders["start_time"]
assert start_time is not None
assert dt_util.parse_datetime(start_time) is not None


async def test_database_lock_and_overflow_checks_available_memory(
async_setup_recorder_instance: RecorderInstanceGenerator,
Expand Down Expand Up @@ -1910,6 +1919,14 @@ def _wait_database_unlocked():
db_events = await instance.async_add_executor_job(_get_db_events)
assert len(db_events) >= 2

registry = async_get_issue_registry(hass)
issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources")
assert issue is not None
assert "start_time" in issue.translation_placeholders
start_time = issue.translation_placeholders["start_time"]
assert start_time is not None
assert dt_util.parse_datetime(start_time) is not None


async def test_database_lock_timeout(
recorder_mock: Recorder, hass: HomeAssistant, recorder_db_url: str
Expand Down