Skip to content

Commit

Permalink
Create an issue when database backups fail because the system runs ou…
Browse files Browse the repository at this point in the history
…t of resources (#109020)
  • Loading branch information
bdraco committed Jan 30, 2024
1 parent 6174aa4 commit a222447
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 1 deletion.
8 changes: 7 additions & 1 deletion homeassistant/components/recorder/core.py
Expand Up @@ -119,6 +119,7 @@
WaitTask,
)
from .util import (
async_create_backup_failure_issue,
build_mysqldb_conv,
dburl_to_path,
end_incomplete_runs,
Expand Down Expand Up @@ -1006,9 +1007,11 @@ def _lock_database(self, task: DatabaseLockTask) -> None:
def _async_set_database_locked(task: DatabaseLockTask) -> None:
task.database_locked.set()

local_start_time = dt_util.now()
hass = self.hass
with write_lock_db_sqlite(self):
# Notify that lock is being held, wait until database can be used again.
self.hass.add_job(_async_set_database_locked, task)
hass.add_job(_async_set_database_locked, task)
while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
if self._reached_max_backlog_percentage(90):
_LOGGER.warning(
Expand All @@ -1020,6 +1023,9 @@ def _async_set_database_locked(task: DatabaseLockTask) -> None:
self.backlog,
)
task.queue_overflow = True
hass.add_job(
async_create_backup_failure_issue, self.hass, local_start_time
)
break
_LOGGER.info(
"Database queue backlog reached %d entries during backup",
Expand Down
4 changes: 4 additions & 0 deletions homeassistant/components/recorder/strings.json
Expand Up @@ -12,6 +12,10 @@
"maria_db_range_index_regression": {
"title": "Update MariaDB to {min_version} or later to resolve a significant performance issue",
"description": "Older versions of MariaDB suffer from a significant performance regression when retrieving history data or purging the database. Update to MariaDB version {min_version} or later and restart Home Assistant. If you are using the MariaDB core add-on, make sure to update it to the latest version."
},
"backup_failed_out_of_resources": {
"title": "Database backup failed due to lack of resources",
"description": "The database backup started at {start_time} failed due to lack of resources. The backup cannot be trusted and must be restarted. This can happen if the database is too large or if the system is under heavy load. Consider upgrading the system hardware or reducing the size of the database by decreasing the number of history days to keep or creating a filter."
}
},
"services": {
Expand Down
18 changes: 18 additions & 0 deletions homeassistant/components/recorder/util.py
Expand Up @@ -470,6 +470,24 @@ def _async_create_mariadb_range_index_regression_issue(
)


@callback
def async_create_backup_failure_issue(
    hass: HomeAssistant,
    local_start_time: datetime,
) -> None:
    """Raise a critical issue for a backup that failed from lack of resources.

    The issue is not fixable from the UI. ``local_start_time`` is rendered as
    ``HH:MM:SS`` and handed to the translation as the ``start_time``
    placeholder.
    """
    # The issue id and the translation key are intentionally the same string.
    issue_key = "backup_failed_out_of_resources"
    formatted_start = local_start_time.strftime("%H:%M:%S")
    ir.async_create_issue(
        hass,
        DOMAIN,
        issue_key,
        is_fixable=False,
        severity=ir.IssueSeverity.CRITICAL,
        learn_more_url="https://www.home-assistant.io/integrations/recorder",
        translation_key=issue_key,
        translation_placeholders={"start_time": formatted_start},
    )


def setup_connection_for_dialect(
instance: Recorder,
dialect_name: str,
Expand Down
19 changes: 19 additions & 0 deletions tests/components/recorder/test_init.py
Expand Up @@ -73,6 +73,7 @@
)
from homeassistant.core import Context, CoreState, Event, HomeAssistant, callback
from homeassistant.helpers import entity_registry as er, recorder as recorder_helper
from homeassistant.helpers.issue_registry import async_get as async_get_issue_registry
from homeassistant.setup import async_setup_component, setup_component
from homeassistant.util import dt as dt_util
from homeassistant.util.json import json_loads
Expand Down Expand Up @@ -1832,6 +1833,15 @@ def _get_db_events():
assert "Database queue backlog reached more than" in caplog.text
assert not instance.unlock_database()

registry = async_get_issue_registry(hass)
issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources")
assert issue is not None
assert "start_time" in issue.translation_placeholders
start_time = issue.translation_placeholders["start_time"]
assert start_time is not None
# Should be in H:M:S format
assert start_time.count(":") == 2


async def test_database_lock_and_overflow_checks_available_memory(
async_setup_recorder_instance: RecorderInstanceGenerator,
Expand Down Expand Up @@ -1910,6 +1920,15 @@ def _wait_database_unlocked():
db_events = await instance.async_add_executor_job(_get_db_events)
assert len(db_events) >= 2

registry = async_get_issue_registry(hass)
issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources")
assert issue is not None
assert "start_time" in issue.translation_placeholders
start_time = issue.translation_placeholders["start_time"]
assert start_time is not None
# Should be in H:M:S format
assert start_time.count(":") == 2


async def test_database_lock_timeout(
recorder_mock: Recorder, hass: HomeAssistant, recorder_db_url: str
Expand Down

0 comments on commit a222447

Please sign in to comment.