From 09a51c7cdbf020567486b97d603f57b1cfc9f0e9 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 28 Jan 2024 09:44:43 -1000 Subject: [PATCH 1/5] Create an issue when backups fail because the system runs out of resources If the backup takes too long, the system can run out of resources to store the event queue. When this happens we logged a warning in the log which was easy to miss, and users sometimes found out their backup had failed at the worst possible time (when they needed to restore it.) --- homeassistant/components/recorder/core.py | 3 +++ homeassistant/components/recorder/strings.json | 4 ++++ homeassistant/components/recorder/util.py | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/homeassistant/components/recorder/core.py b/homeassistant/components/recorder/core.py index ad05cad3d54db..ca6d6e1329bc7 100644 --- a/homeassistant/components/recorder/core.py +++ b/homeassistant/components/recorder/core.py @@ -119,6 +119,7 @@ WaitTask, ) from .util import ( + async_create_backup_failure_issue, build_mysqldb_conv, dburl_to_path, end_incomplete_runs, @@ -1006,11 +1007,13 @@ def _lock_database(self, task: DatabaseLockTask) -> None: def _async_set_database_locked(task: DatabaseLockTask) -> None: task.database_locked.set() + start_time = dt_util.utcnow() with write_lock_db_sqlite(self): # Notify that lock is being held, wait until database can be used again. self.hass.add_job(_async_set_database_locked, task) while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT): if self._reached_max_backlog_percentage(90): + async_create_backup_failure_issue(self.hass, start_time) _LOGGER.warning( "Database queue backlog reached more than %s (%s events) of maximum queue " "length while waiting for backup to finish; recorder will now " diff --git a/homeassistant/components/recorder/strings.json b/homeassistant/components/recorder/strings.json index eb162628727bc..6e65544a727ac 100644 --- a/homeassistant/components/recorder/strings.json +++ b/homeassistant/components/recorder/strings.json @@ -12,6 +12,10 @@ "maria_db_range_index_regression": { "title": "Update MariaDB to {min_version} or later resolve a significant performance issue", "description": "Older versions of MariaDB suffer from a significant performance regression when retrieving history data or purging the database. Update to MariaDB version {min_version} or later and restart Home Assistant. If you are using the MariaDB core add-on, make sure to update it to the latest version." + }, + "backup_failed_out_of_resources": { + "title": "Database backup failed due to lack of resources", + "description": "The database backup stated at {start_time} failed due to lack of resources. This can happen if the database is too large or if the system is under heavy load. Consider upgrading the system hardware or reducing the size of the database by decreasing the number of history days to keep or creating a filter." } }, "services": { diff --git a/homeassistant/components/recorder/util.py b/homeassistant/components/recorder/util.py index 4a1bf940b244a..26f74b773ccd8 100644 --- a/homeassistant/components/recorder/util.py +++ b/homeassistant/components/recorder/util.py @@ -470,6 +470,24 @@ def _async_create_mariadb_range_index_regression_issue( ) +@callback +def async_create_backup_failure_issue( + hass: HomeAssistant, + local_start_time: datetime, +) -> None: + """Create an issue when the backup fails because we run out of resources.""" + ir.async_create_issue( + hass, + DOMAIN, + "backup_failed_out_of_resources", + is_fixable=False, + severity=ir.IssueSeverity.CRITICAL, + learn_more_url="https://www.home-assistant.io/integrations/recorder", + translation_key="backup_failed_out_of_resources", + translation_placeholders={"start_time": str(local_start_time)}, + ) + + def setup_connection_for_dialect( instance: Recorder, dialect_name: str, From 75db1bd124c758bf05d2fbbb6b44ba1f5ce68ef7 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 28 Jan 2024 10:31:42 -1000 Subject: [PATCH 2/5] fixup tests --- homeassistant/components/recorder/core.py | 9 ++++++--- tests/components/recorder/test_init.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/homeassistant/components/recorder/core.py b/homeassistant/components/recorder/core.py index ca6d6e1329bc7..07591c468b8b2 100644 --- a/homeassistant/components/recorder/core.py +++ b/homeassistant/components/recorder/core.py @@ -1007,13 +1007,13 @@ def _lock_database(self, task: DatabaseLockTask) -> None: def _async_set_database_locked(task: DatabaseLockTask) -> None: task.database_locked.set() - start_time = dt_util.utcnow() + local_start_time = dt_util.now() + hass = self.hass with write_lock_db_sqlite(self): # Notify that lock is being held, wait until database can be used again. - self.hass.add_job(_async_set_database_locked, task) + hass.add_job(_async_set_database_locked, task) while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT): if self._reached_max_backlog_percentage(90): - async_create_backup_failure_issue(self.hass, start_time) _LOGGER.warning( "Database queue backlog reached more than %s (%s events) of maximum queue " "length while waiting for backup to finish; recorder will now " @@ -1023,6 +1023,9 @@ def _async_set_database_locked(task: DatabaseLockTask) -> None: self.backlog, ) task.queue_overflow = True + hass.add_job( + async_create_backup_failure_issue, self.hass, local_start_time + ) break _LOGGER.info( "Database queue backlog reached %d entries during backup", diff --git a/tests/components/recorder/test_init.py b/tests/components/recorder/test_init.py index f8aa219fdb417..31b1e4f8fbb78 100644 --- a/tests/components/recorder/test_init.py +++ b/tests/components/recorder/test_init.py @@ -73,6 +73,7 @@ ) from homeassistant.core import Context, CoreState, Event, HomeAssistant, callback from homeassistant.helpers import entity_registry as er, recorder as recorder_helper +from homeassistant.helpers.issue_registry import async_get as async_get_issue_registry from homeassistant.setup import async_setup_component, setup_component from homeassistant.util import dt as dt_util from homeassistant.util.json import json_loads @@ -1832,6 +1833,14 @@ def _get_db_events(): assert "Database queue backlog reached more than" in caplog.text assert not instance.unlock_database() + registry = async_get_issue_registry(hass) + issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources") + assert issue is not None + assert "start_time" in issue.translation_placeholders + start_time = issue.translation_placeholders["start_time"] + assert start_time is not None + assert dt_util.parse_datetime(start_time) is not None + async def test_database_lock_and_overflow_checks_available_memory( async_setup_recorder_instance: RecorderInstanceGenerator, @@ -1910,6 +1919,14 @@ def _wait_database_unlocked(): db_events = await instance.async_add_executor_job(_get_db_events) assert len(db_events) >= 2 + registry = async_get_issue_registry(hass) + issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources") + assert issue is not None + assert "start_time" in issue.translation_placeholders + start_time = issue.translation_placeholders["start_time"] + assert start_time is not None + assert dt_util.parse_datetime(start_time) is not None + async def test_database_lock_timeout( recorder_mock: Recorder, hass: HomeAssistant, recorder_db_url: str From e5178c9579bfec4b632f11ae617f8c09b376bf3a Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 28 Jan 2024 10:32:28 -1000 Subject: [PATCH 3/5] fixup tests --- homeassistant/components/recorder/strings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homeassistant/components/recorder/strings.json b/homeassistant/components/recorder/strings.json index 6e65544a727ac..74b248354d7e5 100644 --- a/homeassistant/components/recorder/strings.json +++ b/homeassistant/components/recorder/strings.json @@ -15,7 +15,7 @@ }, "backup_failed_out_of_resources": { "title": "Database backup failed due to lack of resources", - "description": "The database backup stated at {start_time} failed due to lack of resources. This can happen if the database is too large or if the system is under heavy load. Consider upgrading the system hardware or reducing the size of the database by decreasing the number of history days to keep or creating a filter." + "description": "The database backup stated at {start_time} failed due to lack of resources. The backup cannot be trusted and must be restarted. This can happen if the database is too large or if the system is under heavy load. Consider upgrading the system hardware or reducing the size of the database by decreasing the number of history days to keep or creating a filter." } }, "services": { From 8c346b7a62c39f96b0a213a3c93ce5b2e5bfd8e3 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 30 Jan 2024 07:51:22 -1000 Subject: [PATCH 4/5] format %H:%M:%S --- homeassistant/components/recorder/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/homeassistant/components/recorder/util.py b/homeassistant/components/recorder/util.py index 26f74b773ccd8..f684160f86fa1 100644 --- a/homeassistant/components/recorder/util.py +++ b/homeassistant/components/recorder/util.py @@ -484,7 +484,7 @@ def async_create_backup_failure_issue( severity=ir.IssueSeverity.CRITICAL, learn_more_url="https://www.home-assistant.io/integrations/recorder", translation_key="backup_failed_out_of_resources", - translation_placeholders={"start_time": str(local_start_time)}, + translation_placeholders={"start_time": local_start_time.strftime("%H:%M:%S")}, ) From d89e4f3006fdd1838617c1dbe78a324ad3c64720 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 30 Jan 2024 07:56:31 -1000 Subject: [PATCH 5/5] format %H:%M:%S --- tests/components/recorder/test_init.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/components/recorder/test_init.py b/tests/components/recorder/test_init.py index 31b1e4f8fbb78..78af9a64257da 100644 --- a/tests/components/recorder/test_init.py +++ b/tests/components/recorder/test_init.py @@ -1839,7 +1839,8 @@ def _get_db_events(): assert "start_time" in issue.translation_placeholders start_time = issue.translation_placeholders["start_time"] assert start_time is not None - assert dt_util.parse_datetime(start_time) is not None + # Should be in H:M:S format + assert start_time.count(":") == 2 async def test_database_lock_and_overflow_checks_available_memory( @@ -1925,7 +1926,8 @@ def _wait_database_unlocked(): assert "start_time" in issue.translation_placeholders start_time = issue.translation_placeholders["start_time"] assert start_time is not None - assert dt_util.parse_datetime(start_time) is not None + # Should be in H:M:S format + assert start_time.count(":") == 2 async def test_database_lock_timeout(