Skip to content

Commit

Permalink
Home Assistant watchdog attempts safe mode after max fails (#5124)
Browse files Browse the repository at this point in the history
* Home Assistant watchdog attempts safe mode after max fails

* Remove duplicate line

* Refactor and logging change from feedback

* Update supervisor/misc/tasks.py

* Fix log text check in test

---------

Co-authored-by: Stefan Agner <stefan@agner.ch>
  • Loading branch information
mdegat01 and agners committed Jun 20, 2024
1 parent 918fcb7 commit 1bb814b
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 25 deletions.
47 changes: 29 additions & 18 deletions supervisor/misc/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,17 +174,6 @@ async def _watchdog_homeassistant_api(self):
self._cache[HASS_WATCHDOG_API_FAILURES] = 0
return

# Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
_LOGGER.critical(
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.",
reanimate_fails,
)
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
return

# Init cache data
api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0)

Expand All @@ -195,16 +184,38 @@ async def _watchdog_homeassistant_api(self):
_LOGGER.warning("Watchdog missed an Home Assistant Core API response.")
return

_LOGGER.error(
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!",
HASS_WATCHDOG_MAX_API_ATTEMPTS,
)
# After 5 reanimation attempts switch to safe mode. If that fails, give up
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
if reanimate_fails > HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
return

if safe_mode := reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
_LOGGER.critical(
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts. Restarting into safe mode",
reanimate_fails,
)
else:
_LOGGER.error(
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core!",
HASS_WATCHDOG_MAX_API_ATTEMPTS,
)

try:
await self.sys_homeassistant.core.restart()
if safe_mode:
await self.sys_homeassistant.core.rebuild(safe_mode=True)
else:
await self.sys_homeassistant.core.restart()
except HomeAssistantError as err:
_LOGGER.error("Home Assistant watchdog reanimation failed!")
if reanimate_fails == 0:
if reanimate_fails == 0 or safe_mode:
capture_exception(err)

if safe_mode:
_LOGGER.critical(
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
)
else:
_LOGGER.error("Home Assistant watchdog reanimation failed!")

self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
else:
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
Expand Down
31 changes: 24 additions & 7 deletions tests/misc/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ async def test_watchdog_homeassistant_api(
restart.assert_called_once()
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
assert (
"Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core API!"
"Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core!"
in caplog.text
)

Expand Down Expand Up @@ -109,31 +109,48 @@ async def test_watchdog_homeassistant_api_reanimation_limit(
HomeAssistantAPI, "check_api_state", return_value=False
), patch.object(
HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError())
) as restart:
) as restart, patch.object(
HomeAssistantCore, "rebuild", side_effect=err
) as rebuild:
for _ in range(5):
await tasks._watchdog_homeassistant_api()
restart.assert_not_called()

await tasks._watchdog_homeassistant_api()
restart.assert_called_once()
restart.assert_called_once_with()
assert "Home Assistant watchdog reanimation failed!" in caplog.text

rebuild.assert_not_called()
restart.reset_mock()

capture_exception.assert_called_once_with(err)

# Next time it should try safe mode
caplog.clear()
await tasks._watchdog_homeassistant_api()
rebuild.assert_not_called()

await tasks._watchdog_homeassistant_api()

rebuild.assert_called_once_with(safe_mode=True)
restart.assert_not_called()
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
assert "Watchdog found a problem with Home Assistant API!" not in caplog.text
assert (
"Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts."
"Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts. Restarting into safe mode"
in caplog.text
)
assert (
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
in caplog.text
)

# After safe mode has failed too, no more restart attempts
rebuild.reset_mock()
caplog.clear()
await tasks._watchdog_homeassistant_api()
assert "Watchdog missed an Home Assistant Core API response." in caplog.text

caplog.clear()
await tasks._watchdog_homeassistant_api()
restart.assert_not_called()
assert not caplog.text
restart.assert_not_called()
rebuild.assert_not_called()

0 comments on commit 1bb814b

Please sign in to comment.