Always schedule next_check within check_interval

Since naemon#259, the next_check schedule is kept over restarts when use_retained_scheduling_info is enabled. However, with that change, lowering an object's check_interval could leave its next check more than one check_interval away after the restart. This commit ensures that if the retained next_check is more than one check_interval in the future, the next check is scheduled randomly instead of using the retention data. This fixes MON-11295 (https://jira.op5.com/browse/MON-11295). Signed-off-by: Jacob Hansen <jhansen@op5.com>
jacobbaungard · Oct 10, 2018 · de8f21b · de8f21b
1 parent 859b20d
commit de8f21b
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 4 deletions.
diff --git a/src/naemon/checks_host.c b/src/naemon/checks_host.c
@@ -65,10 +65,13 @@ void checks_init_hosts(void)
  		 * If use_retained_scheduling_info is enabled, we use the previously set
  		 * next_check. If the check was missed, schedule it within the next
  		 * interval length. If more than one check was missed, we schedule the check
- 		 * randomly instead.
+ 		 * randomly instead. If the next_check is more than one check_interval in
+ 		 * the future, we also schedule the next check randomly. This indicates
+ 		 * that the check_interval has been lowered over restarts.
  		 */
 		if (use_retained_scheduling_info == TRUE &&
-		    temp_host->next_check > current_time-get_host_check_interval_s(temp_host)) {
+		    temp_host->next_check > current_time-get_host_check_interval_s(temp_host) &&
+				temp_host->next_check <= current_time+get_host_check_interval_s(temp_host)) {
 			if (temp_host->next_check < current_time) {
 				delay = ranged_urand(0, interval_length);
 			} else {

diff --git a/src/naemon/checks_service.c b/src/naemon/checks_service.c
@@ -63,10 +63,13 @@ void checks_init_services(void)
  		 * If use_retained_scheduling_info is enabled, we use the previously set
  		 * next_check. If the check was missed, schedule it within the next
  		 * interval length. If more than one check was missed, we schedule the check
- 		 * randomly instead.
+ 		 * randomly instead. If the next_check is more than one check_interval in
+ 		 * the future, we also schedule the next check randomly. This indicates
+ 		 * that the check_interval has been lowered over restarts.
  		 */
 		if (use_retained_scheduling_info == TRUE &&
-		    temp_service->next_check > current_time-get_service_check_interval_s(temp_service)) {
+		    temp_service->next_check > current_time-get_service_check_interval_s(temp_service) &&
+				temp_service->next_check <= current_time+get_service_check_interval_s(temp_service)) {
 			if (temp_service->next_check < current_time) {
 				delay = ranged_urand(0, interval_length);
 			} else {

diff --git a/tests/test-check-scheduling.c b/tests/test-check-scheduling.c
@@ -745,6 +745,32 @@ START_TEST(host_retain_disabled_next_check)
 END_TEST
 
 
+/* If use_retained_scheduling_info is enabled but the next_check in the
+ * retention data is more than one check_interval away, then we should
+ * schedule the check randomly within one check_interval.
+ */
+START_TEST(host_retain_always_within_check_interval)
+{
+	time_t current_time = time(NULL);
+	time_t expected_max_next_check;
+	use_retained_scheduling_info=TRUE;
+
+	hst->retry_interval = 1.0;
+	hst->check_interval = 15.0; /* interval in effect when next_check is computed below */
+	hst->current_state = STATE_UP;
+	hst->state_type = HARD_STATE;
+	hst->next_check = current_time+get_host_check_interval_s(hst); /* retained value: one old interval from now */
+	hst->check_interval = 5.0; /* lower the interval, as a config change before the restart would */
+	expected_max_next_check = current_time+get_host_check_interval_s(hst); /* bound: one new interval from now */
+
+	/* Simulates a restart: re-run host check initialization with the retained next_check */
+	checks_init_hosts();
+	ck_assert(hst->next_check >= current_time); /* must not be scheduled before the test started */
+	ck_assert(hst->next_check <= expected_max_next_check); /* must fall within one lowered interval */
+}
+END_TEST
+
+
 /* If use_retained_scheduling_info is enabled the next_check time should be
  * retained over restarts
  */
@@ -844,6 +870,34 @@ START_TEST(service_retain_disabled_next_check)
 END_TEST
 
 
+/* If use_retained_scheduling_info is enabled but the next_check in the
+ * retention data is more than one check_interval away, then we should
+ * schedule the check randomly within one check_interval.
+ */
+START_TEST(service_retain_always_within_check_interval)
+{
+	time_t current_time = time(NULL);
+	time_t expected_max_next_check;
+	use_retained_scheduling_info=TRUE;
+
+	svc->retry_interval = 1.0;
+	svc->check_interval = 15.0; /* interval in effect when next_check is computed below */
+	svc->current_state = STATE_UP;
+	svc->state_type = HARD_STATE;
+	svc->next_check = current_time+get_service_check_interval_s(svc); /* retained value: one old interval from now */
+	printf("service check interval seconds: %lld\n", (long long)get_service_check_interval_s(svc)); /* NOTE(review): debug output, not used by the assertions */
+	svc->check_interval = 5; /* lower the interval, as a config change before the restart would */
+	expected_max_next_check = current_time+get_service_check_interval_s(svc); /* bound: one new interval from now */
+	printf("service check interval seconds: %lld\n", (long long)get_service_check_interval_s(svc)); /* NOTE(review): debug output, not used by the assertions */
+
+	/* Simulates a restart: re-run service check initialization with the retained next_check */
+	checks_init_services();
+	ck_assert(svc->next_check >= current_time); /* must not be scheduled before the test started */
+	ck_assert(svc->next_check <= expected_max_next_check); /* must fall within one lowered interval */
+}
+END_TEST
+
+
 Suite*
 check_scheduling_suite(void)
 {
@@ -880,10 +934,12 @@ check_scheduling_suite(void)
 	tcase_add_test(tc_retain, host_retain_missed_check);
 	tcase_add_test(tc_retain, host_retain_missed_multiple_checks);
 	tcase_add_test(tc_retain, host_retain_disabled_next_check);
+	tcase_add_test(tc_retain, host_retain_always_within_check_interval);
 	tcase_add_test(tc_retain, service_retain_next_check);
 	tcase_add_test(tc_retain, service_retain_missed_check);
 	tcase_add_test(tc_retain, service_retain_missed_multiple_checks);
 	tcase_add_test(tc_retain, service_retain_disabled_next_check);
+	tcase_add_test(tc_retain, service_retain_always_within_check_interval);
 	suite_add_tcase(s, tc_retain);
 
 	tcase_add_checked_fixture(tc_miscellaneous, setup, teardown);