Skip to content

Commit

Permalink
Bound successive non-fatal failures per target
Browse files Browse the repository at this point in the history
  • Loading branch information
smondet committed Jun 11, 2015
1 parent e99f18b commit 8f3b881
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 18 deletions.
17 changes: 13 additions & 4 deletions src/lib/ketrew_configuration.ml
Expand Up @@ -30,6 +30,7 @@ type engine = {
database_parameters: string;
turn_unix_ssh_failure_into_target_failure: bool [@default false];
host_timeout_upper_bound: float option [@default None];
maximum_successive_attempts: int [@default 10];
} [@@deriving yojson]
type explorer_defaults = {
request_targets_ids: [ `All | `Younger_than of [ `Days of float ]];
Expand Down Expand Up @@ -109,13 +110,17 @@ let log t =
item "Targets-to-prefectch" (i targets_to_prefetch);
]);
] in
let engine t =
let engine { database_parameters; turn_unix_ssh_failure_into_target_failure;
host_timeout_upper_bound; maximum_successive_attempts } =
sublist [
item "Database" (quote t.database_parameters);
item "Database" (quote database_parameters);
item "Unix-failure"
((if t.turn_unix_ssh_failure_into_target_failure
((if turn_unix_ssh_failure_into_target_failure
then s "turns"
else s "does not turn") % s " into target failure");
item "Host-timeout-upper-bound"
(option f host_timeout_upper_bound);
item "Maximum-successive-attempts" (i maximum_successive_attempts);
] in
let authorized_tokens = function
| `Path path -> s "Path: " % quote path
Expand Down Expand Up @@ -187,10 +192,13 @@ let default_ui = ui ()
let engine
?(database_parameters=default_database_path)
?(turn_unix_ssh_failure_into_target_failure=false)
?host_timeout_upper_bound () = {
?host_timeout_upper_bound
?(maximum_successive_attempts=10)
() = {
database_parameters;
turn_unix_ssh_failure_into_target_failure;
host_timeout_upper_bound;
maximum_successive_attempts;
}
let default_engine = engine ()

Expand Down Expand Up @@ -228,6 +236,7 @@ let daemon s = s.daemon
(* Field accessors: each function projects the like-named record field out
   of the configuration value it receives. *)
let log_path { log_path; _ } = log_path
let database_parameters { database_parameters; _ } = database_parameters
let is_unix_ssh_failure_fatal
    { turn_unix_ssh_failure_into_target_failure; _ } =
  turn_unix_ssh_failure_into_target_failure
let maximum_successive_attempts { maximum_successive_attempts; _ } =
  maximum_successive_attempts
let mode { mode; _ } = mode
let standalone_engine { standalone_engine; _ } = standalone_engine
let server_engine { server_engine; _ } = server_engine
Expand Down
6 changes: 6 additions & 0 deletions src/lib/ketrew_configuration.mli
Expand Up @@ -90,6 +90,7 @@ val engine:
?database_parameters:string ->
?turn_unix_ssh_failure_into_target_failure: bool ->
?host_timeout_upper_bound: float ->
?maximum_successive_attempts: int ->
unit -> engine
(** Build an [engine] configuration:
Expand All @@ -103,6 +104,8 @@ val engine:
behavior set the option to [true].
- [host_timeout_upper_bound]: every connection/command timeout
will be “≤ upper-bound” (in seconds, default is [60.]).
- [maximum_successive_attempts]: number of successive non-fatal
failures allowed before declaring a target dead (default is [10]).
*)

type authorized_tokens
Expand Down Expand Up @@ -225,6 +228,9 @@ val database_parameters: engine -> string
val is_unix_ssh_failure_fatal: engine -> bool
(** Should we kill targets on ssh/unix errors. *)

val maximum_successive_attempts: engine -> int
(** Get the maximum number of successive non-fatal failures. *)

val plugins: t -> plugin list
(** Get the configured list of plugins. *)

Expand Down
47 changes: 33 additions & 14 deletions src/lib/ketrew_engine.ml
Expand Up @@ -472,17 +472,21 @@ let add_targets = Adding_targets.store_targets_to_add

module Run_automaton = struct

let _long_running_action_error t ~error ~bookkeeping =
let _long_running_action_error t ~error ~bookkeeping ~previous_attempts =
let should_kill = Configuration.is_unix_ssh_failure_fatal t.configuration in
match error, should_kill with
| `Recoverable str, true
| `Fatal str, _ ->
| `Fatal str, _ -> `Fatal, str, bookkeeping
| `Recoverable str, false when
previous_attempts >=
Ketrew_configuration.maximum_successive_attempts t.configuration ->
`Fatal, str, bookkeeping
| `Recoverable str, false ->
`Try_again, str, bookkeeping
| `Recoverable str, false -> `Try_again, str, bookkeeping

let _start_running_target t bookkeeping =
let _start_running_target t ~target ~bookkeeping =
let {Target.Automaton. plugin_name; run_parameters} = bookkeeping in
let previous_attempts =
Target.(state target |> State.Count.consecutive_recent_attempts) in
begin match Ketrew_plugin.find_plugin plugin_name with
| Some m ->
let module Long_running = (val m : LONG_RUNNING) in
Expand All @@ -493,7 +497,7 @@ module Run_automaton = struct
fail (_long_running_action_error t
~error:(`Fatal (fmt "Deserialize-long-running: %s"
(Printexc.to_string e)))
~bookkeeping)
~bookkeeping ~previous_attempts)
end
>>= fun run_parameters ->
Long_running.start run_parameters
Expand All @@ -502,11 +506,12 @@ module Run_automaton = struct
let run_parameters = Long_running.serialize rp in
return { Target.Automaton. plugin_name; run_parameters}
| `Error e ->
fail (_long_running_action_error t ~error:e ~bookkeeping)
fail (_long_running_action_error t ~error:e ~bookkeeping
~previous_attempts)
end
| None ->
let error = `Recoverable (fmt "Missing plugin %S" plugin_name) in
fail (_long_running_action_error t ~error ~bookkeeping)
fail (_long_running_action_error t ~error ~bookkeeping ~previous_attempts)
end

let _check_and_activate_dependencies t ~dependency_of ~ids =
Expand Down Expand Up @@ -563,6 +568,8 @@ module Run_automaton = struct

let _attempt_to_kill t ~target ~bookkeeping =
let {Target.Automaton. plugin_name; run_parameters} = bookkeeping in
let previous_attempts =
Target.(state target |> State.Count.consecutive_recent_attempts) in
begin match Ketrew_plugin.find_plugin plugin_name with
| Some m ->
let module Long_running = (val m : LONG_RUNNING) in
Expand All @@ -573,11 +580,13 @@ module Run_automaton = struct
let run_parameters = Long_running.serialize rp in
return { Target.Automaton. plugin_name; run_parameters}
| `Error e ->
fail (_long_running_action_error t ~error:e ~bookkeeping)
fail (_long_running_action_error t
~error:e ~bookkeeping ~previous_attempts)
end
| None ->
let error = `Recoverable (fmt "Missing plugin %S" plugin_name) in
fail (_long_running_action_error t ~error ~bookkeeping)
fail (_long_running_action_error t
~error ~bookkeeping ~previous_attempts)
end

let _check_process t ~target ~bookkeeping =
Expand Down Expand Up @@ -605,11 +614,16 @@ module Run_automaton = struct
{ bookkeeping with
Target.Automaton.run_parameters = run_parameters })
| `Error e ->
fail (_long_running_action_error t ~error:e ~bookkeeping)
let previous_attempts =
Target.(state target |> State.Count.consecutive_recent_attempts) in
fail (_long_running_action_error t
~error:e ~bookkeeping ~previous_attempts)
end
| None ->
let error = `Recoverable (fmt "Missing plugin %S" plugin_name) in
fail (_long_running_action_error t ~error ~bookkeeping)
let previous_attempts =
Target.(state target |> State.Count.consecutive_recent_attempts) in
fail (_long_running_action_error t ~error ~bookkeeping ~previous_attempts)
end

let _process_automaton_transition t target =
Expand All @@ -631,7 +645,7 @@ module Run_automaton = struct
~dependency_of:(Target.id target) ~ids
>>| (make_new_target ~log)
| `Start_running (bookkeeping, make_new_target) ->
_start_running_target t bookkeeping
_start_running_target t ~target ~bookkeeping
>>< fun starting_attemp ->
return (make_new_target ~log:("Attempt to start") starting_attemp)
| `Eval_condition (condition, make_new_target) ->
Expand All @@ -641,11 +655,16 @@ module Run_automaton = struct
| `Ok answer ->
return (make_new_target ?log:None (`Ok answer))
| `Error e ->
let attempts =
Target.(state target |> State.Count.consecutive_recent_attempts) in
let log = Ketrew_error.to_string e in
let severity =
match e with
| `Volume _ -> `Fatal
| `Host _ -> `Try_again
| `Host _ ->
if attempts >=
Ketrew_configuration.maximum_successive_attempts t.configuration
then `Fatal else `Try_again
in
return (make_new_target ?log:None (`Error (severity, log)))
end
Expand Down
44 changes: 44 additions & 0 deletions src/pure/ketrew_target.ml
Expand Up @@ -556,6 +556,50 @@ that the potential condition has been ensured.

end

(** Counters over the chain of [previous_state] links of a state. *)
module Count = struct

  (** Each function here counts how many states of one given
      [`Tried_to_...] kind sit consecutively at the head of the history. *)
  module Latest = struct

    (** [make_counter ~continue t] walks back from [t] for as long as
        [continue] answers [Some previous_state], and returns the number of
        steps taken ([0] when [continue t] is [None]). *)
    let make_counter ~continue t =
      let rec count v (t: t) =
        match continue t with
        | Some previous_state ->
          count (v + 1) (previous_state :> t)
        | None -> v
      in
      count 0 t

    (** Number of leading [`Tried_to_eval_condition] states. *)
    let tried_to_eval_condition (t: t) =
      make_counter ~continue:(function
        | `Tried_to_eval_condition { previous_state; _ } ->
          Some previous_state
        | _ -> None) t

    (** Number of leading [`Tried_to_reeval_condition] states. *)
    let tried_to_reeval_condition (t: t) =
      make_counter ~continue:(function
        | `Tried_to_reeval_condition (_, { previous_state; _ }) ->
          Some previous_state
        | _ -> None) t

    (** Number of leading [`Tried_to_kill] states. *)
    let tried_to_kill (t: t) =
      make_counter ~continue:(function
        | `Tried_to_kill { previous_state; _ } -> Some previous_state
        | _ -> None) t

    (** Number of leading [`Tried_to_start] states. *)
    let tried_to_start (t: t) =
      make_counter ~continue:(function
        | `Tried_to_start ({ previous_state; _ }, _) -> Some previous_state
        | _ -> None) t
  end

  (** Largest of the four [Latest] counters for [t]: a state can only
      match one [`Tried_to_...] constructor, so at most one counter is
      non-zero and the maximum is the length of the current streak. *)
  let consecutive_recent_attempts t =
    let open Latest in
    let starts = tried_to_start t in
    let kills = tried_to_kill t in
    let evals = tried_to_eval_condition t in
    let reevals = tried_to_reeval_condition t in
    max (max (max starts kills) evals) reevals
end

end


Expand Down
9 changes: 9 additions & 0 deletions src/pure/ketrew_target.mli
Expand Up @@ -172,6 +172,15 @@ module State : sig
val killable: t -> bool
val finished_because_dependencies_died: t -> bool
end

(** Functions of type [t -> int] computing counts over a state's history. *)
module Count : sig
val consecutive_recent_attempts: t -> int
(**
Count how many times the current non-fatal failure state
“repeats,” i.e. how many consecutive [`Tried_to_...] states of the
same kind form the most recent history of the target. *)
end
end

type t
Expand Down

0 comments on commit 8f3b881

Please sign in to comment.