Fixing potential race condition on drain shutdown

Drains are sometimes shut down through the API by sending themselves a 'shutdown' message. However, in logplex_drain:stop/2, if the drain process crashes before or after the monitor is set in the call, and before the 'shutdown' message makes it there, the receive will work on '{'DOWN', Ref, process, DrainPid, _}' but the supervisor will restart it because it's transient. During that failure, we think we killed the pid, but we didn't (the crash did), so it restarts and stands alone while we're content at the fact it died once. This patch uses the {shutdown, call} tuple to mean the drain was killed using a gen_server:call demand, which will be reported in the monitor as a reason. This lets us switch based on the reason to make sure the drain was properly shut down.
heroku · Mar 27, 2013 · 263407d · 263407d
1 parent 95cf78f
commit 263407d
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 4 deletions.
diff --git a/src/logplex_drain.erl b/src/logplex_drain.erl
@@ -126,8 +126,10 @@ stop(DrainId, Timeout) ->
             Ref = erlang:monitor(process, DrainPid),
             DrainPid ! shutdown,
             receive
-                {'DOWN', Ref, process, DrainPid, _} ->
-                    ok
+                {'DOWN', Ref, process, DrainPid, {shutdown,call}} ->
+                    ok;
+                {'DOWN', Ref, process, DrainPid, _} -> % bad reason!
+                    supervisor:terminate_child(logplex_drain_sup, DrainId)
             after Timeout ->
                     erlang:demonitor(Ref, [flush]),
                     supervisor:terminate_child(logplex_drain_sup, DrainId)

diff --git a/src/logplex_http_drain.erl b/src/logplex_http_drain.erl
@@ -184,7 +184,7 @@ handle_info({timeout, _Ref, ?RECONNECT_MSG}, StateName,
     {next_state, StateName, State};
 
 handle_info(shutdown, _StateName, State) ->
-    {stop, shutdown, State};
+    {stop, {shutdown,call}, State};
 
 handle_info({'EXIT', ClientPid, Reason}, StateName,
             State = #state{client = ClientPid}) ->

diff --git a/src/logplex_tcpsyslog_drain.erl b/src/logplex_tcpsyslog_drain.erl
@@ -265,7 +265,7 @@ handle_info(shutdown, StateName, State = #state{sock = Sock})
           log_info(State, [StateName, shutdown, Sock, duration(State)])),
     {stop, shutdown, State#state{sock = undefined}};
 handle_info(shutdown, _StateName, State) ->
-    {stop, shutdown, State};
+    {stop, {shutdown,call}, State};
 handle_info(Info, StateName, State) ->
     ?MODULE:StateName(Info, State).