Skip to content

Commit

Permalink
Mailer reliability improvements (#2262)
Browse files Browse the repository at this point in the history
### Connect before sending mail, not at startup

Per #2250 when we connect to the remote SMTP server at start-up time by calling `mailer.Connect()` but do not actually call `mailer.SendMail()` until after we have done some potentially expensive/time-consuming work we are liable to have our connection closed due to timeout.

This PR moves the `Connect()` call in `expiration-mailer` and `notify-mailer` to be closer to where the actual messages are sent via `SendMail()` and resolves #2250 

### Handle SMTP 421 errors gracefully

Issue #2249 describes a case where we see this SMTP error code from the remote server when our connection has been idle for too long. This would manifest when connecting to the remote server at startup, running a very long database query, and then sending mail. This commit allows the mailer to treat SMTP 421 errors as an event that should produce a reconnect attempt and resolves #2249.

A unit test is added to the mailer tests to test that reconnection works when the server sends a SMTP 421 error. Prior to b64e51f and support for SMTP 421 reconnection this test failed in a manner matching issue #2249:

```
go test -p 1 -race --test.run TestReconnectSMTP421
github.com/letsencrypt/boulder/mail
Wrote goodbye msg: 421 1.2.3 green.eggs.and.spam Error: timeout exceeded
Cutting off client early
--- FAIL: TestReconnectSMTP421 (0.00s)
  mailer_test.go:257: Expected SendMail() to not fail. Got err: 421
  1.2.3 green.eggs.and.spam Error: timeout exceeded
  FAIL
  FAIL  github.com/letsencrypt/boulder/mail     0.023s
```

With b64e51f the test passes and the client gracefully reconnects.

The existing reconnect testing logic in the `mail-test-srv` integration tests is changed such that half of the forced disconnects are a normal clean connection close and half are a SMTP 421. This allows the existing integration test for server disconnects to be reused to test the 421 reconnect logic.
  • Loading branch information
cpu committed Oct 20, 2016
1 parent 5f1919b commit 8efc634
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 29 deletions.
20 changes: 12 additions & 8 deletions cmd/expiration-mailer/main.go
Expand Up @@ -24,7 +24,7 @@ import (
"github.com/letsencrypt/boulder/cmd"
"github.com/letsencrypt/boulder/core"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/mail"
bmail "github.com/letsencrypt/boulder/mail"
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/rpc"
"github.com/letsencrypt/boulder/sa"
Expand All @@ -47,7 +47,7 @@ type mailer struct {
log blog.Logger
dbMap *gorp.DbMap
rs regStore
mailer mail.Mailer
mailer bmail.Mailer
emailTemplate *template.Template
subject string
nagTimes []time.Duration
Expand Down Expand Up @@ -155,6 +155,15 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
regIDToCerts[cert.RegistrationID] = cs
}

err := m.mailer.Connect()
if err != nil {
m.log.AuditErr(fmt.Sprintf("Error connecting to send nag emails: %s", err))
return
}
defer func() {
_ = m.mailer.Close()
}()

for regID, certs := range regIDToCerts {
reg, err := m.rs.GetRegistration(ctx, regID)
if err != nil {
Expand Down Expand Up @@ -379,7 +388,7 @@ func main() {

smtpPassword, err := c.Mailer.PasswordConfig.Pass()
cmd.FailOnError(err, "Failed to load SMTP password")
mailClient := mail.New(
mailClient := bmail.New(
c.Mailer.Server,
c.Mailer.Port,
c.Mailer.Username,
Expand All @@ -389,11 +398,6 @@ func main() {
scope,
*reconnBase,
*reconnMax)
err = mailClient.Connect()
cmd.FailOnError(err, "Couldn't connect to mail server.")
defer func() {
_ = mailClient.Close()
}()

nagCheckInterval := defaultNagCheckInterval
if s := c.Mailer.NagCheckInterval; s != "" {
Expand Down
15 changes: 8 additions & 7 deletions cmd/notify-mailer/main.go
Expand Up @@ -99,6 +99,14 @@ func (m *mailer) run() error {
return err
}

err = m.mailer.Connect()
if err != nil {
return err
}
defer func() {
_ = m.mailer.Close()
}()

startTime := m.clk.Now()

for i, dest := range destinations {
Expand Down Expand Up @@ -345,13 +353,6 @@ func main() {
*reconnBase,
*reconnMax)
}
err = mailClient.Connect()
cmd.FailOnError(err, fmt.Sprintf("Connecting to %s:%s",
cfg.NotifyMailer.Server, cfg.NotifyMailer.Port))
defer func() {
err = mailClient.Close()
cmd.FailOnError(err, "Closing mail client")
}()

m := mailer{
clk: cmd.Clock(),
Expand Down
30 changes: 26 additions & 4 deletions mail/mailer.go
Expand Up @@ -13,6 +13,7 @@ import (
"net"
"net/mail"
"net/smtp"
"net/textproto"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -292,10 +293,31 @@ func (m *MailerImpl) SendMail(to []string, subject, msg string) error {
m.stats.Inc("SendMail.Reconnects", 1)
continue
} else if err != nil {
// If it wasn't an EOF error it is unexpected and we return from
// SendMail() with an error
m.stats.Inc("SendMail.Errors", 1)
return err
/*
* If the error is an instance of `textproto.Error` with a SMTP error code,
* and that error code is 421 then treat this as a reconnect-able event.
*
* The SMTP RFC defines this error code as:
* 421 <domain> Service not available, closing transmission channel
* (This may be a reply to any command if the service knows it
* must shut down)
*
* In practice we see this code being used by our production SMTP server
* when the connection has gone idle for too long. For more information
* see issue #2249[0].
*
* [0] - https://github.com/letsencrypt/boulder/issues/2249
*/
if protoErr, ok := err.(*textproto.Error); ok && protoErr.Code == 421 {
m.stats.Inc("SendMail.Errors.SMTP.421", 1)
m.reconnect()
m.stats.Inc("SendMail.Reconnects", 1)
} else {
// If it wasn't an EOF error or a SMTP 421 it is unexpected and we
// return from SendMail() with an error
m.stats.Inc("SendMail.Errors", 1)
return err
}
}
}

Expand Down
44 changes: 39 additions & 5 deletions mail/mailer_test.go
Expand Up @@ -121,10 +121,11 @@ func normalHandler(connID int, t *testing.T, conn net.Conn) {
// The disconnectHandler authenticates the client like the normalHandler but
// additionally processes an email flow (e.g. MAIL, RCPT and DATA commands).
// When the `connID` is <= `closeFirst` the connection is closed immediately
// after the MAIL command is received and prior to issuing a 250 response. In
// this way the first `closeFirst` connections will not complete normally and
// can be tested for reconnection logic.
func disconnectHandler(closeFirst int) connHandler {
// after the MAIL command is received and prior to issuing a 250 response. If
// a `goodbyeMsg` is provided, it is written to the client immediately before
// closing. In this way the first `closeFirst` connections will not complete
// normally and can be tested for reconnection logic.
func disconnectHandler(closeFirst int, goodbyeMsg string) connHandler {
return func(connID int, t *testing.T, conn net.Conn) {
defer func() {
err := conn.Close()
Expand All @@ -140,6 +141,13 @@ func disconnectHandler(closeFirst int) connHandler {
}

if connID <= closeFirst {
// If there was a `goodbyeMsg` specified, write it to the client before
// closing the connection. This is a good way to deliver a SMTP error
// before closing
if goodbyeMsg != "" {
_, _ = conn.Write([]byte(fmt.Sprintf("%s\r\n", goodbyeMsg)))
fmt.Printf("Wrote goodbye msg: %s\n", goodbyeMsg)
}
fmt.Printf("Cutting off client early\n")
return
}
Expand Down Expand Up @@ -215,7 +223,33 @@ func TestReconnectSuccess(t *testing.T) {

// Configure a test server that will disconnect the first `closedConns`
// connections after the MAIL cmd
go listenForever(l, t, disconnectHandler(closedConns))
go listenForever(l, t, disconnectHandler(closedConns, ""))

// With a mailer client that has a max attempt > `closedConns` we expect no
// error. The message should be delivered after `closedConns` reconnect
// attempts.
err := m.Connect()
if err != nil {
t.Errorf("Failed to connect: %s", err)
}
err = m.SendMail([]string{"hi@bye.com"}, "You are already a winner!", "Just kidding")
if err != nil {
t.Errorf("Expected SendMail() to not fail. Got err: %s", err)
}
}

func TestReconnectSMTP421(t *testing.T) {
m, l, cleanUp := setup(t)
defer cleanUp()
const closedConns = 5

// A SMTP 421 can be generated when the server times out an idle connection.
// For more information see https://github.com/letsencrypt/boulder/issues/2249
smtp421 := "421 1.2.3 green.eggs.and.spam Error: timeout exceeded"

// Configure a test server that will disconnect the first `closedConns`
// connections after the MAIL cmd with a SMTP 421 error
go listenForever(l, t, disconnectHandler(closedConns, smtp421))

// With a mailer client that has a max attempt > `closedConns` we expect no
// error. The message should be delivered after `closedConns` reconnect
Expand Down
24 changes: 19 additions & 5 deletions test/mail-test-srv/main.go
Expand Up @@ -101,11 +101,25 @@ func (srv *mailSrv) handleConn(conn net.Conn) {
case "MAIL":
srv.connNumberMutex.RLock()
if srv.connNumber <= srv.closeFirst {
log.Printf(
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
srv.connNumber, srv.closeFirst)
clearState()
conn.Close()
// Half of the time, close cleanly to simulate the server side closing
// unexpectedly.
if srv.connNumber%2 == 0 {
log.Printf(
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
srv.connNumber, srv.closeFirst)
clearState()
conn.Close()
} else {
// The rest of the time, simulate a stale connection timeout by sending
// a SMTP 421 message. This replicates the timeout/close from issue
// 2249 - https://github.com/letsencrypt/boulder/issues/2249
log.Printf(
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting with 421. Bye!\n",
srv.connNumber, srv.closeFirst)
clearState()
conn.Write([]byte("421 1.2.3 foo.bar.baz Error: timeout exceeded \r\n"))
conn.Close()
}
}
srv.connNumberMutex.RUnlock()
clearState()
Expand Down

0 comments on commit 8efc634

Please sign in to comment.