Skip to content

Commit 4265507

Browse files
committed
hostagent: gate Ready on raw-TCP SSH banner probe
The host agent fires its Ready event after essentialRequirements is satisfied. Today the first SSH-related check runs a no-op script via Lima's own SSH client, which has internal retry-on-failure behavior. That makes the check coincidentally correct rather than a direct statement about whether external clients can use the forwarded SSH port. There is a race during VM bring-up where the host agent starts accepting TCP connections on 127.0.0.1:<sshLocalPort> before guest sshd has bound :22 inside the guest. A fresh external connection in that window gets accepted, the proxy has no live peer, the host side closes, and the client's first write fails with EPIPE ("Broken pipe"). The race is consistently reproducible: probing the forwarded port with ssh-keyscan during `limactl start` shows 7-8s of EPIPE on restart (vz, macOS aarch64, Ubuntu 24.04 cloudimg) and ~11s on first boot. On well-provisioned hardware the subsequent requirements (user session, ControlMaster, final boot scripts) coincidentally wait long enough that the race has closed before Ready fires. With --plain mode, non-Linux guests (which return immediately after the ssh requirement), or slower hardware, the race can extend past `limactl start` returning, and downstream tools that invoke ssh-keyscan / sftp / rsync immediately after see EPIPE. This commit adds a new essential requirement, "sshLocalPort serves an SSH banner", at the head of the list. It opens a fresh raw TCP connection to 127.0.0.1:<sshLocalPort>, reads the SSH identification string per RFC4253 §4.2, and validates the prefix. No SSH client is involved, so there is no internal retry that could mask the race. waitForRequirements wraps the probe in its existing 3-second backoff retry loop, so a transient EPIPE during bring-up just causes another attempt rather than failing the start. To support host-side native checks alongside the existing script-based ones, the requirement struct gains a `check func(ctx) error` field; waitForRequirement dispatches between the two (mutually exclusive per requirement). context is now threaded through waitForRequirements so the retry loop honors cancellation. Pure-Go unit tests cover banner success, the SSH-1.99 legacy prefix, accept-then-close (the exact race shape), wrong banner, no listener, and a hung-write read-deadline case. Signed-off-by: Weishi Z <amwish.zeng@gmail.com>
1 parent a28905c commit 4265507

3 files changed

Lines changed: 201 additions & 7 deletions

File tree

pkg/hostagent/hostagent.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ func (a *HostAgent) startHostAgentRoutines(ctx context.Context) error {
557557
return nil
558558
})
559559
var errs []error
560-
if err := a.waitForRequirements("essential", a.essentialRequirements()); err != nil {
560+
if err := a.waitForRequirements(ctx, "essential", a.essentialRequirements()); err != nil {
561561
errs = append(errs, err)
562562
}
563563
if *a.instConfig.SSH.ForwardAgent {
@@ -629,7 +629,7 @@ sudo chown -R "${USER}" /run/host-services`
629629
}()
630630
}
631631
}
632-
if err := a.waitForRequirements("optional", a.optionalRequirements()); err != nil {
632+
if err := a.waitForRequirements(ctx, "optional", a.optionalRequirements()); err != nil {
633633
errs = append(errs, err)
634634
}
635635
if hasGuestAgentDaemon {
@@ -641,7 +641,7 @@ sudo chown -R "${USER}" /run/host-services`
641641
errs = append(errs, errors.New("guest agent does not seem to be running; port forwards will not work"))
642642
}
643643
}
644-
if err := a.waitForRequirements("final", a.finalRequirements()); err != nil {
644+
if err := a.waitForRequirements(ctx, "final", a.finalRequirements()); err != nil {
645645
errs = append(errs, err)
646646
}
647647
// Copy all config files _after_ the requirements are done

pkg/hostagent/requirements.go

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
package hostagent
55

66
import (
7+
"bufio"
8+
"context"
79
"errors"
810
"fmt"
11+
"net"
912
"runtime"
1013
"strings"
1114
"time"
@@ -17,7 +20,7 @@ import (
1720
"github.com/lima-vm/lima/v2/pkg/sshutil"
1821
)
1922

20-
func (a *HostAgent) waitForRequirements(label string, requirements []requirement) error {
23+
func (a *HostAgent) waitForRequirements(ctx context.Context, label string, requirements []requirement) error {
2124
const (
2225
retries = 200
2326
sleepDuration = 3 * time.Second
@@ -28,7 +31,7 @@ func (a *HostAgent) waitForRequirements(label string, requirements []requirement
2831
logrus.Infof("Waiting for the %s requirement %d of %d: %q", label, i+1, len(requirements), req.description)
2932
retryLoop:
3033
for j := range retries {
31-
err := a.waitForRequirement(req)
34+
err := a.waitForRequirement(ctx, req)
3235
if err == nil {
3336
logrus.Infof("The %s requirement %d of %d is satisfied", label, i+1, len(requirements))
3437
break retryLoop
@@ -42,7 +45,12 @@ func (a *HostAgent) waitForRequirements(label string, requirements []requirement
4245
errs = append(errs, fmt.Errorf("failed to satisfy the %s requirement %d of %d %q: %s: %w", label, i+1, len(requirements), req.description, req.debugHint, err))
4346
break retryLoop
4447
}
45-
time.Sleep(sleepDuration)
48+
select {
49+
case <-ctx.Done():
50+
errs = append(errs, fmt.Errorf("failed to satisfy the %s requirement %d of %d %q: %s: %w", label, i+1, len(requirements), req.description, req.debugHint, ctx.Err()))
51+
return errors.Join(errs...)
52+
case <-time.After(sleepDuration):
53+
}
4654
}
4755
}
4856
return errors.Join(errs...)
@@ -109,7 +117,11 @@ func (a *HostAgent) bashAvailable() bool {
109117
return *a.instConfig.OS != limatype.FREEBSD
110118
}
111119

112-
func (a *HostAgent) waitForRequirement(r requirement) error {
120+
func (a *HostAgent) waitForRequirement(ctx context.Context, r requirement) error {
121+
if r.check != nil {
122+
logrus.Debugf("checking requirement %q", r.description)
123+
return r.check(ctx)
124+
}
113125
logrus.Debugf("executing script %q", r.description)
114126
script := r.script
115127
if a.bashAvailable() {
@@ -147,16 +159,40 @@ func (a *HostAgent) waitForRequirement(r requirement) error {
147159
return nil
148160
}
149161

162+
// requirement describes a single readiness check performed by the host agent
163+
// before firing the Ready event. Exactly one of script and check must be set:
164+
// - script: a shell script executed inside the guest via ssh.ExecuteScript.
165+
// - check: a host-side Go probe that returns nil when the requirement is satisfied.
150166
type requirement struct {
151167
description string
152168
script string
169+
check func(ctx context.Context) error
153170
debugHint string
154171
fatal bool
155172
noMaster bool
156173
}
157174

158175
func (a *HostAgent) essentialRequirements() []requirement {
159176
req := make([]requirement, 0)
177+
// Probe the host-facing forwarded SSH port with a raw TCP connection before
178+
// running the script-based "ssh" requirement below. The script path uses
179+
// Lima's own SSH client, which has internal retry behavior that can mask a
180+
// race where the hostagent accepts a TCP connection on 127.0.0.1:sshLocalPort
181+
// before guest sshd has bound :22. External clients (ssh-keyscan, sftp,
182+
// rsync, ...) that connect during that window get EPIPE on the first write.
183+
// This native probe proves the public-facing path is end-to-end usable.
184+
port := a.sshLocalPort
185+
req = append(req,
186+
requirement{
187+
description: "sshLocalPort serves an SSH banner",
188+
check: func(ctx context.Context) error {
189+
return probeSSHBannerOnLocalPort(ctx, port)
190+
},
191+
debugHint: `The host agent is listening on 127.0.0.1:<sshLocalPort> but the
192+
proxied connection into the guest is failing. Most commonly this means
193+
guest sshd has not yet bound :22. Check the serial log for sshd startup.
194+
`,
195+
})
160196
req = append(req,
161197
requirement{
162198
description: "ssh",
@@ -292,3 +328,41 @@ Check "` + logLocation + `" to see where the process is blocked!
292328
})
293329
return req
294330
}
331+
332+
// probeSSHBannerTimeout bounds a single attempt of probeSSHBannerOnLocalPort.
333+
// waitForRequirements wraps the probe in its own retry loop (3s backoff), so
334+
// the per-attempt deadline only needs to be long enough to distinguish a
335+
// healthy SSH server from a hung TCP proxy.
336+
const probeSSHBannerTimeout = 5 * time.Second
337+
338+
// probeSSHBannerOnLocalPort confirms that the host agent's TCP forwarder on
339+
// 127.0.0.1:port is serving an SSH banner end-to-end. The hostagent starts
340+
// accepting connections on 127.0.0.1:sshLocalPort before guest sshd is
341+
// necessarily ready on :22, so a fresh external connect+read may fail with
342+
// EPIPE during the bring-up window. This probe verifies the public-facing
343+
// path is healthy without using Lima's own SSH client (which can mask the
344+
// race via internal retries).
345+
func probeSSHBannerOnLocalPort(ctx context.Context, port int) error {
346+
addr := fmt.Sprintf("127.0.0.1:%d", port)
347+
d := net.Dialer{Timeout: probeSSHBannerTimeout}
348+
conn, err := d.DialContext(ctx, "tcp", addr)
349+
if err != nil {
350+
return fmt.Errorf("dial %s: %w", addr, err)
351+
}
352+
defer conn.Close()
353+
if err := conn.SetReadDeadline(time.Now().Add(probeSSHBannerTimeout)); err != nil {
354+
return fmt.Errorf("set read deadline on %s: %w", addr, err)
355+
}
356+
// RFC4253 §4.2: an SSH server sends an identification string ending in CR
357+
// LF (or just LF historically) before the version exchange. If the proxied
358+
// connection inside the guest has no live peer, the host side will close
359+
// and ReadString returns EOF.
360+
banner, err := bufio.NewReader(conn).ReadString('\n')
361+
if err != nil {
362+
return fmt.Errorf("read SSH banner from %s: %w", addr, err)
363+
}
364+
if !strings.HasPrefix(banner, "SSH-2.0-") && !strings.HasPrefix(banner, "SSH-1.99-") {
365+
return fmt.Errorf("unexpected banner from %s: %q", addr, strings.TrimRight(banner, "\r\n"))
366+
}
367+
return nil
368+
}

pkg/hostagent/requirements_test.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// SPDX-FileCopyrightText: Copyright The Lima Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package hostagent
5+
6+
import (
7+
"context"
8+
"net"
9+
"strconv"
10+
"strings"
11+
"testing"
12+
"time"
13+
14+
"gotest.tools/v3/assert"
15+
)
16+
17+
// startBannerServer starts a TCP listener on 127.0.0.1:0 that emits the given
18+
// banner to each accepted connection and then closes it. If banner is empty,
19+
// the connection is closed immediately (simulating the hostagent-proxies-to-
20+
// dead-guest-sshd race).
21+
func startBannerServer(t *testing.T, banner string) int {
22+
t.Helper()
23+
var lc net.ListenConfig
24+
ln, err := lc.Listen(t.Context(), "tcp", "127.0.0.1:0")
25+
assert.NilError(t, err)
26+
t.Cleanup(func() { _ = ln.Close() })
27+
go func() {
28+
for {
29+
conn, err := ln.Accept()
30+
if err != nil {
31+
return
32+
}
33+
go func(c net.Conn) {
34+
defer c.Close()
35+
if banner != "" {
36+
_, _ = c.Write([]byte(banner))
37+
}
38+
}(conn)
39+
}
40+
}()
41+
_, portStr, err := net.SplitHostPort(ln.Addr().String())
42+
assert.NilError(t, err)
43+
port, err := strconv.Atoi(portStr)
44+
assert.NilError(t, err)
45+
return port
46+
}
47+
48+
func TestProbeSSHBannerOnLocalPort_OK(t *testing.T) {
49+
port := startBannerServer(t, "SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13\r\n")
50+
assert.NilError(t, probeSSHBannerOnLocalPort(t.Context(), port))
51+
}
52+
53+
func TestProbeSSHBannerOnLocalPort_LegacyBanner(t *testing.T) {
54+
// RFC4253 §5.1 reserves "SSH-1.99-" for servers that support both v1 and v2.
55+
port := startBannerServer(t, "SSH-1.99-OpenSSH_old\r\n")
56+
assert.NilError(t, probeSSHBannerOnLocalPort(t.Context(), port))
57+
}
58+
59+
func TestProbeSSHBannerOnLocalPort_AcceptThenClose(t *testing.T) {
60+
// Simulates the race: TCP forwarder accepts but the proxied guest peer is
61+
// not listening, so the host side closes immediately and the client sees EOF.
62+
port := startBannerServer(t, "")
63+
err := probeSSHBannerOnLocalPort(t.Context(), port)
64+
assert.ErrorContains(t, err, "read SSH banner")
65+
}
66+
67+
func TestProbeSSHBannerOnLocalPort_WrongBanner(t *testing.T) {
68+
port := startBannerServer(t, "HTTP/1.1 200 OK\r\n")
69+
err := probeSSHBannerOnLocalPort(t.Context(), port)
70+
assert.ErrorContains(t, err, "unexpected banner")
71+
}
72+
73+
func TestProbeSSHBannerOnLocalPort_NoListener(t *testing.T) {
74+
// Bind a listener to grab a free port, then close it before probing so the
75+
// dial returns ECONNREFUSED (or equivalent on the host platform).
76+
var lc net.ListenConfig
77+
ln, err := lc.Listen(t.Context(), "tcp", "127.0.0.1:0")
78+
assert.NilError(t, err)
79+
_, portStr, err := net.SplitHostPort(ln.Addr().String())
80+
assert.NilError(t, err)
81+
port, err := strconv.Atoi(portStr)
82+
assert.NilError(t, err)
83+
assert.NilError(t, ln.Close())
84+
85+
err = probeSSHBannerOnLocalPort(t.Context(), port)
86+
assert.Assert(t, err != nil)
87+
assert.Assert(t, strings.Contains(err.Error(), "dial 127.0.0.1:"+portStr))
88+
}
89+
90+
func TestProbeSSHBannerOnLocalPort_HungWrite(t *testing.T) {
91+
// Server accepts but neither writes a banner nor closes, so the probe must
92+
// time out on the read deadline rather than hang forever.
93+
var lc net.ListenConfig
94+
ln, err := lc.Listen(t.Context(), "tcp", "127.0.0.1:0")
95+
assert.NilError(t, err)
96+
t.Cleanup(func() { _ = ln.Close() })
97+
accepted := make(chan net.Conn, 1)
98+
go func() {
99+
conn, err := ln.Accept()
100+
if err != nil {
101+
return
102+
}
103+
accepted <- conn
104+
}()
105+
_, portStr, err := net.SplitHostPort(ln.Addr().String())
106+
assert.NilError(t, err)
107+
port, err := strconv.Atoi(portStr)
108+
assert.NilError(t, err)
109+
110+
ctx, cancel := context.WithTimeout(t.Context(), 2*probeSSHBannerTimeout)
111+
defer cancel()
112+
start := time.Now()
113+
err = probeSSHBannerOnLocalPort(ctx, port)
114+
elapsed := time.Since(start)
115+
if conn := <-accepted; conn != nil {
116+
_ = conn.Close()
117+
}
118+
assert.Assert(t, err != nil)
119+
assert.Assert(t, elapsed < 2*probeSSHBannerTimeout, "probe did not honor read deadline: elapsed=%s", elapsed)
120+
}

0 commit comments

Comments
 (0)