Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
fb3e57a
feat: implement fail-safe mode UI
adamshiervani Nov 7, 2025
fc7156f
fix: remove unused variable from FailSafeModeOverlay component
adamshiervani Nov 7, 2025
2c512f7
chore: comment out failsafe mode timeout in KvmIdRoute component
adamshiervani Nov 7, 2025
42a0e1f
refactor: update FailSafeModeOverlay to simplify reboot handling and …
adamshiervani Nov 7, 2025
6afb15b
chore: comment out failsafe mode timeout in KvmIdRoute component
adamshiervani Nov 7, 2025
239c2dc
chore: backport supervisor changes
ym Nov 7, 2025
03ab8d8
feat: add failsafe mode to recover from infinite restarts caused by c…
ym Nov 7, 2025
99a6012
feat: add failsafe mode to recover from infinite restarts caused by c…
ym Nov 7, 2025
72f29df
fix: ignore errors when crash log doesn't exist
ym Nov 7, 2025
82ad2a4
feat: enhance FailSafeModeOverlay with tooltip and log download impro…
adamshiervani Nov 7, 2025
5933adb
feat: implement FailSafeModeOverlay component with log download and i…
adamshiervani Nov 7, 2025
502cd4e
fix: handle mDNS initialization error without exiting the application
adamshiervani Nov 7, 2025
6b052e7
Update ui/src/components/FailSafeModeOverlay.tsx
ym Nov 10, 2025
9115362
chore: make downgrade version configurable
ym Nov 10, 2025
426cd6f
fix: update KvmIdRoute to conditionally render WebRTCVideo based on f…
adamshiervani Nov 10, 2025
bbbea61
fix: simplify tooltip text in FailSafeModeOverlay for clarity
adamshiervani Nov 10, 2025
029a78e
fix: update FailsafeModeState to ensure reason is always a string and…
adamshiervani Nov 10, 2025
4def3e6
fix: adjust failsafe mode handling in KvmIdRoute to improve WebRTCVid…
adamshiervani Nov 10, 2025
b70199c
Revert "fix: adjust failsafe mode handling in KvmIdRoute to improve W…
adamshiervani Nov 10, 2025
d8e3140
fix: Proper logic for hiding the WebRTC Video stream
adamshiervani Nov 10, 2025
8406a35
fix: reset failsafe mode on reboot
adamshiervani Nov 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 86 additions & 22 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ import (
)

const (
	// envChildID is the name of the environment variable the supervisor sets
	// on the child process; its value is the built app version.
	envChildID = "JETKVM_CHILD_ID"
	// errorDumpDir is the directory that receives crash dumps.
	errorDumpDir = "/userdata/jetkvm/crashdump"
	// errorDumpLastFile is the name, inside errorDumpDir, of the symlink that
	// points at the most recent crash dump.
	errorDumpLastFile = "last-crash.log"
	// errorDumpTemplate is the fmt template for a dump file name; the single
	// %s verb is filled with a timestamp.
	errorDumpTemplate = "jetkvm-%s.log"
)

func program() {
Expand Down Expand Up @@ -74,7 +74,12 @@ func supervise() error {
// run the child binary
cmd := exec.Command(binPath)

cmd.Env = append(os.Environ(), []string{envChildID + "=" + kvm.GetBuiltAppVersion()}...)
lastFilePath := filepath.Join(errorDumpDir, errorDumpLastFile)

cmd.Env = append(os.Environ(), []string{
fmt.Sprintf("%s=%s", envChildID, kvm.GetBuiltAppVersion()),
fmt.Sprintf("JETKVM_LAST_ERROR_PATH=%s", lastFilePath),
}...)
cmd.Args = os.Args

logFile, err := os.CreateTemp("", "jetkvm-stdout.log")
Expand Down Expand Up @@ -117,49 +122,108 @@ func supervise() error {
return nil
}

func createErrorDump(logFile *os.File) {
logFile.Close()
// isSymlinkTo reports whether newName is a symbolic link whose stored target
// is exactly oldName.
//
// The comparison is textual: the link target is not resolved or cleaned, so a
// relative link to the same file does not match an absolute oldName. The
// target does not need to exist. Any error while inspecting newName (missing
// file, permission problem, ...) yields false.
func isSymlinkTo(oldName, newName string) bool {
	// Lstat, not Stat: Stat follows the link and describes its target, so the
	// ModeSymlink bit would never be set and this check could never succeed.
	info, err := os.Lstat(newName)
	if err != nil {
		return false
	}
	if info.Mode()&os.ModeSymlink == 0 {
		return false
	}
	target, err := os.Readlink(newName)
	if err != nil {
		return false
	}
	return target == oldName
}

// touch the error dump state file
if err := os.WriteFile(filepath.Join(errorDumpDir, errorDumpStateFile), []byte{}, 0644); err != nil {
return
// ensureSymlink makes newName a symbolic link pointing at oldName.
//
// When isSymlinkTo already reports the desired link, nothing is touched.
// Otherwise whatever currently sits at newName is removed and the link is
// created; the error from os.Symlink, if any, is returned.
func ensureSymlink(oldName, newName string) error {
	if !isSymlinkTo(oldName, newName) {
		// Best-effort cleanup: newName may not exist yet, and a removal
		// failure will surface through os.Symlink below.
		_ = os.Remove(newName)
		return os.Symlink(oldName, newName)
	}
	return nil
}

fileName := fmt.Sprintf(errorDumpTemplate, time.Now().Format("20060102150405"))
filePath := filepath.Join(errorDumpDir, fileName)
if err := os.Rename(logFile.Name(), filePath); err == nil {
fmt.Printf("error dump created: %s\n", filePath)
return
// renameFile closes f and moves its contents to newName.
//
// It first tries os.Rename. If that fails for any reason, it falls back to
// copying the file's contents into a freshly created newName. On the fallback
// path the source file is left in place.
//
// It returns nil on success, or an error describing which step of the
// fallback copy failed.
func renameFile(f *os.File, newName string) error {
	// The handle is no longer needed; a close failure is not fatal because
	// the file is addressed by name from here on.
	_ = f.Close()

	// try to rename the file first
	if err := os.Rename(f.Name(), newName); err == nil {
		return nil
	}

	// rename failed: copy the contents instead
	src, err := os.Open(f.Name())
	if err != nil {
		return fmt.Errorf("failed to open file: %w", err)
	}
	defer src.Close()

	dst, err := os.Create(newName)
	if err != nil {
		return fmt.Errorf("failed to create file: %w", err)
	}

	if _, err := io.Copy(dst, src); err != nil {
		_ = dst.Close()
		return fmt.Errorf("failed to copy file: %w", err)
	}

	// Close explicitly rather than via defer so a write error reported at
	// close time is not lost.
	if err := dst.Close(); err != nil {
		return fmt.Errorf("failed to close file: %w", err)
	}

	return nil
}

// ensureErrorDumpDir makes sure errorDumpDir exists as a directory, creating
// it (and any missing parents) with mode 0755 when it does not.
//
// It returns nil if the directory is already there or was created, and a
// wrapped error if creation fails.
func ensureErrorDumpDir() error {
	// TODO: check if the directory is writable
	if info, statErr := os.Stat(errorDumpDir); statErr == nil && info.IsDir() {
		return nil
	}

	mkErr := os.MkdirAll(errorDumpDir, 0755)
	if mkErr == nil {
		return nil
	}
	return fmt.Errorf("failed to create error dump directory: %w", mkErr)
}

// createErrorDump moves the given log file into errorDumpDir under a
// timestamped name built from errorDumpTemplate, then points the
// errorDumpLastFile symlink at it.
//
// Each failure is printed to stdout and aborts the remaining steps; nothing
// is returned to the caller.
func createErrorDump(logFile *os.File) {
	fmt.Println()

	// The timestamp is taken before any filesystem work.
	stamp := time.Now().Format("20060102-150405")
	dumpName := fmt.Sprintf(errorDumpTemplate, stamp)

	// the dump directory has to exist before anything can be moved into it
	if dirErr := ensureErrorDumpDir(); dirErr != nil {
		fmt.Printf("failed to ensure error dump directory: %v\n", dirErr)
		return
	}

	dumpPath := filepath.Join(errorDumpDir, dumpName)
	if moveErr := renameFile(logFile, dumpPath); moveErr != nil {
		fmt.Printf("failed to rename file: %v\n", moveErr)
		return
	}
	fmt.Printf("error dump copied: %s\n", dumpPath)

	// refresh the "latest crash" link so it targets the dump just written
	latestLink := filepath.Join(errorDumpDir, errorDumpLastFile)
	if linkErr := ensureSymlink(dumpPath, latestLink); linkErr != nil {
		fmt.Printf("failed to create symlink: %v\n", linkErr)
	}
}

func doSupervise() {
Expand Down
107 changes: 107 additions & 0 deletions failsafe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package kvm

import (
"fmt"
"os"
"strings"
"sync"
)

const (
	// failsafeDefaultLastCrashPath is the crash log location used when the
	// failsafeLastCrashEnv environment variable is empty.
	failsafeDefaultLastCrashPath = "/userdata/jetkvm/crashdump/last-crash.log"
	// failsafeFile is a marker file: if it exists at startup, failsafe mode is
	// activated and the file is removed.
	failsafeFile = "/userdata/jetkvm/.enablefailsafe"
	// failsafeLastCrashEnv names the environment variable that carries the
	// path of the last crash log.
	failsafeLastCrashEnv = "JETKVM_LAST_ERROR_PATH"
	// failsafeEnv names the environment variable that forces failsafe mode
	// when set to "1".
	failsafeEnv = "JETKVM_FORCE_FAILSAFE"
)

// Failsafe state. All values are filled in by checkFailsafeReason, whose body
// runs at most once under failsafeOnce.
var (
	// failsafeOnce guards the one-time failsafe detection.
	failsafeOnce sync.Once
	// failsafeCrashLog holds the contents of the last crash log, if one was read.
	failsafeCrashLog string
	// failsafeModeActive is true once a failsafe condition has been detected.
	failsafeModeActive bool
	// failsafeModeReason is a short identifier of the detected condition.
	failsafeModeReason string
)

// FailsafeModeNotification is the payload of the "failsafeMode" JSON-RPC
// event sent by notifyFailsafeMode.
type FailsafeModeNotification struct {
	// Active reports whether failsafe mode is on.
	Active bool `json:"active"`
	// Reason is the short identifier stored in failsafeModeReason, such as
	// "failsafe_env_set", "failsafe_file_exists" or "video".
	Reason string `json:"reason"`
}

// this function has side effects and can be only executed once
func checkFailsafeReason() {
failsafeOnce.Do(func() {
// check if the failsafe environment variable is set
if os.Getenv(failsafeEnv) == "1" {
failsafeModeActive = true
failsafeModeReason = "failsafe_env_set"
return
}

// check if the failsafe file exists
if _, err := os.Stat(failsafeFile); err == nil {
failsafeModeActive = true
failsafeModeReason = "failsafe_file_exists"
_ = os.Remove(failsafeFile)
return
}

// get the last crash log path from the environment variable
lastCrashPath := os.Getenv(failsafeLastCrashEnv)
if lastCrashPath == "" {
lastCrashPath = failsafeDefaultLastCrashPath
}

// check if the last crash log file exists
l := failsafeLogger.With().Str("path", lastCrashPath).Logger()
fi, err := os.Lstat(lastCrashPath)
if err != nil {
if !os.IsNotExist(err) {
l.Warn().Err(err).Msg("failed to stat last crash log")
}
return
}

if fi.Mode()&os.ModeSymlink != os.ModeSymlink {
l.Warn().Msg("last crash log is not a symlink, ignoring")
return
}

// open the last crash log file and find if it contains the string "panic"
content, err := os.ReadFile(lastCrashPath)
if err != nil {
l.Warn().Err(err).Msg("failed to read last crash log")
return
}

// unlink the last crash log file
failsafeCrashLog = string(content)
_ = os.Remove(lastCrashPath)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we delay trashing the crashlog until we've exited the failsafe mode through some action?


// TODO: read the goroutine stack trace and check which goroutine is panicking
if strings.Contains(failsafeCrashLog, "runtime.cgocall") {
failsafeModeActive = true
failsafeModeReason = "video"
return
}
})
}

// notifyFailsafeMode sends a "failsafeMode" JSON-RPC event carrying the
// current failsafe reason to session.
//
// It does nothing when session is nil or failsafe mode is not active.
func notifyFailsafeMode(session *Session) {
	if session == nil || !failsafeModeActive {
		return
	}

	reason := failsafeModeReason
	jsonRpcLogger.Info().Str("reason", reason).Msg("sending failsafe mode notification")

	payload := FailsafeModeNotification{
		Active: true,
		Reason: reason,
	}
	writeJSONRPCEvent("failsafeMode", payload, session)
}

// rpcGetFailsafeLogs returns the crash log captured in failsafeCrashLog.
//
// It returns an error, with an empty string, when failsafe mode is not active.
func rpcGetFailsafeLogs() (string, error) {
	if failsafeModeActive {
		return failsafeCrashLog, nil
	}
	return "", fmt.Errorf("failsafe mode is not active")
}
Loading