diff --git a/internal/common/common.go b/internal/common/common.go index 69db78fe..cddc28b3 100644 --- a/internal/common/common.go +++ b/internal/common/common.go @@ -96,6 +96,10 @@ func (rc *ReportingCommand) Run() error { go func() { sig := <-sigChannel slog.Info("received signal", slog.String("signal", sig.String())) + // when perfspect receives ctrl-c while in the shell, the shell makes sure to propagate the + // signal to all our children. But when perfspect is run in the background or disowned and + // then receives SIGINT, e.g., from a script, we need to send the signal to our children + util.SignalChildren(syscall.SIGINT) }() // get the data we need to generate reports orderedTargetScriptOutputs, err := rc.retrieveScriptOutputs(localTempDir) diff --git a/internal/report/table_defs.go b/internal/report/table_defs.go index 071cd536..f1aa6c7f 100644 --- a/internal/report/table_defs.go +++ b/internal/report/table_defs.go @@ -2163,10 +2163,11 @@ func powerStatsTableValues(outputs map[string]script.ScriptOutput) []Field { } func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field { - // build fields to match CSV output from hl_smi tool - fields := []Field{} // parse the CSV output csvOutput := outputs[script.GaudiStatsScriptName].Stdout + if csvOutput == "" { + return []Field{} + } r := csv.NewReader(strings.NewReader(csvOutput)) rows, err := r.ReadAll() if err != nil { @@ -2177,6 +2178,8 @@ func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field { slog.Error("gaudi stats output is not in expected format") return []Field{} } + // build fields to match CSV output from hl-smi tool + fields := []Field{} // first row is the header, extract field names for _, fieldName := range rows[0] { fields = append(fields, Field{Name: strings.TrimSpace(fieldName)}) @@ -2229,7 +2232,7 @@ func instructionMixTableValues(outputs map[string]script.ScriptOutput) []Field { var interval int lines :=
strings.Split(outputs[script.InstructionMixScriptName].Stdout, "\n") if len(lines) < 4 { - slog.Error("no data found in instruction mix output") + slog.Warn("no data found in instruction mix output") return []Field{} } // TIME diff --git a/internal/script/script.go b/internal/script/script.go index 232254c5..816a03e8 100644 --- a/internal/script/script.go +++ b/internal/script/script.go @@ -130,7 +130,7 @@ func RunScripts(myTarget target.Target, scripts []ScriptDefinition, ignoreScript if len(parallelScripts) > 0 { // form one master script that calls all the parallel scripts in the background masterScriptName := "parallel_master.sh" - masterScript, needsElevatedPrivileges := formMasterScript(myTarget, parallelScripts) + masterScript, needsElevatedPrivileges := formMasterScript(myTarget.GetTempDirectory(), parallelScripts) // write master script to local file masterScriptPath := path.Join(localTempDirForTarget, masterScriptName) err = os.WriteFile(masterScriptPath, []byte(masterScript), 0644) @@ -278,11 +278,10 @@ func scriptNameToFilename(name string) string { // formMasterScript forms a master script that runs all parallel scripts in the background, waits for them to finish, then prints the output of each script. // Return values are the master script and a boolean indicating whether the master script requires elevated privileges. 
-func formMasterScript(myTarget target.Target, parallelScripts []ScriptDefinition) (string, bool) { +func formMasterScript(targetTempDirectory string, parallelScripts []ScriptDefinition) (string, bool) { // we write the stdout and stderr from each command to temporary files and save the PID of each command // in a variable named after the script var masterScript strings.Builder - targetTempDirectory := myTarget.GetTempDirectory() masterScript.WriteString("#!/bin/bash\n") @@ -304,11 +303,34 @@ func formMasterScript(myTarget target.Target, parallelScripts []ScriptDefinition // function to handle SIGINT masterScript.WriteString("\nhandle_sigint() {\n") for _, script := range parallelScripts { - masterScript.WriteString(fmt.Sprintf("\tkill -SIGINT $%s_pid\n", sanitizeScriptName(script.Name))) - if script.NeedsKill { - // kill the command started by the script - masterScript.WriteString(fmt.Sprintf("\tkill -SIGKILL $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name))) - masterScript.WriteString(fmt.Sprintf("\t%s_exitcode=137\n", sanitizeScriptName(script.Name))) // 137 is the exit code for SIGKILL + // send SIGINT to the child script, if it is still running + masterScript.WriteString(fmt.Sprintf("\tif ps -p \"$%s_pid\" > /dev/null; then\n", sanitizeScriptName(script.Name))) + masterScript.WriteString(fmt.Sprintf("\t\tkill -SIGINT $%s_pid\n", sanitizeScriptName(script.Name))) + masterScript.WriteString("\tfi\n") + if script.NeedsKill { // this is primarily used for scripts that start commands in the background, some of which (processwatch) doesn't respond to SIGINT as expected + // if the *cmd.pid file exists, check if the process is still running + masterScript.WriteString(fmt.Sprintf("\tif [ -f %s_cmd.pid ]; then\n", sanitizeScriptName(script.Name))) + masterScript.WriteString(fmt.Sprintf("\t\tif ps -p $(cat %s_cmd.pid) > /dev/null; then\n", sanitizeScriptName(script.Name))) + // send SIGINT to the background process first, then SIGKILL if it doesn't respond to 
SIGINT + masterScript.WriteString(fmt.Sprintf("\t\t\tkill -SIGINT $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name))) + // give the process a chance to respond to SIGINT + masterScript.WriteString("\t\t\tsleep 0.5\n") + // if the background process is still running, send SIGKILL + masterScript.WriteString(fmt.Sprintf("\t\t\tif ps -p $(cat %s_cmd.pid) > /dev/null; then\n", sanitizeScriptName(script.Name))) + masterScript.WriteString(fmt.Sprintf("\t\t\t\tkill -SIGKILL $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name))) + masterScript.WriteString(fmt.Sprintf("\t\t\t\t%s_exitcode=137\n", sanitizeScriptName(script.Name))) // 137 is the exit code for SIGKILL + masterScript.WriteString("\t\t\telse\n") + // if the background process has exited, set the exit code to 0 + masterScript.WriteString(fmt.Sprintf("\t\t\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name))) + masterScript.WriteString("\t\t\tfi\n") + masterScript.WriteString("\t\telse\n") + // if the background process recorded in the *cmd.pid file is no longer running, set the exit code to 0 + masterScript.WriteString(fmt.Sprintf("\t\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name))) + masterScript.WriteString("\t\tfi\n") + masterScript.WriteString("\telse\n") + // if the *cmd.pid file doesn't exist, set the exit code to 0 (NOTE(review): this comment originally said 1; confirm the intended value) + masterScript.WriteString(fmt.Sprintf("\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name))) + masterScript.WriteString("\tfi\n") } else { masterScript.WriteString(fmt.Sprintf("\twait \"$%s_pid\"\n", sanitizeScriptName(script.Name))) masterScript.WriteString(fmt.Sprintf("\t%s_exitcode=$?\n", sanitizeScriptName(script.Name))) @@ -402,10 +424,15 @@ func parseMasterScriptOutput(masterScriptOutput string) (scriptOutputs []ScriptO } stdout = strings.Join(stdoutLines, "\n") stderr = strings.Join(stderrLines, "\n") - exitCodeInt, err := strconv.Atoi(exitcode) - if err != nil { - slog.Error("error converting exit code to integer, setting to -100", slog.String("exitcode", exitcode), slog.String("error", err.Error())) - exitCodeInt =
-100 + exitCodeInt := -100 + if exitcode == "" { + slog.Warn("exit code for script not set", slog.String("script", scriptName)) + } else { + var err error + exitCodeInt, err = strconv.Atoi(exitcode) + if err != nil { + slog.Warn("error converting exit code to integer", slog.String("exitcode", exitcode), slog.String("error", err.Error()), slog.String("script", scriptName)) + } } scriptOutputs = append(scriptOutputs, ScriptOutput{ ScriptDefinition: ScriptDefinition{Name: scriptName}, diff --git a/internal/script/script_defs.go b/internal/script/script_defs.go index 2df0be99..61cfe1eb 100644 --- a/internal/script/script_defs.go +++ b/internal/script/script_defs.go @@ -302,7 +302,8 @@ if [ -d "$cstate_dir" ]; then done else echo "C-state directory not found." -fi`, +fi +`, }, SpecTurboCoresScriptName: { Name: SpecTurboCoresScriptName, @@ -495,8 +496,7 @@ rdmsr 0x1ad # MSR_TURBO_RATIO_LIMIT: Maximum Ratio Limit of Turbo Mode }, ElcScriptName: { Name: ElcScriptName, - ScriptTemplate: ` -# Script derived from bhs-power-mode script in Intel PCM repository + ScriptTemplate: `# Script derived from bhs-power-mode script in Intel PCM repository # Run the pcm-tpmi command to determine I/O and compute dies output=$(pcm-tpmi 2 0x10 -d -b 26:26) @@ -560,7 +560,7 @@ for die in "${!die_types[@]}"; do fi done <<< "$output" done - `, +`, Architectures: []string{x86_64}, Families: []string{"6"}, // Intel Models: []string{"173", "175"}, // GNR, SRF @@ -654,7 +654,8 @@ echo "" # finish the line Name: ChaCountScriptName, ScriptTemplate: `rdmsr 0x396 rdmsr 0x702 -rdmsr 0x2FFE`, // uncore client cha count, uncore cha count, uncore cha count spr +rdmsr 0x2FFE +`, // uncore client cha count, uncore cha count, uncore cha count spr Architectures: []string{x86_64}, Families: []string{"6"}, // Intel Lkms: []string{"msr"}, @@ -707,7 +708,7 @@ rdmsr 0x2FFE`, // uncore client cha count, uncore cha count, uncore cha count sp echo -n "IRQ Balance: " pgrep irqbalance >/dev/null && echo "Enabled" || 
echo "Disabled" done - `, +`, Depends: []string{"lshw"}, Superuser: true, }, @@ -756,13 +757,15 @@ do fi fi echo "$name|$model|$size|$mountpoint|$fstype|$rqsize|$minio|$fw|$addr|$numa|$curlinkspeed|$curlinkwidth|$maxlinkspeed|$maxlinkwidth" -done`, +done +`, }, HdparmScriptName: { Name: HdparmScriptName, ScriptTemplate: `lsblk -d -r -o NAME -e7 -e1 -n | while read -r device ; do hdparm -i /dev/"$device" -done`, +done +`, Superuser: true, }, DfScriptName: { @@ -859,7 +862,8 @@ for i in "${pmu_counters[@]}"; do fi # print the full list of PMU values echo "Values: ${pmu_values[$i]}" -done`, +done +`, Superuser: true, Architectures: []string{x86_64}, Families: []string{"6"}, // Intel @@ -897,7 +901,8 @@ if [ $needed_num_huge_pages -gt $orig_num_huge_pages ]; then echo $needed_num_huge_pages > /proc/sys/vm/nr_hugepages fi mlc --loaded_latency -echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`, +echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages +`, Architectures: []string{x86_64}, Superuser: true, Lkms: []string{"msr"}, @@ -917,7 +922,8 @@ if [ $needed_num_huge_pages -gt $orig_num_huge_pages ]; then echo $needed_num_huge_pages > /proc/sys/vm/nr_hugepages fi mlc --bandwidth_matrix -echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`, +echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages +`, Architectures: []string{x86_64}, Superuser: true, Lkms: []string{"msr"}, @@ -930,7 +936,8 @@ echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`, for method in $methods; do printf "%s " "$method" stress-ng --cpu 0 -t 1 --cpu-method "$method" --metrics-brief 2>&1 | tail -1 | awk '{print $9}' -done`, +done +`, Superuser: false, Depends: []string{"stress-ng"}, Sequential: true, @@ -1005,7 +1012,8 @@ interleaved_core_list=$(IFS=,; echo "${interleaved_cores[*]}") num_cores_per_socket=$( lscpu | grep 'Core(s) per socket:' | head -1 | awk '{print $4}' ) # Run the avx-turbo benchmark -avx-turbo --min-threads=1 --max-threads=$num_cores_per_socket --test 
scalar_iadd,avx128_fma,avx256_fma,avx512_fma --iters=100000 --cpuids=$interleaved_core_list`, +avx-turbo --min-threads=1 --max-threads=$num_cores_per_socket --test scalar_iadd,avx128_fma,avx256_fma,avx512_fma --iters=100000 --cpuids=$interleaved_core_list +`, Superuser: true, Lkms: []string{"msr"}, Depends: []string{"avx-turbo"}, @@ -1079,7 +1087,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then fi mpstat -u -T -I SCPU -P ALL $interval $count & echo $! > {{.ScriptName}}_cmd.pid -wait`, +wait +`, Superuser: true, Lkms: []string{}, Depends: []string{"mpstat"}, @@ -1094,7 +1103,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then fi S_TIME_FORMAT=ISO iostat -d -t $interval $count | sed '/^loop/d' & echo $! > {{.ScriptName}}_cmd.pid -wait`, +wait +`, Superuser: true, Lkms: []string{}, Depends: []string{"iostat"}, @@ -1109,7 +1119,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then fi sar -r $interval $count & echo $! > {{.ScriptName}}_cmd.pid -wait`, +wait +`, Superuser: true, Lkms: []string{}, Depends: []string{"sar", "sadc"}, @@ -1124,7 +1135,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then fi sar -n DEV $interval $count & echo $! > {{.ScriptName}}_cmd.pid -wait`, +wait +`, Superuser: true, Lkms: []string{}, Depends: []string{"sar", "sadc"}, @@ -1140,7 +1152,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then fi turbostat -S -s PkgWatt,RAMWatt -q -i $interval $count | awk '{ print strftime("%H:%M:%S"), $0 }' & echo $! > {{.ScriptName}}_cmd.pid -wait`, +wait +`, Superuser: true, Lkms: []string{"msr"}, Depends: []string{"turbostat"}, @@ -1173,7 +1186,8 @@ done processwatch -c $arg_sampling_rate $arg_pid $arg_interval $arg_count $arg_filter & echo $! 
> {{.ScriptName}}_cmd.pid -wait`, +wait +`, Superuser: true, Lkms: []string{"msr"}, Depends: []string{"processwatch"}, @@ -1181,13 +1195,26 @@ wait`, }, GaudiStatsScriptName: { Name: GaudiStatsScriptName, - ScriptTemplate: `hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} & -echo $! > {{.ScriptName}}_cmd.pid -sleep {{.Duration}} -kill -SIGINT $(cat {{.ScriptName}}_cmd.pid)`, + ScriptTemplate: ` +# if the hl-smi program is in the path +if command -v hl-smi &> /dev/null; then + hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} & + echo $! > {{.ScriptName}}_cmd.pid + # if duration is set, sleep for the duration then kill the process + if [ {{.Duration}} -ne 0 ]; then + sleep {{.Duration}} + kill -SIGINT $(cat {{.ScriptName}}_cmd.pid) + fi + wait +else + echo "hl-smi not found in the path" >&2 + exit 1 +fi +`, Superuser: true, NeedsKill: true, }, + // profile (flamegraph) scripts ProfileJavaScriptName: { Name: ProfileJavaScriptName, ScriptTemplate: `interval={{.Interval}} @@ -1287,6 +1314,7 @@ rm -f "$perf_fp_data" "$perf_dwarf_data" "$perf_dwarf_folded" "$perf_fp_folded" Superuser: true, Depends: []string{"perf", "stackcollapse-perf.pl"}, }, + // lock analysis scripts ProfileKernelLockScriptName: { Name: ProfileKernelLockScriptName, ScriptTemplate: `frequency={{.Frequency}}