Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions internal/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ func (rc *ReportingCommand) Run() error {
go func() {
sig := <-sigChannel
slog.Info("received signal", slog.String("signal", sig.String()))
// when perfspect receives ctrl-c while in the shell, the shell makes sure to propagate the
// signal to all our children. But when perfspect is run in the background or disowned and
// then receives SIGINT, e.g., from a script, we need to send the signal to our children
util.SignalChildren(syscall.SIGINT)
}()
// get the data we need to generate reports
orderedTargetScriptOutputs, err := rc.retrieveScriptOutputs(localTempDir)
Expand Down
9 changes: 6 additions & 3 deletions internal/report/table_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2163,10 +2163,11 @@ func powerStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
}

func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
// build fields to match CSV output from hl_smi tool
fields := []Field{}
// parse the CSV output
csvOutput := outputs[script.GaudiStatsScriptName].Stdout
if csvOutput == "" {
return []Field{}
}
r := csv.NewReader(strings.NewReader(csvOutput))
rows, err := r.ReadAll()
if err != nil {
Expand All @@ -2177,6 +2178,8 @@ func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
slog.Error("gaudi stats output is not in expected format")
return []Field{}
}
// build fields to match CSV output from hl_smi tool
fields := []Field{}
// first row is the header, extract field names
for _, fieldName := range rows[0] {
fields = append(fields, Field{Name: strings.TrimSpace(fieldName)})
Expand Down Expand Up @@ -2229,7 +2232,7 @@ func instructionMixTableValues(outputs map[string]script.ScriptOutput) []Field {
var interval int
lines := strings.Split(outputs[script.InstructionMixScriptName].Stdout, "\n")
if len(lines) < 4 {
slog.Error("no data found in instruction mix output")
slog.Warn("no data found in instruction mix output")
return []Field{}
}
// TIME
Expand Down
51 changes: 39 additions & 12 deletions internal/script/script.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func RunScripts(myTarget target.Target, scripts []ScriptDefinition, ignoreScript
if len(parallelScripts) > 0 {
// form one master script that calls all the parallel scripts in the background
masterScriptName := "parallel_master.sh"
masterScript, needsElevatedPrivileges := formMasterScript(myTarget, parallelScripts)
masterScript, needsElevatedPrivileges := formMasterScript(myTarget.GetTempDirectory(), parallelScripts)
// write master script to local file
masterScriptPath := path.Join(localTempDirForTarget, masterScriptName)
err = os.WriteFile(masterScriptPath, []byte(masterScript), 0644)
Expand Down Expand Up @@ -278,11 +278,10 @@ func scriptNameToFilename(name string) string {

// formMasterScript forms a master script that runs all parallel scripts in the background, waits for them to finish, then prints the output of each script.
// Return values are the master script and a boolean indicating whether the master script requires elevated privileges.
func formMasterScript(myTarget target.Target, parallelScripts []ScriptDefinition) (string, bool) {
func formMasterScript(targetTempDirectory string, parallelScripts []ScriptDefinition) (string, bool) {
// we write the stdout and stderr from each command to temporary files and save the PID of each command
// in a variable named after the script
var masterScript strings.Builder
targetTempDirectory := myTarget.GetTempDirectory()

masterScript.WriteString("#!/bin/bash\n")

Expand All @@ -304,11 +303,34 @@ func formMasterScript(myTarget target.Target, parallelScripts []ScriptDefinition
// function to handle SIGINT
masterScript.WriteString("\nhandle_sigint() {\n")
for _, script := range parallelScripts {
masterScript.WriteString(fmt.Sprintf("\tkill -SIGINT $%s_pid\n", sanitizeScriptName(script.Name)))
if script.NeedsKill {
// kill the command started by the script
masterScript.WriteString(fmt.Sprintf("\tkill -SIGKILL $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name)))
masterScript.WriteString(fmt.Sprintf("\t%s_exitcode=137\n", sanitizeScriptName(script.Name))) // 137 is the exit code for SIGKILL
// send SIGINT to the child script, if it is still running
masterScript.WriteString(fmt.Sprintf("\tif ps -p \"$%s_pid\" > /dev/null; then\n", sanitizeScriptName(script.Name)))
masterScript.WriteString(fmt.Sprintf("\t\tkill -SIGINT $%s_pid\n", sanitizeScriptName(script.Name)))
masterScript.WriteString("\tfi\n")
if script.NeedsKill { // this is primarily used for scripts that start commands in the background, some of which (processwatch) doesn't respond to SIGINT as expected
// if the *cmd.pid file exists, check if the process is still running
masterScript.WriteString(fmt.Sprintf("\tif [ -f %s_cmd.pid ]; then\n", sanitizeScriptName(script.Name)))
masterScript.WriteString(fmt.Sprintf("\t\tif ps -p $(cat %s_cmd.pid) > /dev/null; then\n", sanitizeScriptName(script.Name)))
// send SIGINT to the background process first, then SIGKILL if it doesn't respond to SIGINT
masterScript.WriteString(fmt.Sprintf("\t\t\tkill -SIGINT $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name)))
// give the process a chance to respond to SIGINT
masterScript.WriteString("\t\t\tsleep 0.5\n")
// if the background process is still running, send SIGKILL
masterScript.WriteString(fmt.Sprintf("\t\t\tif ps -p $(cat %s_cmd.pid) > /dev/null; then\n", sanitizeScriptName(script.Name)))
masterScript.WriteString(fmt.Sprintf("\t\t\t\tkill -SIGKILL $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name)))
masterScript.WriteString(fmt.Sprintf("\t\t\t\t%s_exitcode=137\n", sanitizeScriptName(script.Name))) // 137 is the exit code for SIGKILL
masterScript.WriteString("\t\t\telse\n")
// if the background process has exited, set the exit code to 0
masterScript.WriteString(fmt.Sprintf("\t\t\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name)))
masterScript.WriteString("\t\t\tfi\n")
masterScript.WriteString("\t\telse\n")
// if the background process is no longer running, set the exit code to 0
masterScript.WriteString(fmt.Sprintf("\t\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name)))
masterScript.WriteString("\t\tfi\n")
masterScript.WriteString("\telse\n")
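// if the *cmd.pid file doesn't exist, set the exit code to 0
// NOTE(review): this comment previously said 1 while the code below sets 0 — confirm which is intended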
masterScript.WriteString(fmt.Sprintf("\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name)))
masterScript.WriteString("\tfi\n")
} else {
masterScript.WriteString(fmt.Sprintf("\twait \"$%s_pid\"\n", sanitizeScriptName(script.Name)))
masterScript.WriteString(fmt.Sprintf("\t%s_exitcode=$?\n", sanitizeScriptName(script.Name)))
Expand Down Expand Up @@ -402,10 +424,15 @@ func parseMasterScriptOutput(masterScriptOutput string) (scriptOutputs []ScriptO
}
stdout = strings.Join(stdoutLines, "\n")
stderr = strings.Join(stderrLines, "\n")
exitCodeInt, err := strconv.Atoi(exitcode)
if err != nil {
slog.Error("error converting exit code to integer, setting to -100", slog.String("exitcode", exitcode), slog.String("error", err.Error()))
exitCodeInt = -100
exitCodeInt := -100
if exitcode == "" {
slog.Warn("exit code for script not set", slog.String("script", scriptName))
} else {
var err error
exitCodeInt, err = strconv.Atoi(exitcode)
if err != nil {
slog.Warn("error converting exit code to integer", slog.String("exitcode", exitcode), slog.String("error", err.Error()), slog.String("script", scriptName))
}
}
scriptOutputs = append(scriptOutputs, ScriptOutput{
ScriptDefinition: ScriptDefinition{Name: scriptName},
Expand Down
74 changes: 51 additions & 23 deletions internal/script/script_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ if [ -d "$cstate_dir" ]; then
done
else
echo "C-state directory not found."
fi`,
fi
`,
},
SpecTurboCoresScriptName: {
Name: SpecTurboCoresScriptName,
Expand Down Expand Up @@ -495,8 +496,7 @@ rdmsr 0x1ad # MSR_TURBO_RATIO_LIMIT: Maximum Ratio Limit of Turbo Mode
},
ElcScriptName: {
Name: ElcScriptName,
ScriptTemplate: `
# Script derived from bhs-power-mode script in Intel PCM repository
ScriptTemplate: `# Script derived from bhs-power-mode script in Intel PCM repository
# Run the pcm-tpmi command to determine I/O and compute dies
output=$(pcm-tpmi 2 0x10 -d -b 26:26)

Expand Down Expand Up @@ -560,7 +560,7 @@ for die in "${!die_types[@]}"; do
fi
done <<< "$output"
done
`,
`,
Architectures: []string{x86_64},
Families: []string{"6"}, // Intel
Models: []string{"173", "175"}, // GNR, SRF
Expand Down Expand Up @@ -654,7 +654,8 @@ echo "" # finish the line
Name: ChaCountScriptName,
ScriptTemplate: `rdmsr 0x396
rdmsr 0x702
rdmsr 0x2FFE`, // uncore client cha count, uncore cha count, uncore cha count spr
rdmsr 0x2FFE
`, // uncore client cha count, uncore cha count, uncore cha count spr
Architectures: []string{x86_64},
Families: []string{"6"}, // Intel
Lkms: []string{"msr"},
Expand Down Expand Up @@ -707,7 +708,7 @@ rdmsr 0x2FFE`, // uncore client cha count, uncore cha count, uncore cha count sp
echo -n "IRQ Balance: "
pgrep irqbalance >/dev/null && echo "Enabled" || echo "Disabled"
done
`,
`,
Depends: []string{"lshw"},
Superuser: true,
},
Expand Down Expand Up @@ -756,13 +757,15 @@ do
fi
fi
echo "$name|$model|$size|$mountpoint|$fstype|$rqsize|$minio|$fw|$addr|$numa|$curlinkspeed|$curlinkwidth|$maxlinkspeed|$maxlinkwidth"
done`,
done
`,
},
HdparmScriptName: {
Name: HdparmScriptName,
ScriptTemplate: `lsblk -d -r -o NAME -e7 -e1 -n | while read -r device ; do
hdparm -i /dev/"$device"
done`,
done
`,
Superuser: true,
},
DfScriptName: {
Expand Down Expand Up @@ -859,7 +862,8 @@ for i in "${pmu_counters[@]}"; do
fi
# print the full list of PMU values
echo "Values: ${pmu_values[$i]}"
done`,
done
`,
Superuser: true,
Architectures: []string{x86_64},
Families: []string{"6"}, // Intel
Expand Down Expand Up @@ -897,7 +901,8 @@ if [ $needed_num_huge_pages -gt $orig_num_huge_pages ]; then
echo $needed_num_huge_pages > /proc/sys/vm/nr_hugepages
fi
mlc --loaded_latency
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`,
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages
`,
Architectures: []string{x86_64},
Superuser: true,
Lkms: []string{"msr"},
Expand All @@ -917,7 +922,8 @@ if [ $needed_num_huge_pages -gt $orig_num_huge_pages ]; then
echo $needed_num_huge_pages > /proc/sys/vm/nr_hugepages
fi
mlc --bandwidth_matrix
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`,
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages
`,
Architectures: []string{x86_64},
Superuser: true,
Lkms: []string{"msr"},
Expand All @@ -930,7 +936,8 @@ echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`,
for method in $methods; do
printf "%s " "$method"
stress-ng --cpu 0 -t 1 --cpu-method "$method" --metrics-brief 2>&1 | tail -1 | awk '{print $9}'
done`,
done
`,
Superuser: false,
Depends: []string{"stress-ng"},
Sequential: true,
Expand Down Expand Up @@ -1005,7 +1012,8 @@ interleaved_core_list=$(IFS=,; echo "${interleaved_cores[*]}")
num_cores_per_socket=$( lscpu | grep 'Core(s) per socket:' | head -1 | awk '{print $4}' )

# Run the avx-turbo benchmark
avx-turbo --min-threads=1 --max-threads=$num_cores_per_socket --test scalar_iadd,avx128_fma,avx256_fma,avx512_fma --iters=100000 --cpuids=$interleaved_core_list`,
avx-turbo --min-threads=1 --max-threads=$num_cores_per_socket --test scalar_iadd,avx128_fma,avx256_fma,avx512_fma --iters=100000 --cpuids=$interleaved_core_list
`,
Superuser: true,
Lkms: []string{"msr"},
Depends: []string{"avx-turbo"},
Expand Down Expand Up @@ -1079,7 +1087,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
fi
mpstat -u -T -I SCPU -P ALL $interval $count &
echo $! > {{.ScriptName}}_cmd.pid
wait`,
wait
`,
Superuser: true,
Lkms: []string{},
Depends: []string{"mpstat"},
Expand All @@ -1094,7 +1103,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
fi
S_TIME_FORMAT=ISO iostat -d -t $interval $count | sed '/^loop/d' &
echo $! > {{.ScriptName}}_cmd.pid
wait`,
wait
`,
Superuser: true,
Lkms: []string{},
Depends: []string{"iostat"},
Expand All @@ -1109,7 +1119,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
fi
sar -r $interval $count &
echo $! > {{.ScriptName}}_cmd.pid
wait`,
wait
`,
Superuser: true,
Lkms: []string{},
Depends: []string{"sar", "sadc"},
Expand All @@ -1124,7 +1135,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
fi
sar -n DEV $interval $count &
echo $! > {{.ScriptName}}_cmd.pid
wait`,
wait
`,
Superuser: true,
Lkms: []string{},
Depends: []string{"sar", "sadc"},
Expand All @@ -1140,7 +1152,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
fi
turbostat -S -s PkgWatt,RAMWatt -q -i $interval $count | awk '{ print strftime("%H:%M:%S"), $0 }' &
echo $! > {{.ScriptName}}_cmd.pid
wait`,
wait
`,
Superuser: true,
Lkms: []string{"msr"},
Depends: []string{"turbostat"},
Expand Down Expand Up @@ -1173,21 +1186,35 @@ done

processwatch -c $arg_sampling_rate $arg_pid $arg_interval $arg_count $arg_filter &
echo $! > {{.ScriptName}}_cmd.pid
wait`,
wait
`,
Superuser: true,
Lkms: []string{"msr"},
Depends: []string{"processwatch"},
NeedsKill: true,
},
GaudiStatsScriptName: {
Name: GaudiStatsScriptName,
ScriptTemplate: `hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} &
echo $! > {{.ScriptName}}_cmd.pid
sleep {{.Duration}}
kill -SIGINT $(cat {{.ScriptName}}_cmd.pid)`,
ScriptTemplate: `
# if the hl-smi program is in the path
if command -v hl-smi &> /dev/null; then
hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} &
echo $! > {{.ScriptName}}_cmd.pid
# if duration is set, sleep for the duration then kill the process
if [ {{.Duration}} -ne 0 ]; then
sleep {{.Duration}}
kill -SIGINT $(cat {{.ScriptName}}_cmd.pid)
fi
wait
else
echo "hl-smi not found in the path" >&2
exit 1
fi
`,
Superuser: true,
NeedsKill: true,
},
// profile (flamegraph) scripts
ProfileJavaScriptName: {
Name: ProfileJavaScriptName,
ScriptTemplate: `interval={{.Interval}}
Expand Down Expand Up @@ -1287,6 +1314,7 @@ rm -f "$perf_fp_data" "$perf_dwarf_data" "$perf_dwarf_folded" "$perf_fp_folded"
Superuser: true,
Depends: []string{"perf", "stackcollapse-perf.pl"},
},
// lock analysis scripts
ProfileKernelLockScriptName: {
Name: ProfileKernelLockScriptName,
ScriptTemplate: `frequency={{.Frequency}}
Expand Down