Commit 06d90d3

cpudist: Use finish_task_switch kprobe instead of sched_switch tracepoint

The `sched_switch` tracepoint approach requires storing the previous task's tgid in a map and fetching it from there, because the tgid is not available as a tracepoint argument. Placing a kprobe on the `finish_task_switch` function instead allows the previous task's pid and tgid to be fetched cleanly from its task_struct.
1 parent 3c976bb commit 06d90d3
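
For context, here is a minimal BCC sketch of the technique this commit switches to. It is illustrative only, not code from the tool, and the probe name on_switch is made up: a kprobe handler attached to finish_task_switch receives the outgoing task's task_struct as its first argument, so the previous pid/tgid can be read directly without any bookkeeping map.

#!/usr/bin/env python
# Illustrative sketch, not part of this commit: kprobe on
# finish_task_switch(struct task_struct *prev), reading the previous
# task's ids straight from the task_struct instead of a helper map.
from bcc import BPF

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

int on_switch(struct pt_regs *ctx, struct task_struct *prev)
{
    // BCC rewrites these dereferences into bpf_probe_read() calls,
    // so no tgid-tracking map is needed for the previous task.
    bpf_trace_printk("prev pid=%d tgid=%d\\n", prev->pid, prev->tgid);
    return 0;
}
"""

b = BPF(text=bpf_text)
# Note: on some newer kernels the symbol may carry a suffix
# (e.g. finish_task_switch.isra.0).
b.attach_kprobe(event="finish_task_switch", fn_name="on_switch")
b.trace_print()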

2 files changed: +8, -39 lines


man/man8/cpudist.8

Lines changed: 0 additions & 5 deletions
@@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
 for efficiency. Despite this, the overhead of this tool may become significant
 for some workloads: see the OVERHEAD section.
 
-This tool uses the sched:sched_switch kernel tracepoint to determine when a
-task is scheduled and descheduled. If the tracepoint arguments change in the
-future, this tool will have to be updated. Still, it is more reliable than
-using kprobes on the respective kernel functions directly.
-
 Since this uses BPF, only the root user can use this tool.
 .SH REQUIREMENTS
 CONFIG_BPF and bcc.

tools/cpudist.py

Lines changed: 8 additions & 34 deletions
@@ -48,12 +48,9 @@
 countdown = int(args.count)
 debug = 0
 
-tp = Tracepoint.enable_tracepoint("sched", "sched_switch")
-bpf_text = "#include <uapi/linux/ptrace.h>\n"
-bpf_text += "#include <linux/sched.h>\n"
-bpf_text += tp.generate_decl()
-bpf_text += tp.generate_entry_probe()
-bpf_text += tp.generate_struct()
+bpf_text = """#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+"""
 
 if not args.offcpu:
     bpf_text += "#define ONCPU\n"
@@ -66,17 +63,8 @@
 
 
 BPF_HASH(start, u32, u64);
-BPF_HASH(tgid_for_pid, u32, u32);
 STORAGE
 
-static inline u32 get_tgid(u32 pid)
-{
-    u32 *stored_tgid = tgid_for_pid.lookup(&pid);
-    if (stored_tgid != 0)
-        return *stored_tgid;
-    return 0xffffffff;
-}
-
 static inline void store_start(u32 tgid, u32 pid, u64 ts)
 {
     if (FILTER)
@@ -99,32 +87,19 @@
     STORE
 }
 
-int sched_switch(struct pt_regs *ctx)
+int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
 {
     u64 ts = bpf_ktime_get_ns();
     u64 pid_tgid = bpf_get_current_pid_tgid();
     u32 tgid = pid_tgid >> 32, pid = pid_tgid;
-    // Keep a mapping of tgid for pid because when sched_switch hits,
-    // we only have the tgid information for the *current* pid, but not
-    // for the previous one.
-    tgid_for_pid.update(&pid, &tgid);
-
-    u64 *di = __trace_di.lookup(&pid_tgid);
-    if (di == 0)
-        return 0;
-
-    struct sched_switch_trace_entry args = {};
-    bpf_probe_read(&args, sizeof(args), (void *)*di);
 
 #ifdef ONCPU
-    if (args.prev_state == TASK_RUNNING) {
+    if (prev->state == TASK_RUNNING) {
 #else
     if (1) {
 #endif
-        u32 prev_pid = args.prev_pid;
-        u32 prev_tgid = get_tgid(prev_pid);
-        if (prev_tgid == 0xffffffff)
-            goto BAIL;
+        u32 prev_pid = prev->pid;
+        u32 prev_tgid = prev->tgid;
 #ifdef ONCPU
         update_hist(prev_tgid, prev_pid, ts);
 #else
@@ -173,8 +148,7 @@
 print(bpf_text)
 
 b = BPF(text=bpf_text)
-Tracepoint.attach(b)
-b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch")
+b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch")
 
 print("Tracing %s-CPU time... Hit Ctrl-C to end." %
     ("off" if args.offcpu else "on"))
