Skip to content
Browse files
cpudist: Use finish_task_switch kprobe instead of sched_switch tracepoint

The `sched_switch` tracepoint approach requires storing the previous
task's tgid in a map and fetching it from there, because it is not
available as a tracepoint argument. Instead, placing a kprobe on the
`finish_task_switch` function allows cleanly fetching the previous
task's pid and tgid from the task_struct.
  • Loading branch information
goldshtn committed Jun 30, 2016
1 parent 3c976bb commit 06d90d3
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 39 deletions.
@@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
for efficiency. Despite this, the overhead of this tool may become significant
for some workloads: see the OVERHEAD section.

This tool uses the sched:sched_switch kernel tracepoint to determine when a
task is scheduled and descheduled. If the tracepoint arguments change in the
future, this tool will have to be updated. Still, it is more reliable than
using kprobes on the respective kernel functions directly.

Since this uses BPF, only the root user can use this tool.
CONFIG_BPF and bcc.
@@ -48,12 +48,9 @@
countdown = int(args.count)
debug = 0

tp = Tracepoint.enable_tracepoint("sched", "sched_switch")
bpf_text = "#include <uapi/linux/ptrace.h>\n"
bpf_text += "#include <linux/sched.h>\n"
bpf_text += tp.generate_decl()
bpf_text += tp.generate_entry_probe()
bpf_text += tp.generate_struct()
bpf_text = """#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

if not args.offcpu:
bpf_text += "#define ONCPU\n"
@@ -66,17 +63,8 @@
BPF_HASH(start, u32, u64);
BPF_HASH(tgid_for_pid, u32, u32);
static inline u32 get_tgid(u32 pid)
u32 *stored_tgid = tgid_for_pid.lookup(&pid);
if (stored_tgid != 0)
return *stored_tgid;
return 0xffffffff;
static inline void store_start(u32 tgid, u32 pid, u64 ts)
@@ -99,32 +87,19 @@
int sched_switch(struct pt_regs *ctx)
int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
u64 ts = bpf_ktime_get_ns();
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 tgid = pid_tgid >> 32, pid = pid_tgid;
// Keep a mapping of tgid for pid because when sched_switch hits,
// we only have the tgid information for the *current* pid, but not
// for the previous one.
tgid_for_pid.update(&pid, &tgid);
u64 *di = __trace_di.lookup(&pid_tgid);
if (di == 0)
return 0;
struct sched_switch_trace_entry args = {};
bpf_probe_read(&args, sizeof(args), (void *)*di);
#ifdef ONCPU
if (args.prev_state == TASK_RUNNING) {
if (prev->state == TASK_RUNNING) {
if (1) {
u32 prev_pid = args.prev_pid;
u32 prev_tgid = get_tgid(prev_pid);
if (prev_tgid == 0xffffffff)
goto BAIL;
u32 prev_pid = prev->pid;
u32 prev_tgid = prev->tgid;
#ifdef ONCPU
update_hist(prev_tgid, prev_pid, ts);
@@ -173,8 +148,7 @@

b = BPF(text=bpf_text)
b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch")
b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch")

print("Tracing %s-CPU time... Hit Ctrl-C to end." %
("off" if args.offcpu else "on"))

0 comments on commit 06d90d3

Please sign in to comment.