Permalink
Browse files

cpudist: Use `finish_task_switch` kprobe instead of `sched_switch` tracepoint

The `sched_switch` tracepoint approach requires storing the previous
task's tgid in a map and fetching it from there, because it is not
available as a tracepoint argument. Instead, placing a kprobe on the
`finish_task_switch` function allows cleanly fetching the previous
task's pid and tgid from the task_struct.
  • Loading branch information...
goldshtn committed Jun 30, 2016
1 parent 3c976bb commit 06d90d3d4b35815027b7b7a7fc48167d497d2de3
Showing with 8 additions and 39 deletions.
  1. +0 −5 man/man8/cpudist.8
  2. +8 −34 tools/cpudist.py
View
@@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
for efficiency. Despite this, the overhead of this tool may become significant
for some workloads: see the OVERHEAD section.
This tool uses the sched:sched_switch kernel tracepoint to determine when a
task is scheduled and descheduled. If the tracepoint arguments change in the
future, this tool will have to be updated. Still, it is more reliable than
using kprobes on the respective kernel functions directly.
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
View
@@ -48,12 +48,9 @@
countdown = int(args.count)
debug = 0
tp = Tracepoint.enable_tracepoint("sched", "sched_switch")
bpf_text = "#include <uapi/linux/ptrace.h>\n"
bpf_text += "#include <linux/sched.h>\n"
bpf_text += tp.generate_decl()
bpf_text += tp.generate_entry_probe()
bpf_text += tp.generate_struct()
bpf_text = """#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
"""
if not args.offcpu:
bpf_text += "#define ONCPU\n"
@@ -66,17 +63,8 @@
BPF_HASH(start, u32, u64);
BPF_HASH(tgid_for_pid, u32, u32);
STORAGE
// Return the tgid stored for `pid` in the tgid_for_pid map, or the
// sentinel 0xffffffff when the map holds no entry for that pid.
static inline u32 get_tgid(u32 pid)
{
    u32 *entry = tgid_for_pid.lookup(&pid);

    // lookup() yields a null pointer on a miss; fall back to the sentinel.
    return entry ? *entry : 0xffffffff;
}
static inline void store_start(u32 tgid, u32 pid, u64 ts)
{
if (FILTER)
@@ -99,32 +87,19 @@
STORE
}
int sched_switch(struct pt_regs *ctx)
int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
{
u64 ts = bpf_ktime_get_ns();
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 tgid = pid_tgid >> 32, pid = pid_tgid;
// Keep a mapping of tgid for pid because when sched_switch hits,
// we only have the tgid information for the *current* pid, but not
// for the previous one.
tgid_for_pid.update(&pid, &tgid);
u64 *di = __trace_di.lookup(&pid_tgid);
if (di == 0)
return 0;
struct sched_switch_trace_entry args = {};
bpf_probe_read(&args, sizeof(args), (void *)*di);
#ifdef ONCPU
if (args.prev_state == TASK_RUNNING) {
if (prev->state == TASK_RUNNING) {
#else
if (1) {
#endif
u32 prev_pid = args.prev_pid;
u32 prev_tgid = get_tgid(prev_pid);
if (prev_tgid == 0xffffffff)
goto BAIL;
u32 prev_pid = prev->pid;
u32 prev_tgid = prev->tgid;
#ifdef ONCPU
update_hist(prev_tgid, prev_pid, ts);
#else
@@ -173,8 +148,7 @@
print(bpf_text)
b = BPF(text=bpf_text)
Tracepoint.attach(b)
b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch")
b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch")
print("Tracing %s-CPU time... Hit Ctrl-C to end." %
("off" if args.offcpu else "on"))

0 comments on commit 06d90d3

Please sign in to comment.