
Commit

improvements
Signed-off-by: Francisco Javier Honduvilla Coto <javierhonduco@gmail.com>
javierhonduco committed Jan 25, 2023
1 parent 38abbc5 commit dae09ee
Showing 7 changed files with 58,701 additions and 102 deletions.
2 changes: 1 addition & 1 deletion bpf/.clang-format
@@ -2,4 +2,4 @@
BasedOnStyle: LLVM
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
ColumnLimit: 120
ColumnLimit: 160
105 changes: 51 additions & 54 deletions bpf/cpu/cpu.bpf.c
@@ -12,10 +12,6 @@
#include "../common.h"
#include "hash.h"

//#include <uapi/linux/bpf.h>
enum {
BPF_F_NO_PREALLOC = (1U << 0),
};
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
@@ -26,12 +22,10 @@ enum {
// Number of frames to walk per tail call iteration.
#define MAX_STACK_DEPTH_PER_PROGRAM 15
// Number of BPF tail calls that will be attempted.
//
// invariant: `MAX_TAIL_CALLS * MAX_STACK_DEPTH_PER_PROGRAM` >=
// `MAX_STACK_DEPTH`
#define MAX_TAIL_CALLS 10
// Maximum number of frames.
#define MAX_STACK_DEPTH 127
_Static_assert(MAX_TAIL_CALLS * MAX_STACK_DEPTH_PER_PROGRAM >= MAX_STACK_DEPTH, "Not enough iterations to traverse the whole stack");
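// With the values above, 10 tail calls * 15 frames per call = 150 >= 127 frames,
// so this assertion holds.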
// Number of unique stacks.
#define MAX_STACK_TRACES_ENTRIES 1024
// Number of items in the stack counts aggregation map.
@@ -40,12 +34,16 @@ enum {
#define MAX_PROCESSES 1500
// Binary search iterations for dwarf based stack walking.
// 2^19 can bisect ~524_288 entries.
//
// invariant: `2^MAX_BINARY_SEARCH_DEPTH >= MAX_UNWIND_TABLE_SIZE`
#define MAX_BINARY_SEARCH_DEPTH 19
// Size of the unwind table.
// 250k * sizeof(stack_unwind_row_t) = 2MB
#define MAX_UNWIND_TABLE_SIZE 250 * 1000
_Static_assert(1 << MAX_BINARY_SEARCH_DEPTH >= MAX_UNWIND_TABLE_SIZE, "Unwind table too small");
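// Sanity check on the constants above: 2^19 = 524,288 >= 250,000, so the assert
// holds; the 2MB figure implies sizeof(stack_unwind_row_t) == 8 bytes.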


// Useful to isolate stack unwinding issues.
#define DISABLE_BPF_HELPER_FP_UNWINDER 1

// Unwind tables that can't fit in the remaining space of the current shard
// are broken up into chunks of up to `MAX_UNWIND_TABLE_SIZE` entries.
#define MAX_UNWIND_TABLE_CHUNKS 30
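// Taken together, the constants above allow 30 chunks of up to 250,000 rows each,
// i.e. up to 7,500,000 unwind table rows across all chunks.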
@@ -87,30 +85,28 @@ const volatile struct config_t config = {};

/*============================== MACROS =====================================*/

#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \
struct { \
__uint(type, _type); \
__uint(max_entries, _max_entries); \
__type(key, _key_type); \
__type(value, _value_type); \
#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \
struct { \
__uint(type, _type); \
__uint(max_entries, _max_entries); \
__type(key, _key_type); \
__type(value, _value_type); \
} _name SEC(".maps");

// Stack traces are slightly different in that the value is one big array
// holding the stack addresses.
typedef __u64 stack_trace_type[MAX_STACK_DEPTH];
#define BPF_STACK_TRACE(_name, _max_entries) \
BPF_MAP(_name, BPF_MAP_TYPE_STACK_TRACE, u32, stack_trace_type, _max_entries);

#define BPF_HASH(_name, _key_type, _value_type, _max_entries) \
BPF_MAP(_name, BPF_MAP_TYPE_HASH, _key_type, _value_type, _max_entries);

#define DEFINE_COUNTER(__func__name) \
static void BUMP_##__func__name() { \
u32 *c = bpf_map_lookup_elem(&percpu_stats, &__func__name); \
if (c != NULL) { \
*c += 1; \
} \
#define BPF_STACK_TRACE(_name, _max_entries) BPF_MAP(_name, BPF_MAP_TYPE_STACK_TRACE, u32, stack_trace_type, _max_entries);

#define BPF_HASH(_name, _key_type, _value_type, _max_entries) BPF_MAP(_name, BPF_MAP_TYPE_HASH, _key_type, _value_type, _max_entries);

#define DEFINE_COUNTER(__func__name) \
static void BUMP_##__func__name() { \
u32 *c = bpf_map_lookup_elem(&percpu_stats, &__func__name); \
if (c != NULL) { \
*c += 1; \
} \
}
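// Illustration only (hypothetical names, not part of this change): the macros
// above are meant to be used roughly like
//
//   BPF_HASH(my_map, u32, u64, 1024);  // declares a BPF_MAP_TYPE_HASH named my_map
//   DEFINE_COUNTER(MY_EVENT);          // defines BUMP_MY_EVENT(), which bumps the
//                                      // MY_EVENT slot in the percpu_stats map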

/*============================= INTERNAL STRUCTS ============================*/
@@ -294,8 +290,8 @@ static void unwind_print_stats() {
return;
}

u32 *jit_errors = bpf_map_lookup_elem(&percpu_stats, &UNWIND_JIT_ERRORS);
if (jit_errors == NULL) {
u32 *unknown_jit = bpf_map_lookup_elem(&percpu_stats, &UNWIND_JIT_ERRORS);
if (unknown_jit == NULL) {
return;
}

@@ -305,7 +301,7 @@
bpf_printk("truncated=%lu", *truncated_counter);
bpf_printk("catchall=%lu", *catchall_count);
bpf_printk("never=%lu", *never);
bpf_printk("jit_failure=%lu", *jit_errors);
bpf_printk("unknown_jit=%lu", *unknown_jit);

bpf_printk("total_counter=%lu", *total_counter);
bpf_printk("(not_covered=%lu)", *not_covered_count);
@@ -329,10 +325,12 @@ static __always_inline void *bpf_map_lookup_or_try_init(void *map, const void *k
if (val)
return val;

err = bpf_map_update_elem(map, key, init, BPF_NOEXIST);
err = bpf_map_update_elem(map, key, init, BPF_ANY); // ANY?
// 17 == EEXIST
if (err && err != -17)
if (err != 0) {
bpf_printk("[error] bpf_map_lookup_or_try_init with ret: %d", err);
return 0;
}

return bpf_map_lookup_elem(map, key);
}
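// Sketch of the intended call pattern (hypothetical caller, not part of this
// change): initialize the entry on first use, then mutate the returned value
// in place.
//
//   u64 zero = 0;
//   u64 *count = bpf_map_lookup_or_try_init(&some_map, &key, &zero);
//   if (count != NULL) {
//     __sync_fetch_and_add(count, 1);
//   }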
@@ -415,8 +413,7 @@ enum find_unwind_table_return {
// Finds the shard information for a given pid and program counter. Optionally,
// an offset can be passed that will be filled in with the mapping's load
// address.
static __always_inline enum find_unwind_table_return find_unwind_table(shard_info_t **shard_info, pid_t pid, u64 pc,
u64 *offset) {
static __always_inline enum find_unwind_table_return find_unwind_table(shard_info_t **shard_info, pid_t pid, u64 pc, u64 *offset) {
process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &pid);
// Appease the verifier.
if (proc_info == NULL) {
@@ -493,8 +490,7 @@ static __always_inline enum find_unwind_table_return find_unwind_table(shard_inf
}

// Aggregate the given stacktrace.
static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_tgid, enum stack_walking_method method,
unwind_state_t *unwind_state) {
static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_tgid, enum stack_walking_method method, unwind_state_t *unwind_state) {
u64 zero = 0;
stack_count_key_t stack_key = {0};

@@ -523,8 +519,16 @@ static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_t
stack_key.user_stack_id = 0;

// Insert stack.
bpf_map_update_elem(&dwarf_stack_traces, &stack_hash, &unwind_state->stack, BPF_ANY);
int err = bpf_map_update_elem(&dwarf_stack_traces, &stack_hash, &unwind_state->stack, BPF_ANY);
if (err != 0) {
bpf_printk("[error] bpf_map_update_elem with ret: %d", err);
}

} else if (method == STACK_WALKING_METHOD_FP) {
bpf_printk("[info] fp unwinding %d", DISABLE_BPF_HELPER_FP_UNWINDER);
if (DISABLE_BPF_HELPER_FP_UNWINDER) {
return;
}
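// Note: DISABLE_BPF_HELPER_FP_UNWINDER is defined to 1 above, so frame-pointer
// stacks are currently skipped here; set it to 0 to collect them via
// bpf_get_stackid() below.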
int stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
if (stack_id >= 0) {
stack_key.user_stack_id = stack_id;
@@ -591,8 +595,7 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) {
bpf_printk("========== left %llu right %llu", left, right);
u64 table_idx = find_offset_for_pc(unwind_table, unwind_state->ip - offset, left, right);

if (table_idx == BINARY_SEARCH_NOT_FOUND || table_idx == BINARY_SEARCH_SHOULD_NEVER_HAPPEN ||
table_idx == BINARY_SEARCH_EXHAUSTED_ITERATIONS) {
if (table_idx == BINARY_SEARCH_NOT_FOUND || table_idx == BINARY_SEARCH_SHOULD_NEVER_HAPPEN || table_idx == BINARY_SEARCH_EXHAUSTED_ITERATIONS) {
bpf_printk("[error] binary search failed with %llx", table_idx);
return 1;
}
@@ -676,7 +679,7 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) {
// is *always* 8 bytes below the previous stack pointer (the stack grows towards lower addresses).
u64 previous_rip_addr = previous_rsp - 8; // the saved return address is 8 bytes below the previous stack pointer
u64 previous_rip = 0;
int err = bpf_probe_read_user(&previous_rip, 8, (void *)(previous_rip_addr)); // 8 bytes, a whole word in a 64 bits machine
int err = bpf_probe_read_user(&previous_rip, 8, (void *)(previous_rip_addr));
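// Frame layout assumed here (x86-64 System V, illustration added for clarity):
//
//   higher addresses
//     ...                  <- caller's frame
//     [previous_rsp - 8]   <- saved return address (previous_rip), read above
//     ...                  <- callee's frame
//   lower addresses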

if (previous_rip == 0) {
int user_pid = pid_tgid;
@@ -692,9 +695,7 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) {
return 1;
}

bpf_printk("[error] previous_rip should not be zero. This can mean that "
"the read failed, ret=%d while reading @ %llx.",
err, previous_rip_addr);
bpf_printk("[error] previous_rip should not be zero. This can mean that the read failed, ret=%d while reading @ %llx.", err, previous_rip_addr);
BUMP_UNWIND_CATCHALL_ERROR();
return 1;
}
@@ -706,10 +707,7 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) {
} else {
u64 previous_rbp_addr = previous_rsp + found_rbp_offset;
bpf_printk("\t(bp_offset: %d, bp value stored at %llx)", found_rbp_offset, previous_rbp_addr);
int ret = bpf_probe_read_user(&previous_rbp, 8,
(void *)(previous_rbp_addr)); // 8 bytes, a whole word in a 64 bits
// machine

int ret = bpf_probe_read_user(&previous_rbp, 8, (void *)(previous_rbp_addr));
if (ret != 0) {
bpf_printk("[error] previous_rbp should not be zero. This can mean "
"that the read has failed %d.",
@@ -747,9 +745,7 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) {
bpf_printk("======= reached main! =======");
add_stack(ctx, pid_tgid, STACK_WALKING_METHOD_DWARF, unwind_state);
BUMP_UNWIND_SUCCESS();
bpf_printk("yesssss :)");
} else {

int user_pid = pid_tgid;
process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &user_pid);
if (proc_info == NULL) {
@@ -773,8 +769,7 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) {
bpf_tail_call(ctx, &programs, 0);
}

// We couldn't walk enough frames
bpf_printk("nooooooo :(");
// We couldn't get the whole stacktrace.
BUMP_UNWIND_TRUNCATED();
return 0;
}
@@ -817,14 +812,16 @@ int profile_cpu(struct bpf_perf_event_data *ctx) {
int user_pid = pid_tgid;
int user_tgid = pid_tgid >> 32;

if (user_pid == 0)
if (user_pid == 0) {
return 0;
}

if (config.debug) {
// very noisy
// This can be very noisy
// bpf_printk("debug mode enabled, make sure you specified process name");
if (!is_debug_enabled_for_pid(user_tgid))
if (!is_debug_enabled_for_pid(user_tgid)) {
return 0;
}
}

bool has_unwind_info = has_unwind_information(user_pid);
1 change: 1 addition & 0 deletions dnf
