sched_ext: Add scx_example_cgfifo BPF scheduler
Add the scx_example_cgfifo BPF scheduler, which uses a draft implementation
of BPF rbtree to implement flattened hierarchical scheduling among cgroups
and performs simple FIFO scheduling within each cgroup.

One shortcoming of the flattened hierarchy implementation is that it can be
tricked by a thundering herd among cgroups. IOW, if many cgroups wake up at
the same time, their subtrees may consume more CPU time than configured.
Outside such situations, however, it implements hierarchical weight-based
control at very low overhead.
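
For illustration only (a minimal sketch, not code from this commit; all
names in it are hypothetical), the flattening can be thought of as
collapsing each level's share of its parent into a single effective
weight per cgroup, so a single queue can order all cgroups without
walking the hierarchy on every pick:

  struct cg {
          struct cg *parent;
          u64 weight;             /* this cgroup's weight */
          u64 sibling_sum;        /* sum of sibling weights at this level */
  };

  /* effective weight = product of each level's share of its parent */
  static u64 effective_weight(const struct cg *cg)
  {
          u64 w = 1ULL << 16;     /* fixed-point 1.0 */

          for (; cg->parent; cg = cg->parent)
                  w = w * cg->weight / cg->sibling_sum;
          return w;
  }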

The benchmark setup:

* AMD Ryzen 7 3800X 8-core machine w/ 32G memory

* Running two instances of apache, both serving a bash CGI script which
  invokes date, df and grep once each, then runs sha1sum on a 287k file.

* Another machine running wrk (https://github.com/wg/wrk) to generate load.
  `wrk -t16 -c60 -d30s http://TARGETIP/cgi-bin/example.sh`

Results (RPS):

1. No CPU control, single apache instance.

                   RUN1     RUN2     RUN3      AVG
          CFS:  3620.49  3683.45  3664.30  3656.08
       CGFIFO:  3813.13  3738.71  3764.70  3772.18 (+3.2%)

2. CPU control w/ 4-level nesting, two apache instances with weights 200 and 100

                   RUN1     RUN2     RUN3      AVG
      CFS-100:  1570.99  1604.35  1555.19  1576.84
      CFS-200:  1871.55  1892.61  1842.94  1869.03
      CFS-SUM:  3442.54  3496.96  3398.13  3445.87

   CGFIFO-100:  1430.37  1484.21  1386.95  1433.84
   CGFIFO-200:  2381.34  2321.86  2407.51  2370.24
   CGFIFO-SUM:  3811.71  3806.07  3794.46  3804.08 (+10.4%)

NOT_SIGNED_OFF
htejun committed Nov 30, 2022
1 parent 25e102d commit f2fcd31
Showing 5 changed files with 1,451 additions and 0 deletions.
177 changes: 177 additions & 0 deletions tools/sched_ext/Makefile
@@ -0,0 +1,177 @@
# SPDX-License-Identifier: GPL-2.0
include ../build/Build.include
include ../scripts/Makefile.arch
include ../scripts/Makefile.include

ifneq ($(LLVM),)
ifneq ($(filter %/,$(LLVM)),)
LLVM_PREFIX := $(LLVM)
else ifneq ($(filter -%,$(LLVM)),)
LLVM_SUFFIX := $(LLVM)
endif

CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi
CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu
CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl
CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu
CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu
CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu
CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu
CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu
CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu
CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH))

ifeq ($(CROSS_COMPILE),)
ifeq ($(CLANG_TARGET_FLAGS),)
$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk)
else
CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS)
endif # CLANG_TARGET_FLAGS
else
CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
endif # CROSS_COMPILE

CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as
else
CC := $(CROSS_COMPILE)gcc
endif # LLVM

CURDIR := $(abspath .)
TOOLSDIR := $(abspath ..)
LIBDIR := $(TOOLSDIR)/lib
BPFDIR := $(LIBDIR)/bpf
TOOLSINCDIR := $(TOOLSDIR)/include
BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
APIDIR := $(TOOLSINCDIR)/uapi
GENDIR := $(abspath ../../include/generated)
GENHDR := $(GENDIR)/autoconf.h

SCRATCH_DIR := $(CURDIR)/tools
BUILD_DIR := $(SCRATCH_DIR)/build
INCLUDE_DIR := $(SCRATCH_DIR)/include
BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
ifneq ($(CROSS_COMPILE),)
HOST_BUILD_DIR := $(BUILD_DIR)/host
HOST_SCRATCH_DIR := host-tools
HOST_INCLUDE_DIR := $(HOST_SCRATCH_DIR)/include
else
HOST_BUILD_DIR := $(BUILD_DIR)
HOST_SCRATCH_DIR := $(SCRATCH_DIR)
HOST_INCLUDE_DIR := $(INCLUDE_DIR)
endif
HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool

VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
$(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
../../vmlinux \
/sys/kernel/btf/vmlinux \
/boot/vmlinux-$(shell uname -r)
VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
ifeq ($(VMLINUX_BTF),)
$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
endif

BPFTOOL ?= $(DEFAULT_BPFTOOL)

ifneq ($(wildcard $(GENHDR)),)
GENFLAGS := -DHAVE_GENHDR
endif

CFLAGS += -g -O2 -rdynamic -Wall -Werror $(GENFLAGS) \
-I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
-I$(TOOLSINCDIR) -I$(APIDIR)

# Silence some warnings when compiled with clang
ifneq ($(LLVM),)
CFLAGS += -Wno-unused-command-line-argument
endif

LDFLAGS = -lelf -lz

IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')

# Get Clang's default includes on this system, as opposed to those seen by
# '-target bpf'. This fixes "missing" files on some architectures/distros,
# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
define get_sys_includes
$(shell $(1) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
endef

BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \
-I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \
-I../../include \
$(call get_sys_includes,$(CLANG)) \
-Wno-compare-distinct-pointer-types

all: scx_example_cgfifo

# sort removes libbpf duplicates when not cross-building
MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf	\
	     $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids	\
	     $(INCLUDE_DIR))

$(MAKE_DIRS):
	$(call msg,MKDIR,,$@)
	$(Q)mkdir -p $@

$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)	\
	   $(APIDIR)/linux/bpf.h				\
	   | $(BUILD_DIR)/libbpf
	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
		EXTRA_CFLAGS='-g -O0'				\
		DESTDIR=$(SCRATCH_DIR) prefix= all install_headers

$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \
		    $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
	$(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR)		\
		ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD)	\
		EXTRA_CFLAGS='-g -O0'				\
		OUTPUT=$(HOST_BUILD_DIR)/bpftool/		\
		LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/		\
		LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/		\
		prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin

$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
ifeq ($(VMLINUX_H),)
	$(call msg,GEN,,$@)
	$(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
else
	$(call msg,CP,,$@)
	$(Q)cp "$(VMLINUX_H)" $@
endif

%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.h	\
	 | $(BPFOBJ)
	$(call msg,CLNG-BPF,,$@)
	$(Q)$(CLANG) $(BPF_CFLAGS) -O2 -target bpf -c $< -mcpu=v3 -o $@

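# The repeated `bpftool gen object` invocations below link the BPF object,
# then re-link the result; diffing the last two outputs verifies that the
# linking has converged before the skeleton and subskeleton headers are
# generated.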
%.skel.h: %.bpf.o $(BPFTOOL)
	$(call msg,GEN-SKEL,,$@)
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
	$(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $@
	$(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $(@:.skel.h=.subskel.h)

scx_example_cgfifo.bpf.o: scx_example_cgfifo.h

scx_example_cgfifo: scx_example_cgfifo.c scx_example_cgfifo.skel.h
	$(CC) $(CFLAGS) -c $< -o $@.o
	$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)

clean:
	rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)
	rm -f *.o *.skel.h *.subskel.h
	rm -f scx_example_cgfifo

.PHONY: all clean
111 changes: 111 additions & 0 deletions tools/sched_ext/scx_common.h
@@ -0,0 +1,111 @@
#ifndef __SCHED_EXT_COMMON_H
#define __SCHED_EXT_COMMON_H

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <stdbool.h>
#include <errno.h>

#warning "remove the following once bpf can handle 64bit enums"
#define SCX_SLICE_INF 0xffffffffffffffffLLU
#define SCX_DQ_FLAG_BUILTIN (1LLU << 63)
#define SCX_DQ_FLAG_LOCAL_ON (1LLU << 61)
#define SCX_DQ_INVALID (SCX_DQ_FLAG_BUILTIN | 0)
#define SCX_DQ_GLOBAL (SCX_DQ_FLAG_BUILTIN | 1)
#define SCX_DQ_LOCAL (SCX_DQ_FLAG_BUILTIN | 2)
#define SCX_DQ_LOCAL_ON (SCX_DQ_FLAG_BUILTIN | SCX_DQ_FLAG_LOCAL_ON)
#define SCX_ENQ_PREEMPT (1LLU << 32)
#define SCX_ENQ_REENQ (1LLU << 40)
#define SCX_ENQ_LAST (1LLU << 41)
#define SCX_ENQ_SCD_LOCAL (1LLU << 42)

extern s32 scx_bpf_create_dq(u64 dq_id, s32 node) __ksym;
extern s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) __ksym;
extern u32 scx_bpf_dispatch_nr_slots(void) __ksym;
extern s32 scx_bpf_dispatch(struct task_struct *p, u64 dq_id, u64 enq_flags) __ksym;
extern bool scx_bpf_consume(u64 dq_id) __ksym;
extern void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
extern s32 scx_bpf_dq_nr_queued(u64 dq_id) __ksym;
extern bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
extern s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym;
extern bool scx_bpf_has_idle_cpus(void) __ksym;
extern s32 scx_bpf_destroy_dq(u64 dq_id) __ksym;
extern bool scx_bpf_task_running(const struct task_struct *p) __ksym;
extern s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
extern const struct cpumask *scx_bpf_task_cpumask(const struct task_struct *p) __ksym;
extern struct cgroup *scx_bpf_task_cgroup(const struct task_struct *p) __ksym;
extern struct task_struct *scx_bpf_find_task_by_pid(s32 pid) __ksym;
extern void scx_bpf_reenqueue_local(void) __ksym;

extern s32 scx_bpf_pick_idle_cpu_untyped(unsigned long cpus_allowed) __ksym;
extern bool scx_bpf_has_idle_cpus_among(const struct cpumask *cpus_allowed) __ksym;
extern bool scx_bpf_has_idle_cpus_among_untyped(unsigned long cpus_allowed) __ksym;
extern s32 scx_bpf_cpumask_test_cpu(s32 cpu, const struct cpumask *cpumask) __ksym;
extern s32 scx_bpf_cpumask_first(const struct cpumask *cpus_allowed) __ksym;
extern s32 scx_bpf_cpumask_first_untyped(unsigned long cpus_allowed) __ksym;
extern bool scx_bpf_cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) __ksym;

extern int extl_bpf_init(u32 le_data_size_req, u32 sq_data_size_req) __ksym;
extern int extl_bpf_enable(void) __ksym;
extern struct extl_sq *extl_bpf_create_sq(u64 id) __ksym;
extern void extl_bpf_set_task_sq(struct task_struct *p, struct extl_sq *sq) __ksym;
extern struct extl_sq *extl_bpf_task_sq(struct task_struct *p) __ksym;
extern struct extl_sq *extl_bpf_find_sq(u64 id) __ksym;
extern void extl_bpf_sq_lock(struct extl_sq *sq) __ksym;
extern void extl_bpf_sq_lock_by_task(struct task_struct *p) __ksym;
extern void extl_bpf_sq_unlock(void) __ksym;
extern void extl_bpf_sq_lock_double(struct extl_sq *sq0, struct extl_sq *sq1) __ksym;
extern void extl_bpf_sq_lock_double_by_task(struct task_struct *p, struct extl_sq *sq) __ksym;
extern void extl_bpf_sq_unlock_double(void) __ksym;
extern void extl_bpf_enqueue_task(struct task_struct *p, u64 key) __ksym;
extern bool extl_bpf_dequeue_task(struct task_struct *p) __ksym;
extern void extl_bpf_dispatch_dequeue(struct task_struct *p) __ksym;
extern struct task_struct *extl_bpf_sq_first_task(struct extl_sq *sq) __ksym;

#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_EXITING 0x00000004
#define CLOCK_MONOTONIC 1

#define BPF_STRUCT_OPS(name, args...)	\
	SEC("struct_ops/"#name)		\
	BPF_PROG(name, ##args)
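
/*
 * Illustrative usage (a hypothetical sketch; the actual cgfifo callbacks
 * live in scx_example_cgfifo.bpf.c):
 *
 *	void BPF_STRUCT_OPS(cgfifo_enqueue, struct task_struct *p,
 *			    u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SCX_DQ_GLOBAL, enq_flags);
 *	}
 */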

/**
* MEMBER_VPTR - Obtain the verified pointer to a struct or array member
* @base: struct or array to index
* @member: dereferenced member (e.g. ->field, [idx0][idx1], ...)
*
* The verifier often gets confused by the instruction sequence the compiler
* generates for indexing struct fields or arrays. This macro forces the
* compiler to generate a code sequence which first calculates the byte
* offset, checks it against the struct or array size, and then adds the
* byte offset to the base to generate the member pointer, which helps the
* verifier.
*
* Ideally, we want to abort if the calculated offset is out-of-bounds. However,
* BPF currently doesn't support abort, so evaluate to NULL instead. The caller
* must check for NULL and take appropriate action to appease the verifier. To
* avoid confusing the verifier, it's best to check for NULL and dereference
* immediately.
*
*	vptr = MEMBER_VPTR(my_array, [i][j]);
*	if (!vptr)
*		return error;
*	*vptr = new_value;
*/
#define MEMBER_VPTR(base, member) (typeof(base member) *)({	\
	u64 __base = (u64)base;					\
	u64 __addr = (u64)&(base member) - __base;		\
	asm volatile (						\
		"if %0 <= %[max] goto +2\n"			\
		"%0 = 0\n"					\
		"goto +1\n"					\
		"%0 += %1\n"					\
		: "+r"(__addr)					\
		: "r"(__base),					\
		  [max]"i"(sizeof(base) - sizeof(base member)));\
	__addr;							\
})

#endif /* __SCHED_EXT_COMMON_H */
