Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Per-pipeline-invocation profiling #8153

Merged
merged 15 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/BoundsInference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1388,6 +1388,13 @@ Stmt bounds_inference(Stmt s,
Expr marker = Call::make(Int(32), Call::skip_stages_marker, {}, Call::Intrinsic);
s = Block::make(Evaluate::make(marker), s);

if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) {
// Add a note in the IR for what profiling should cover, so that it doesn't
// include bounds queries as pipeline executions.
marker = Call::make(Int(32), Call::profiling_enable_instance_marker, {}, Call::Intrinsic);
s = Block::make(Evaluate::make(marker), s);
}

// Add a note in the IR for where assertions on input images
// should go. Those are handled by a later lowering pass.
marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic);
Expand Down
4 changes: 2 additions & 2 deletions src/CodeGen_Internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ bool function_takes_user_context(const std::string &name) {
"halide_print",
"halide_profiler_memory_allocate",
"halide_profiler_memory_free",
"halide_profiler_pipeline_start",
"halide_profiler_pipeline_end",
"halide_profiler_instance_start",
"halide_profiler_instance_end",
"halide_profiler_stack_peak_update",
"halide_spawn_thread",
"halide_device_release",
Expand Down
1 change: 1 addition & 0 deletions src/IR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,7 @@ const char *const intrinsic_op_names[] = {
"mux",
"popcount",
"prefetch",
"profiling_enable_instance_marker",
"promise_clamped",
"random",
"register_destructor",
Expand Down
1 change: 1 addition & 0 deletions src/IR.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ struct Call : public ExprNode<Call> {
mux,
popcount,
prefetch,
profiling_enable_instance_marker,
promise_clamped,
random,
register_destructor,
Expand Down
229 changes: 145 additions & 84 deletions src/Profiling.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/Profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Function;
* storage flattening, but after all bounds inference.
*
*/
Stmt inject_profiling(Stmt, const std::string &, const std::map<std::string, Function> &env);
Stmt inject_profiling(const Stmt &, const std::string &, const std::map<std::string, Function> &env);

} // namespace Internal
} // namespace Halide
Expand Down
130 changes: 85 additions & 45 deletions src/runtime/HalideRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -1246,6 +1246,9 @@ enum halide_error_code_t {
/** A factor used to split a loop was discovered to be zero or negative at
* runtime. */
halide_error_code_split_factor_not_positive = -46,

/** Profiling failed for a pipeline invocation. */
halide_error_code_cannot_profile_pipeline = -47,
};

/** Halide calls the functions below on various error conditions. The
Expand Down Expand Up @@ -1829,7 +1832,7 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_func_stats {
/** Per-pipeline state tracked by the sampling profiler. These exist
* in a linked list. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
/** Total time spent inside this pipeline (in nanoseconds) */
/** Total time spent in this pipeline (in nanoseconds) */
uint64_t time;

/** The current memory allocation of funcs in this pipeline. */
Expand Down Expand Up @@ -1858,9 +1861,6 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
/** The number of funcs in this pipeline. */
int num_funcs;

/** An internal base id used to identify the funcs in this pipeline. */
int first_func_id;

/** The number of times this pipeline has been run. */
int runs;

Expand All @@ -1871,48 +1871,98 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
int num_allocs;
};

/** The global state of the profiler. */
/** Per-invocation-of-a-pipeline state. Lives on the stack of the Halide
* code. Exists in a doubly-linked list so that it can be cleanly
* removed. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_instance_state {
/** Time billed to funcs in this instance by the sampling thread. */
uint64_t billed_time;

struct halide_profiler_state {
/** Guards access to the fields below. If not locked, the sampling
* profiler thread is free to modify things below (including
* reordering the linked list of pipeline stats). */
struct halide_mutex lock;
/** Wall clock time of the start of the instance. */
uint64_t start_time;

/** The amount of time the profiler thread sleeps between samples
* in milliseconds. Defaults to 1 */
int sleep_time;
/** The current memory allocation of funcs in this instance. */
uint64_t memory_current;

/** An internal id used for bookkeeping. */
int first_free_id;
/** The peak memory allocation of funcs in this instance. */
uint64_t memory_peak;

/** The total memory allocation of funcs in this instance. */
uint64_t memory_total;

/** The average number of thread pool worker threads doing useful
* work while computing this instance. */
uint64_t active_threads_numerator, active_threads_denominator;

/** A pointer to the next running instance, so that the running instances
* can exist in a linked list. */
struct halide_profiler_instance_state *next;

/** A pointer to the address of the next pointer of the previous instance,
* so that this can be removed from the linked list when the instance
* terminates. */
struct halide_profiler_instance_state **prev_next;

/** Information shared across all instances. The stats above are merged into
* it when the instance is retired. */
struct halide_profiler_pipeline_stats *pipeline_stats;

/** An array containing states for each Func in this instance of this pipeline. */
struct halide_profiler_func_stats *funcs;

/** The id of the current running Func. Set by the pipeline, read
* periodically by the profiler thread. */
int current_func;

/** The number of threads currently doing work. */
/** The number of threads currently doing work on this pipeline instance. */
int active_threads;

/** The number of samples taken by this instance. */
int samples;

/** The total number of memory allocation of funcs in this instance. */
int num_allocs;

/** Whether or not this instance should count towards pipeline
* statistics. */
int should_collect_statistics;
};

/** The global state of the profiler. */
struct halide_profiler_state {
/** Guards access to the fields below. If not locked, the sampling
* profiler thread is free to modify things below (including
* reordering the linked list of pipeline stats). */
struct halide_mutex lock;

/** A linked list of stats gathered for each pipeline. */
struct halide_profiler_pipeline_stats *pipelines;

/** Retrieve remote profiler state. Used so that the sampling
* profiler can follow along with execution that occurs elsewhere,
* e.g. on a DSP. If null, it reads from the int above instead. */
void (*get_remote_profiler_state)(int *func, int *active_workers);

/** Sampling thread reference to be joined at shutdown. */
struct halide_thread *sampling_thread;
};

/** Profiler func ids with special meanings. */
enum {
/// current_func takes on this value when not inside Halide code
halide_profiler_outside_of_halide = -1,
/// Set current_func to this value to tell the profiling thread to
/// halt. It will start up again next time you run a pipeline with
/// profiling enabled.
halide_profiler_please_stop = -2
/** The running instances of Halide pipelines. */
struct halide_profiler_instance_state *instances;

/** If this callback is defined, the profiler asserts that there is a single
* live instance, and then uses it to get the current func and number of
* active threads instead of reading the fields in the instance. This is used
* so that the profiler can follow along with execution that occurs
* elsewhere (e.g. on an accelerator). */
void (*get_remote_profiler_state)(int *func, int *active_workers);

/** The amount of time the profiler thread sleeps between samples in
* microseconds. Defaults to 1000. To change it call
* halide_profiler_get_state and mutate this field. */
int sleep_time;
abadams marked this conversation as resolved.
Show resolved Hide resolved

/** Set to 1 when you want the profiler to wait for all running instances to
* finish and then stop gracefully. */
int shutdown;
};

/** Get a pointer to the global profiler state for programmatic
Expand All @@ -1930,34 +1980,24 @@ extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state
* accurate time interval if desired. */
extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t);

/** Reset profiler state cheaply. May leave threads running or some
* memory allocated but all accumulated statistics are reset.
* WARNING: Do NOT call this method while any halide pipeline is
* running; halide_profiler_memory_allocate/free and
* halide_profiler_stack_peak_update update the profiler pipeline's
* state without grabbing the global profiler state's lock. */
/** Reset profiler state cheaply. May leave threads running or some memory
* allocated but all accumulated statistics are reset. Blocks until all running
* profiled Halide pipelines exit. */
extern void halide_profiler_reset(void);

/** Reset all profiler state.
* WARNING: Do NOT call this method while any halide pipeline is
* running; halide_profiler_memory_allocate/free and
* halide_profiler_stack_peak_update update the profiler pipeline's
* state without grabbing the global profiler state's lock. */
void halide_profiler_shutdown(void);
/** Reset all profiler state. Blocks until all running profiled Halide
* pipelines exit. */
extern void halide_profiler_shutdown(void);

/** Print out timing statistics for everything run since the last
* reset. Also happens at process exit. */
extern void halide_profiler_report(void *user_context);

/** For timer based profiling, this routine starts the timer chain running.
* halide_profiler_get_state can be called to get the current timer interval.
*/
extern void halide_start_timer_chain(void);
/** These routines are called to temporarily disable and then reenable
* timer interrupts for profiling */
* the profiler. */
//@{
extern void halide_disable_timer_interrupt(void);
extern void halide_enable_timer_interrupt(void);
extern void halide_profiler_lock(struct halide_profiler_state *);
extern void halide_profiler_unlock(struct halide_profiler_state *);
//@}

/// \name "Float16" functions
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/fuchsia_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
return zx_clock_get_monotonic() - halide_reference_clock;
}

WEAK void halide_sleep_ms(void *user_context, int ms) {
zx_nanosleep(zx_deadline_after(ms * 1000));
WEAK void halide_sleep_us(void *user_context, int us) {
zx_nanosleep(zx_deadline_after(us));
}
}
19 changes: 16 additions & 3 deletions src/runtime/hexagon_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,11 +478,22 @@ WEAK int halide_hexagon_run(void *user_context,
// get_remote_profiler_func to retrieve the current
// func. Otherwise leave it alone - the cost of remote running
// will be billed to the calling Func.
halide_profiler_state *s = halide_profiler_get_state();
if (remote_poll_profiler_state) {
halide_profiler_get_state()->get_remote_profiler_state = get_remote_profiler_state;
halide_profiler_lock(s);
const halide_profiler_instance_state *instance = s->instances;
// The instance that called this runtime function should be registered.
halide_abort_if_false(user_context, instance);
if (instance->next) {
halide_profiler_unlock(s);
error(user_context) << "Hexagon: multiple simultaneous profiled pipelines is unsupported.";
return halide_error_code_cannot_profile_pipeline;
}
s->get_remote_profiler_state = get_remote_profiler_state;
if (remote_profiler_set_current_func) {
remote_profiler_set_current_func(halide_profiler_get_state()->current_func);
remote_profiler_set_current_func(instance->current_func);
}
halide_profiler_unlock(s);
}

// Call the pipeline on the device side.
Expand All @@ -498,7 +509,9 @@ WEAK int halide_hexagon_run(void *user_context,
return halide_error_code_generic_error;
}

halide_profiler_get_state()->get_remote_profiler_state = nullptr;
halide_profiler_lock(s);
s->get_remote_profiler_state = nullptr;
halide_profiler_unlock(s);

#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
Expand Down
15 changes: 8 additions & 7 deletions src/runtime/hexagon_remote/qurt/halide_remote.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,23 +419,24 @@ int halide_hexagon_remote_release_library(handle_t module_ptr) {
return 0;
}

/** Return a pointer to a single function-local static
 * halide_profiler_instance_state. Every call yields the same object, so
 * callers that read or write its fields (e.g. current_func,
 * active_threads) all observe the same instance. */
halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
static halide_profiler_instance_state hvx_profiler_instance;
return &hvx_profiler_instance;
}

int halide_hexagon_remote_poll_profiler_state(int *func, int *threads) {
// Increase the current thread priority to match working threads priorities,
// so profiler can access the remote state without extra latency.
qurt_thread_t current_thread_id = qurt_thread_get_id();
qurt_thread_set_priority(current_thread_id, 100);

*func = halide_profiler_get_state()->current_func;
*threads = halide_profiler_get_state()->active_threads;
*func = halide_hexagon_remote_profiler_get_global_instance()->current_func;
*threads = halide_hexagon_remote_profiler_get_global_instance()->active_threads;
return 0;
}
int halide_hexagon_remote_profiler_set_current_func(int current_func) {
halide_profiler_get_state()->current_func = current_func;
halide_hexagon_remote_profiler_get_global_instance()->current_func = current_func;
return 0;
}
halide_profiler_state *halide_profiler_get_state() {
static halide_profiler_state hvx_profiler_state;
return &hvx_profiler_state;
}

} // extern "C"
6 changes: 3 additions & 3 deletions src/runtime/hexagon_remote/qurt/sim_remote.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,12 @@ int release_library(handle_t module_ptr) {
}

extern "C" {
halide_profiler_state profiler_state;
halide_profiler_instance_state profiler_state;
int *profiler_current_func_addr = &profiler_state.current_func;
}

halide_profiler_state *halide_profiler_get_state() {
return (halide_profiler_state *)(&profiler_state);
halide_profiler_instance_state *halide_profiler_get_state() {
return (halide_profiler_instance_state *)(&profiler_state);
}

extern "C" {
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/linux_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
}

extern int usleep(int);
WEAK void halide_sleep_ms(void *user_context, int ms) {
usleep(ms * 1000);
WEAK void halide_sleep_us(void *user_context, int us) {
usleep(us);
}
}
4 changes: 2 additions & 2 deletions src/runtime/osx_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
}

extern int usleep(int);
WEAK void halide_sleep_ms(void *user_context, int ms) {
usleep(ms * 1000);
WEAK void halide_sleep_us(void *user_context, int us) {
usleep(us);
}
}
4 changes: 2 additions & 2 deletions src/runtime/posix_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
}

extern int usleep(int);
WEAK void halide_sleep_ms(void *user_context, int ms) {
usleep(ms * 1000);
WEAK void halide_sleep_us(void *user_context, int us) {
usleep(us);
}
}
2 changes: 1 addition & 1 deletion src/runtime/posix_timer_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ WEAK extern "C" void halide_start_timer_chain() {
halide_profiler_state *s = halide_profiler_get_state();
itimerval timer_state;
timer_state.it_interval.tv_sec = 0;
timer_state.it_interval.tv_usec = s->sleep_time * 1000.0;
timer_state.it_interval.tv_usec = s->sleep_time;
timer_state.it_value = timer_state.it_interval;

signal(SIGPROF, &profiler_handler);
Expand Down
Loading
Loading