Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit a8d9e7d

Browse files
Control printing information using NEURAL_SPEED_VERBOSE (#26)
1 parent 9be307f commit a8d9e7d

File tree

20 files changed

+38
-55
lines changed

20 files changed

+38
-55
lines changed

.github/workflows/scripts/models/cpp_graph_inference.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ function main() {
136136
## run inference
137137
export LANG=en_US.UTF-8
138138
export LC_ALL=en_US.UTF-8
139-
OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
139+
NEURAL_SPEED_VERBOSE=1 OMP_NUM_THREADS=$(($cores_per_instance * 1)) numactl -m 0 -C 0-$(($cores_per_instance * 1 - 1)) \
140140
$infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p "$prompt" 2>&1 | tee ${WORKING_DIR}/${logs_file} || true &
141141
monitor
142142
done

CMakeLists.txt

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,7 @@ if(NE_BUILD_TESTS)
7676
add_compile_definitions(NE_BUILD_TESTS)
7777
endif()
7878

79-
option(NE_PROFILING "neural_engine: use Profiling" OFF)
80-
if (NE_PROFILING)
81-
add_compile_definitions(NE_PERF)
82-
endif()
79+
add_compile_definitions(NE_PERF)
8380
option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF)
8481
if (NE_BEAM_SEARCH_VERBOSE)
8582
add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON)

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,3 +434,12 @@ stopping_criteria = StoppingCriteriaList(
434434

435435
outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria)
436436
```
437+
438+
### 6. Verbose Mode
439+
440+
Enable verbose mode and control tracing information using the `NEURAL_SPEED_VERBOSE` environment variable.
441+
442+
Available modes:
443+
- 0: Print all tracing information — comprehensive output, including both evaluation time and operator profiling.
444+
- 1: Print evaluation time — the time taken for each evaluation round.
445+
- 2: Profile individual operators — identifies performance bottlenecks within the model.

neural_speed/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
197197
streamer.end()
198198

199199
self.generate_round += 1
200+
if os.getenv("NEURAL_SPEED_VERBOSE") and os.getenv("NEURAL_SPEED_VERBOSE") in ["1", "0"]:
201+
self.model.print_time()
200202
return ret
201203

202204
def is_token_end(self):

neural_speed/application/main_pybind.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class Model {
9090
curr_input_ids.resize(params.batch_size);
9191
generate_count = 0;
9292
}
93-
93+
void print_time() { model_print_timings(ctx); }
9494
static size_t np_bestla_qpack(py::array_t<int8_t> src_w, py::array_t<float> src_scales, py::array_t<int8_t> src_zeros,
9595
py::array_t<int32_t> g_idx, py::array_t<int8_t> dst, const std::string& weight_dtype,
9696
const std::string& alg, int group_size, const std::string& scale_dtype,
@@ -689,5 +689,6 @@ PYBIND11_MODULE(qwen_cpp, m)
689689
py::arg("src_w"), py::arg("dst"), py::arg("weight_dtype") = "int4", py::arg("alg") = "sym",
690690
py::arg("group_size") = 32, py::arg("scale_dtype") = "fp32", py::arg("compute_dtype") = "int8",
691691
py::arg("threads") = 8)
692+
.def("print_time", &Model::print_time)
692693
.def("reinit", &Model::reinit);
693694
}

neural_speed/application/main_run.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,9 @@ int main(int argc, char** argv) { // NOLINT
765765
model_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
766766
}
767767

768-
model_print_timings(ctx);
768+
if (ns_log_level() == 0 || ns_log_level() == 1) {
769+
model_print_timings(ctx);
770+
}
769771
model_free(ctx);
770772

771773
return 0;

neural_speed/models/baichuan/baichuan.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -282,12 +282,9 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
282282
ne_build_forward_expand(&gf, inpL);
283283
ne_graph_compute(ctx0, &gf);
284284

285-
#ifdef NE_PERF
286-
bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
287-
if (engine_profiling_) {
285+
if (ns_log_level() == 0 || ns_log_level() == 2) {
288286
ne_graph_profiling(&gf);
289287
}
290-
#endif
291288

292289
// update kv token count
293290
lctx.model.kv_self.n = n_past + N;

neural_speed/models/bloom/bloom.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -260,12 +260,9 @@ static bool bloom_model_eval_internal(model_context* ctx, const model_input* inp
260260
ne_build_forward_expand(&gf, inpL);
261261
ne_graph_compute(ctx0, &gf);
262262

263-
#ifdef NE_PERF
264-
bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
265-
if (engine_profiling_) {
263+
if (ns_log_level() == 0 || ns_log_level() == 2) {
266264
ne_graph_profiling(&gf);
267265
}
268-
#endif
269266

270267
// update kv token count
271268
lctx.model.kv_self.n = n_past + N;

neural_speed/models/chatglm/chatglm.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,12 +288,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
288288
ne_build_forward_expand(&gf, inpL);
289289
ne_graph_compute(ctx0, &gf);
290290

291-
#ifdef NE_PERF
292-
bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
293-
if (engine_profiling_) {
291+
if (ns_log_level() == 0 || ns_log_level() == 2) {
294292
ne_graph_profiling(&gf);
295293
}
296-
#endif
297294

298295
// update kv token count
299296
lctx.model.kv_self.n = n_past + N;

neural_speed/models/chatglm/chatglm2.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -314,12 +314,9 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
314314
ne_build_forward_expand(&gf, inpL);
315315
ne_graph_compute(ctx0, &gf);
316316

317-
#ifdef NE_PERF
318-
bool engine_profiling_ = (getenv("ENGINE_PROFILING") != NULL);
319-
if (engine_profiling_) {
317+
if (ns_log_level() == 0 || ns_log_level() == 2) {
320318
ne_graph_profiling(&gf);
321319
}
322-
#endif
323320

324321
// update kv token count
325322
lctx.model.kv_self.n = n_cached;

0 commit comments

Comments
 (0)