From bf1f59298f20f74959f827aeeae4b0c5b65e1c4e Mon Sep 17 00:00:00 2001
From: ganler
Date: Tue, 27 Sep 2022 18:29:34 -0700
Subject: [PATCH] add note to xla[gpu] under crash-safe mode

---
 doc/known-issues.md         | 11 +++++++++++
 doc/known_issues.md         |  9 ---------
 nnsmith/backends/factory.py |  3 +++
 nnsmith/cli/fuzz.py         | 17 ++++++++++++++++-
 4 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 doc/known-issues.md
 delete mode 100644 doc/known_issues.md

diff --git a/doc/known-issues.md b/doc/known-issues.md
new file mode 100644
index 00000000..171b5320
--- /dev/null
+++ b/doc/known-issues.md
@@ -0,0 +1,11 @@
+## Incompatibility of TensorFlow-GPU with fork-based crash safety
+
+We currently enable `fuzz.crash_safe=true` by default, which runs compilation & execution in a forked process as a sandbox to catch crashes and timeouts. However, the CUDA runtime is not compatible with fork. In TensorFlow, the symptom is a crash in the forked subprocess:
+
+```txt
+F tensorflow/stream_executor/cuda/cuda_driver.cc:219] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
+```
+
+- For `tflite` this is fine, as it does not require the GPU; `nnsmith.fuzz` sets `CUDA_VISIBLE_DEVICES=-1` at startup;
+- For `xla` it is more of a headache: for now, manually set `fuzz.crash_safe=false` for fuzzing and allow it to crash;
+- We are tracking this [issue](https://github.com/tensorflow/tensorflow/issues/57877) in TensorFlow. If it cannot be resolved upstream soon, we will likely fix it by executing TensorFlow models in a separate process.
diff --git a/doc/known_issues.md b/doc/known_issues.md
deleted file mode 100644
index dfea772f..00000000
--- a/doc/known_issues.md
+++ /dev/null
@@ -1,9 +0,0 @@
-## TensorFlow Lite CUDA Init Error in Crash-safe Mode
-
-If we run `tflite` in the fuzzing loop with `fuzz.crash_safe=true`, you may encounter tons of crashes of:
-
-```txt
-F tensorflow/stream_executor/cuda/cuda_driver.cc:219] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
-```
-
-It is temporarily "fixed" by setting environment variable `CUDA_VISIBLE_DEVICES=-1` if we found cuda is not used in the fuzzing loop. Nevertheless, this should be a TensorFlow bug that needs to be fixed.
diff --git a/nnsmith/backends/factory.py b/nnsmith/backends/factory.py
index 00109a19..cfd1c375 100644
--- a/nnsmith/backends/factory.py
+++ b/nnsmith/backends/factory.py
@@ -135,14 +135,17 @@ def checked_compile_and_exec(
 
         def crash_safe_compile_exec(sdict):
             try:
+                CORE_LOG.debug(f"[FORK] Compiling.")
                 bug_or_exec = self.checked_compile(testcase)
                 if isinstance(bug_or_exec, BugReport):
                     sdict["symptom"] = bug_or_exec.symptom
                     sdict["log"] = bug_or_exec.log
                     return
 
+                CORE_LOG.debug(f"[FORK] Executing.")
                 sdict["stage"] = Stage.EXECUTION
                 bug_or_result = self.checked_exec(bug_or_exec, testcase)
+                CORE_LOG.debug(f"[FORK] Done.")
                 if isinstance(bug_or_result, BugReport):
                     sdict["symptom"] = bug_or_result.symptom
                     sdict["log"] = bug_or_result.log
diff --git a/nnsmith/cli/fuzz.py b/nnsmith/cli/fuzz.py
index 85c21132..f38cedda 100644
--- a/nnsmith/cli/fuzz.py
+++ b/nnsmith/cli/fuzz.py
@@ -43,11 +43,26 @@ def __init__(
         # FIXME(@ganler): well-form the fix or report to TF
         # Dirty fix for TFLite on CUDA-enabled systems.
         # If CUDA is not needed, disable them all.
+        cmpwith = cfg["cmp"]["with"]
         if cfg["backend"]["type"] == "tflite" and (
-            cfg["cmp"]["with"] is None or cfg["cmp"]["with"]["target"] != "cuda"
+            cmpwith is None or cmpwith["target"] != "cuda"
         ):
             os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 
+        if (
+            cfg["fuzz"]["crash_safe"]
+            and cfg["backend"]["type"] == "xla"
+            and cfg["backend"]["target"] == "cuda"
+        ) or (
+            cmpwith is not None
+            and cmpwith["type"] == "xla"
+            and cmpwith["target"] == "cuda"
+        ):
+            raise ValueError(
+                "Please set `fuzz.crash_safe=false` for XLA on CUDA. "
+                "Also see https://github.com/ise-uiuc/nnsmith/blob/main/doc/known-issues.md"
+            )
+
         self.crash_safe = bool(cfg["fuzz"]["crash_safe"])
         self.test_timeout = cfg["fuzz"]["test_timeout"]
         if self.test_timeout is not None:
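
For context on what `crash_safe_compile_exec` above runs inside, here is a minimal, self-contained sketch of the fork-based sandbox pattern the patch relies on. It is an illustration under assumed names (`run_sandboxed`, `fn`), not nnsmith's exact implementation: the risky work happens in a forked child that reports back through a managed dict, so a segfault or hang in native code takes down only the child.

```python
# Minimal sketch of fork-based crash-safe execution, assuming a POSIX system
# where the "fork" start method is available. Names are illustrative.
import multiprocessing as mp


def run_sandboxed(fn, timeout_s=10):
    """Run `fn` in a forked child; the parent survives crashes and hangs."""
    ctx = mp.get_context("fork")  # fork is what conflicts with an initialized CUDA runtime
    with ctx.Manager() as manager:
        sdict = manager.dict(result=None, log=None)

        def child(sdict):
            try:
                sdict["result"] = fn()
            except Exception as e:  # Python-level errors are reported back normally
                sdict["log"] = str(e)

        p = ctx.Process(target=child, args=(sdict,))
        p.start()
        p.join(timeout_s)
        if p.is_alive():  # still running after the deadline: treat as a hang
            p.terminate()
            p.join()
            return None, "timeout"
        if p.exitcode != 0:  # hard crash in native code (e.g., SIGSEGV, abort)
            return None, f"crash (exitcode={p.exitcode})"
        return sdict["result"], sdict["log"]
```

With this shape, a `CUDA_ERROR_NOT_INITIALIZED` abort inside `fn` surfaces in the parent as a non-zero `exitcode` rather than killing the fuzzing loop; the incompatibility documented above is that a forked child inherits the parent's already-initialized CUDA context, which the driver refuses to use.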
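The guard added to `fuzz.py` bundles two distinct mitigations that `doc/known-issues.md` describes. Separated out as a hedged, standalone summary (the function name is illustrative, not nnsmith API):

```python
# Sketch of the two mitigations, separated for clarity. Illustrative only.
import os


def apply_cuda_fork_mitigation(backend_type: str, target: str, crash_safe: bool) -> None:
    if backend_type == "tflite" and target != "cuda":
        # tflite does not need the GPU: hide CUDA devices *before* TensorFlow
        # is imported, so the parent process never initializes the CUDA
        # runtime and forking stays safe.
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    elif backend_type == "xla" and target == "cuda" and crash_safe:
        # XLA genuinely needs CUDA, and an initialized CUDA runtime cannot be
        # safely forked: fail fast at startup rather than crash in every
        # forked child during the fuzzing loop.
        raise ValueError("Set `fuzz.crash_safe=false` for XLA on CUDA.")
```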