add note to xla[gpu] under crash-safe mode

ise-uiuc · Sep 28, 2022 · bf1f592 · bf1f592
1 parent 1fc1c1c
commit bf1f592
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 10 deletions.
diff --git a/doc/known-issues.md b/doc/known-issues.md
@@ -0,0 +1,11 @@
+## Incompatibility of TensorFlow-GPU with fork-based crash safety
+
+Currently, `fuzz.crash_safe=true` is enabled by default: compilation & execution run in a forked process that serves as a sandbox to catch crashes and timeouts. However, the CUDA runtime is not compatible with fork. In TensorFlow, the symptom is a crash in the forked subprocess:
+
+```txt
+F tensorflow/stream_executor/cuda/cuda_driver.cc:219] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
+```
+
+- For `tflite` this is fine, as it does not require a GPU and `nnsmith.fuzz` will directly set `CUDA_VISIBLE_DEVICES=-1` at the beginning;
+- For `xla` it is more of a headache: currently we need to manually specify `fuzz.crash_safe=false` for fuzzing and allow it to crash;
+- We are tracking this [issue](https://github.com/tensorflow/tensorflow/issues/57877) in TensorFlow. If it cannot be resolved in the near future, we are likely to work around it by executing TensorFlow models in a separate process.
diff --git a/doc/known_issues.md b/doc/known_issues.md
diff --git a/nnsmith/backends/factory.py b/nnsmith/backends/factory.py
@@ -135,14 +135,17 @@ def checked_compile_and_exec(
 
             def crash_safe_compile_exec(sdict):
                 try:
+                    CORE_LOG.debug(f"[FORK] Compiling.")
                     bug_or_exec = self.checked_compile(testcase)
                     if isinstance(bug_or_exec, BugReport):
                         sdict["symptom"] = bug_or_exec.symptom
                         sdict["log"] = bug_or_exec.log
                         return
 
+                    CORE_LOG.debug(f"[FORK] Executing.")
                     sdict["stage"] = Stage.EXECUTION
                     bug_or_result = self.checked_exec(bug_or_exec, testcase)
+                    CORE_LOG.debug(f"[FORK] Done.")
                     if isinstance(bug_or_result, BugReport):
                         sdict["symptom"] = bug_or_result.symptom
                         sdict["log"] = bug_or_result.log

diff --git a/nnsmith/cli/fuzz.py b/nnsmith/cli/fuzz.py
@@ -43,11 +43,26 @@ def __init__(
         # FIXME(@ganler): well-form the fix or report to TF
         # Dirty fix for TFLite on CUDA-enabled systems.
         # If CUDA is not needed, disable them all.
+        cmpwith = cfg["cmp"]["with"]
         if cfg["backend"]["type"] == "tflite" and (
-            cfg["cmp"]["with"] is None or cfg["cmp"]["with"]["target"] != "cuda"
+            cmpwith is None or cmpwith["target"] != "cuda"
         ):
             os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 
+        if (
+            cfg["fuzz"]["crash_safe"]
+            and cfg["backend"]["type"] == "xla"
+            and cfg["backend"]["target"] == "cuda"
+        ) or (
+            cmpwith is not None
+            and cmpwith["type"] == "xla"
+            and cmpwith["target"] == "cuda"
+        ):
+            raise ValueError(
+                "Please set `fuzz.crash_safe=false` for XLA on CUDA. "
+                "Also see https://github.com/ise-uiuc/nnsmith/blob/main/doc/known-issues.md"
+            )
+
         self.crash_safe = bool(cfg["fuzz"]["crash_safe"])
         self.test_timeout = cfg["fuzz"]["test_timeout"]
         if self.test_timeout is not None: