From bf1f59298f20f74959f827aeeae4b0c5b65e1c4e Mon Sep 17 00:00:00 2001
From: ganler
Date: Tue, 27 Sep 2022 18:29:34 -0700
Subject: [PATCH] add note to xla[gpu] under crash-safe mode

---
 doc/known-issues.md         | 11 +++++++++++
 doc/known_issues.md         |  9 ---------
 nnsmith/backends/factory.py |  3 +++
 nnsmith/cli/fuzz.py         | 17 ++++++++++++++++-
 4 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 doc/known-issues.md
 delete mode 100644 doc/known_issues.md

diff --git a/doc/known-issues.md b/doc/known-issues.md
new file mode 100644
index 00000000..171b5320
--- /dev/null
+++ b/doc/known-issues.md
@@ -0,0 +1,11 @@
+## Incompatibility of TensorFlow-GPU with fork-based crash safety
+
+We currently enable `fuzz.crash_safe=true` by default, which runs compilation & execution in a forked process as a sandbox to catch crashes and timeouts. However, the CUDA runtime is not compatible with fork. In TensorFlow, the symptom is a crash in the forked subprocess:
+
+```txt
+F tensorflow/stream_executor/cuda/cuda_driver.cc:219] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
+```
+
+- For `tflite` this is fine, as it does not require the GPU; `nnsmith.fuzz` sets `CUDA_VISIBLE_DEVICES=-1` at startup;
+- For `xla` it is more of a headache: for now, manually set `fuzz.crash_safe=false` for fuzzing and allow it to crash;
+- We are tracking this [issue](https://github.com/tensorflow/tensorflow/issues/57877) in TensorFlow. If it cannot be resolved upstream soon, we will likely fix it by executing TensorFlow models in a separate process.
diff --git a/doc/known_issues.md b/doc/known_issues.md
deleted file mode 100644
index dfea772f..00000000
--- a/doc/known_issues.md
+++ /dev/null
@@ -1,9 +0,0 @@
-## TensorFlow Lite CUDA Init Error in Crash-safe Mode
-
-If we run `tflite` in the fuzzing loop with `fuzz.crash_safe=true`, you may encounter tons of crashes of:
-
-```txt
-F tensorflow/stream_executor/cuda/cuda_driver.cc:219] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
-```
-
-It is temporarily "fixed" by setting environment variable `CUDA_VISIBLE_DEVICES=-1` if we found cuda is not used in the fuzzing loop. Nevertheless, this should be a TensorFlow bug that needs to be fixed.
diff --git a/nnsmith/backends/factory.py b/nnsmith/backends/factory.py
index 00109a19..cfd1c375 100644
--- a/nnsmith/backends/factory.py
+++ b/nnsmith/backends/factory.py
@@ -135,14 +135,17 @@ def checked_compile_and_exec(
 
         def crash_safe_compile_exec(sdict):
             try:
+                CORE_LOG.debug(f"[FORK] Compiling.")
                 bug_or_exec = self.checked_compile(testcase)
                 if isinstance(bug_or_exec, BugReport):
                     sdict["symptom"] = bug_or_exec.symptom
                     sdict["log"] = bug_or_exec.log
                     return
 
+                CORE_LOG.debug(f"[FORK] Executing.")
                 sdict["stage"] = Stage.EXECUTION
                 bug_or_result = self.checked_exec(bug_or_exec, testcase)
+                CORE_LOG.debug(f"[FORK] Done.")
                 if isinstance(bug_or_result, BugReport):
                     sdict["symptom"] = bug_or_result.symptom
                     sdict["log"] = bug_or_result.log
diff --git a/nnsmith/cli/fuzz.py b/nnsmith/cli/fuzz.py
index 85c21132..f38cedda 100644
--- a/nnsmith/cli/fuzz.py
+++ b/nnsmith/cli/fuzz.py
@@ -43,11 +43,26 @@ def __init__(
         # FIXME(@ganler): well-form the fix or report to TF
         # Dirty fix for TFLite on CUDA-enabled systems.
         # If CUDA is not needed, disable them all.
+        cmpwith = cfg["cmp"]["with"]
         if cfg["backend"]["type"] == "tflite" and (
-            cfg["cmp"]["with"] is None or cfg["cmp"]["with"]["target"] != "cuda"
+            cmpwith is None or cmpwith["target"] != "cuda"
         ):
             os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 
+        if (
+            cfg["fuzz"]["crash_safe"]
+            and cfg["backend"]["type"] == "xla"
+            and cfg["backend"]["target"] == "cuda"
+        ) or (
+            cmpwith is not None
+            and cmpwith["type"] == "xla"
+            and cmpwith["target"] == "cuda"
+        ):
+            raise ValueError(
+                "Please set `fuzz.crash_safe=false` for XLA on CUDA. "
+                "Also see https://github.com/ise-uiuc/nnsmith/blob/main/doc/known-issues.md"
+            )
+
         self.crash_safe = bool(cfg["fuzz"]["crash_safe"])
         self.test_timeout = cfg["fuzz"]["test_timeout"]
         if self.test_timeout is not None:
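
For context on what `crash_safe_compile_exec` above runs inside, here is a minimal, self-contained sketch of the fork-based sandbox pattern the patch relies on. It is an illustration under assumed names (`run_sandboxed`, `fn`), not nnsmith's exact implementation: the risky work happens in a forked child that reports back through a managed dict, so a segfault or hang in native code takes down only the child.

```python
# Minimal sketch of fork-based crash-safe execution, assuming a POSIX system
# where the "fork" start method is available. Names are illustrative.
import multiprocessing as mp


def run_sandboxed(fn, timeout_s=10):
    """Run `fn` in a forked child; the parent survives crashes and hangs."""
    ctx = mp.get_context("fork")  # fork is what conflicts with an initialized CUDA runtime
    with ctx.Manager() as manager:
        sdict = manager.dict(result=None, log=None)

        def child(sdict):
            try:
                sdict["result"] = fn()
            except Exception as e:  # Python-level errors are reported back normally
                sdict["log"] = str(e)

        p = ctx.Process(target=child, args=(sdict,))
        p.start()
        p.join(timeout_s)
        if p.is_alive():  # still running after the deadline: treat as a hang
            p.terminate()
            p.join()
            return None, "timeout"
        if p.exitcode != 0:  # hard crash in native code (e.g., SIGSEGV, abort)
            return None, f"crash (exitcode={p.exitcode})"
        return sdict["result"], sdict["log"]
```

With this shape, a `CUDA_ERROR_NOT_INITIALIZED` abort inside `fn` surfaces in the parent as a non-zero `exitcode` rather than killing the fuzzing loop; the incompatibility documented above is that a forked child inherits the parent's already-initialized CUDA context, which the driver refuses to use.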
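The guard added to `fuzz.py` bundles two distinct mitigations that `doc/known-issues.md` describes. Separated out as a hedged, standalone summary (the function name is illustrative, not nnsmith API):

```python
# Sketch of the two mitigations, separated for clarity. Illustrative only.
import os


def apply_cuda_fork_mitigation(backend_type: str, target: str, crash_safe: bool) -> None:
    if backend_type == "tflite" and target != "cuda":
        # tflite does not need the GPU: hide CUDA devices *before* TensorFlow
        # is imported, so the parent process never initializes the CUDA
        # runtime and forking stays safe.
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    elif backend_type == "xla" and target == "cuda" and crash_safe:
        # XLA genuinely needs CUDA, and an initialized CUDA runtime cannot be
        # safely forked: fail fast at startup rather than crash in every
        # forked child during the fuzzing loop.
        raise ValueError("Set `fuzz.crash_safe=false` for XLA on CUDA.")
```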