add_hook_to_module and remove_hook_from_module compatibility with fx.GraphModule #2369

Merged · 2 commits · Jan 25, 2024
14 changes: 12 additions & 2 deletions src/accelerate/hooks.py
@@ -166,7 +166,12 @@ def new_forward(module, *args, **kwargs):
             output = module._old_forward(*args, **kwargs)
         return module._hf_hook.post_forward(module, output)
 
-    module.forward = functools.update_wrapper(functools.partial(new_forward, module), old_forward)
+    # Overriding a GraphModuleImpl forward freezes the forward call and later modifications to the graph will fail.
+    # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409
+    if "GraphModuleImpl" in str(type(module)):
+        module.__class__.forward = functools.update_wrapper(functools.partial(new_forward, module), old_forward)
+    else:
+        module.forward = functools.update_wrapper(functools.partial(new_forward, module), old_forward)
 
     return module

@@ -189,7 +194,12 @@ def remove_hook_from_module(module: nn.Module, recurse=False):
         delattr(module, "_hf_hook")
 
     if hasattr(module, "_old_forward"):
-        module.forward = module._old_forward
+        # Overriding a GraphModuleImpl forward freezes the forward call and later modifications to the graph will fail.
+        # Reference: https://pytorch.slack.com/archives/C3PDTEV8E/p1705929610405409
+        if "GraphModuleImpl" in str(type(module)):
+            module.__class__.forward = module._old_forward
+        else:
+            module.forward = module._old_forward
         delattr(module, "_old_forward")
 
     if recurse:
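For context on why the change patches `module.__class__.forward` rather than `module.forward`: Python attribute lookup finds an instance-level `forward` before the class-level one, and `fx.GraphModule.recompile()` installs its regenerated forward on the class. An instance-level override therefore shadows every later recompilation, which is the "frozen" behavior the code comment refers to. Below is a minimal sketch of that shadowing (not part of the PR; `Tiny` is a made-up module used only for illustration):

```python
import operator

import torch
import torch.nn as nn
from torch.fx import symbolic_trace


class Tiny(nn.Module):
    def forward(self, x):
        return x + 1


gm = symbolic_trace(Tiny())
x = torch.zeros(2)

# Instance-level patch (the pre-PR behavior of add_hook_to_module): the instance
# attribute now shadows the class-level `forward` that recompile() regenerates.
old_forward = gm.forward
gm.forward = lambda *args, **kwargs: old_forward(*args, **kwargs)

# Edit the graph: route the add node's result through torch.neg, then recompile.
add_node = next(n for n in gm.graph.nodes if n.target is operator.add)
with gm.graph.inserting_after(add_node):
    neg_node = gm.graph.call_function(torch.neg, args=(add_node,))
output_node = next(n for n in gm.graph.nodes if n.op == "output")
output_node.replace_input_with(add_node, neg_node)
gm.graph.lint()
gm.recompile()

print(gm(x))  # tensor([1., 1.]) -- the instance patch still runs the old forward ("frozen")
del gm.forward
print(gm(x))  # tensor([-1., -1.]) -- with the shadow gone, the recompiled forward is used
```

Patching the class attribute instead, as the diff above does, leaves nothing on the instance to shadow later recompilations.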
49 changes: 49 additions & 0 deletions tests/test_hooks.py
@@ -17,6 +17,7 @@
 
 import torch
 import torch.nn as nn
+from torch.fx import symbolic_trace
 
 from accelerate.hooks import (
     AlignDevicesHook,
@@ -347,3 +348,51 @@ def test_attach_align_device_hook_as_cpu_offload_with_weight_map(self):
         self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
         self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
         self.assertEqual(model.linear2.weight.device, torch.device("cpu"))
+
+    def test_add_remove_hook_fx_graph_module(self):
+        with torch.no_grad():
+            test_model = ModelForTest()
+            test_hook = ModelHook()
+
+            x = torch.randn(2, 3)
+            output1 = test_model(x)
+
+            graph_model = symbolic_trace(test_model)
+
+            output2 = graph_model(x)
+
+            self.assertTrue(torch.allclose(output1, output2))
+
+            add_hook_to_module(graph_model, test_hook)
+            remove_hook_from_module(graph_model, recurse=True)
+
+            # We want to make sure that `add_hook_to_module` and `remove_hook_from_module` yield back an fx.GraphModule
+            # that behaves correctly (for example, one that is not frozen; see https://github.com/huggingface/accelerate/pull/2369).
+            # For that, we add a sigmoid node to the FX graph and make sure that the new output (output3 below) differs from
+            # the original model's output.
+            linear2_node = None
+            for node in graph_model.graph.nodes:
+                if node.name == "linear2":
+                    linear2_node = node
+            self.assertTrue(linear2_node is not None)
+
+            graph_model.graph.inserting_after(linear2_node)
+            new_node = graph_model.graph.create_node(
+                op="call_function", target=torch.sigmoid, args=(linear2_node,), name="sigmoid"
+            )
+
+            output_node = None
+            for node in graph_model.graph.nodes:
+                if node.name == "output":
+                    output_node = node
+            self.assertTrue(output_node is not None)
+
+            output_node.replace_input_with(linear2_node, new_node)
+
+            graph_model.graph.lint()
+            graph_model.recompile()
+
+            output3 = graph_model(x)
+
+            # Now the output is expected to be different since we modified the graph.
+            self.assertFalse(torch.allclose(output1, output3))
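As a usage note, with this change a hook can be attached to and detached from a symbolically traced model through the regular accelerate API. A minimal sketch, assuming an accelerate build that includes this PR; `ShapeLoggingHook` is a hypothetical example hook, not something the library ships:

```python
import torch
import torch.nn as nn
from torch.fx import symbolic_trace

from accelerate.hooks import ModelHook, add_hook_to_module, remove_hook_from_module


class ShapeLoggingHook(ModelHook):
    # A toy hook: report the output shape after every forward call.
    def post_forward(self, module, output):
        print(f"{type(module).__name__} output shape: {tuple(output.shape)}")
        return output


traced = symbolic_trace(nn.Sequential(nn.Linear(3, 4), nn.ReLU()))

add_hook_to_module(traced, ShapeLoggingHook())
traced(torch.randn(2, 3))  # the hook fires on the traced module

# Detaching no longer leaves the GraphModule frozen, so later graph edits
# followed by traced.recompile() still take effect (which is what the new test checks).
remove_hook_from_module(traced)
```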