
Commit: VZ docstring
gsarti committed Feb 28, 2024
1 parent eb76ef6 commit 3c7823e
Showing 7 changed files with 73 additions and 8 deletions.
19 changes: 17 additions & 2 deletions docs/source/main_classes/feature_attribution.rst
@@ -17,7 +17,7 @@ Attribution Methods
.. autoclass:: inseq.attr.FeatureAttribution
:members:

-Gradient Attribution Methods
+Gradient-based Attribution Methods
-----------------------------------------------------------------------------------------------------------------------

.. autoclass:: inseq.attr.feat.GradientAttributionRegistry
@@ -67,7 +67,7 @@ Layer Attribution Methods
:members:


-Attention Attribution Methods
+Internals-based Attribution Methods
-----------------------------------------------------------------------------------------------------------------------

.. autoclass:: inseq.attr.feat.InternalsAttributionRegistry
@@ -76,3 +76,18 @@ Attention Attribution Methods

.. autoclass:: inseq.attr.feat.AttentionWeightsAttribution
:members:

+Perturbation-based Attribution Methods
+-----------------------------------------------------------------------------------------------------------------------
+
+.. autoclass:: inseq.attr.feat.PerturbationAttributionRegistry
+:members:
+
+.. autoclass:: inseq.attr.feat.OcclusionAttribution
+:members:
+
+.. autoclass:: inseq.attr.feat.LimeAttribution
+:members:
+
+.. autoclass:: inseq.attr.feat.ValueZeroingAttribution
+:members:
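
For context, the classes documented above are exposed through inseq's method registry and selected by string identifier. A minimal usage sketch, assuming the standard inseq.load_model API and the "occlusion" identifier for OcclusionAttribution:

import inseq

# Load a model together with a perturbation-based attribution method.
model = inseq.load_model("gpt2", "occlusion")
# Attribute a generation and visualize the token-level scores.
out = model.attribute("The quick brown fox")
out.show()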
4 changes: 4 additions & 0 deletions inseq/attr/feat/__init__.py
@@ -17,6 +17,8 @@
from .perturbation_attribution import (
LimeAttribution,
OcclusionAttribution,
+PerturbationAttributionRegistry,
+ValueZeroingAttribution,
)

__all__ = [
@@ -39,4 +41,6 @@
"OcclusionAttribution",
"LimeAttribution",
"SequentialIntegratedGradientsAttribution",
"ValueZeroingAttribution",
"PerturbationAttributionRegistry",
]
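
The two new exports can then be imported directly; a quick sanity-check sketch grounded in the registration above:

from inseq.attr.feat import PerturbationAttributionRegistry, ValueZeroingAttribution

assert ValueZeroingAttribution.method_name == "value_zeroing"
assert issubclass(ValueZeroingAttribution, PerturbationAttributionRegistry)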
1 change: 0 additions & 1 deletion inseq/attr/feat/ops/value_zeroing.py
@@ -184,7 +184,6 @@ def compute_modules_post_zeroing_similarity(
to be the same as the length of the hidden states.
similarity_metric (:obj:`str`): The name of the similarity metric used. Default: "cosine".
mode (:obj:`str`): The mode of the model to compute the similarity for. Default: "decoder".
zeroed_units_indices (:obj:`Union[int, tuple[int, int], list[int]]` or :obj:`dict` with :obj:`int` keys and
`Union[int, tuple[int, int], list[int]]` values, optional): The indices of the attention heads
that should be zeroed to compute corrupted states.
50 changes: 49 additions & 1 deletion inseq/attr/feat/perturbation_attribution.py
@@ -121,7 +121,55 @@ def attribute_step(


class ValueZeroingAttribution(PerturbationAttributionRegistry):
"""Value Zeroing attribution method."""
"""Value Zeroing method for feature attribution.
Introduced by `Mohebbi et al. (2023) <https://aclanthology.org/2023.eacl-main.245/>`__ to quantify context mixing
in Transformer models. The method is based on the observation that context mixing is regulated by the value vectors
of the attention mechanism. The method consists of two steps:
1. Zeroing the value vectors of the attention mechanism for a given token index at a given layer of the model.
2. Computing the similarity between hidden states produced with and without the zeroing operation, and using it
as a measure of context mixing for the given token at the given layer.
The method is converted into a feature attribution method by allowing for extraction of value zeroing scores at
specific layers, or by aggregating them across layers.
Reference implementations:
- Original implementation: `hmohebbi/ValueZeroing <https://github.com/hmohebbi/ValueZeroing>`__
- Encoder-decoder implementation: `hmohebbi/ContextMixingASR <https://github.com/hmohebbi/ContextMixingASR>`__
Args:
similarity_metric (:obj:`str`, optional): The similarity metric to use for computing the distance between
hidden states produced with and without the zeroing operation. Options: cosine, euclidean. Default: cosine.
encoder_zeroed_units_indices (:obj:`Union[int, tuple[int, int], list[int], dict]`, optional): The indices of
the attention heads that should be zeroed to compute corrupted states in the encoder self-attention module.
Not used for decoder-only models, or if ``output_encoder_self_scores`` is False. Format
- None: all attention heads across all layers are zeroed.
- int: the same attention head is zeroed across all layers.
- tuple of two integers: the attention heads in the range are zeroed across all layers.
- list of integers: the attention heads in the list are zeroed across all layers.
- dictionary: the keys are the layer indices and the values are the zeroed attention heads for the corresponding layer.
Default: None (all heads are zeroed for every encoder layer).
decoder_zeroed_units_indices (:obj:`Union[int, tuple[int, int], list[int], dict]`, optional): Same as
``encoder_zeroed_units_indices`` but for the decoder self-attention module. Not used for encoder-decoder
models or if ``output_decoder_self_scores`` is False. Default: None (all heads are zeroed for every decoder layer).
cross_zeroed_units_indices (:obj:`Union[int, tuple[int, int], list[int], dict]`, optional): Same as
``encoder_zeroed_units_indices`` but for the cross-attention module in encoder-decoder models. Not used
if the model is decoder-only. Default: None (all heads are zeroed for every layer).
output_decoder_self_scores (:obj:`bool`, optional): Whether to produce scores derived from zeroing the
decoder self-attention value vectors in encoder-decoder models. Cannot be false for decoder-only, or
if target-side attribution is requested using `attribute_target=True`. Default: True.
output_encoder_self_scores (:obj:`bool`, optional): Whether to produce scores derived from zeroing the
encoder self-attention value vectors in encoder-decoder models. Default: True.
Returns:
:class:`~inseq.data.MultiDimensionalFeatureAttributionStepOutput`: The final dimension returned by the method
is ``[attributed_seq_len, generated_seq_len, num_layers]``. If ``output_decoder_self_scores`` and
``output_encoder_self_scores`` are True, the respective scores are returned in the ``sequence_scores``
output dictionary.
"""

method_name = "value_zeroing"

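A usage sketch for the documented method, assuming the standard inseq.load_model API, a seq2seq checkpoint chosen only for illustration, and that method-specific arguments such as encoder_zeroed_units_indices are forwarded through model.attribute:

import inseq

# "value_zeroing" matches the method_name registered above.
model = inseq.load_model("Helsinki-NLP/opus-mt-en-fr", "value_zeroing")

# Default call: all heads are zeroed at every layer, cosine similarity.
out = model.attribute("Hello everyone, how are you?")

# Hypothetical narrower call: zero only heads 0 and 1 in encoder layer 2.
out = model.attribute(
    "Hello everyone, how are you?",
    encoder_zeroed_units_indices={2: [0, 1]},
)
out.show()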
3 changes: 1 addition & 2 deletions inseq/attr/step_functions.py
@@ -462,8 +462,7 @@ def register_step_function(
attribution targets by gradient-based feature attribution methods.
Args:
-fn (:obj:`callable`): The function to be used to compute step scores. Default parameters (use kwargs to capture
-unused ones when defining your function):
+fn (:obj:`callable`): The function to be used to compute step scores. Default parameters (use kwargs to capture unused ones when defining your function):
- :obj:`attribution_model`: an :class:`~inseq.models.AttributionModel` instance, corresponding to the model
used for computing the score.
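The reflowed docstring above describes the interface for custom step functions; a hedged sketch of registering one follows (entropy_fn and its body are illustrative, not part of this commit, and assume the kwargs-based calling convention documented here):

import inseq

def entropy_fn(attribution_model, forward_output, **kwargs):
    # Next-step prediction entropy from the model's output logits;
    # unused default parameters are captured by **kwargs, as advised above.
    probs = forward_output.logits[:, -1, :].softmax(-1)
    return -(probs * probs.clamp_min(1e-9).log()).sum(-1)

inseq.register_step_function(fn=entropy_fn, identifier="my_entropy")

# The new identifier can then be requested during attribution, e.g.:
# model.attribute("Hello world", step_scores=["my_entropy"])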
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -361,7 +361,7 @@ traitlets==5.14.1
# jupyter-client
# jupyter-core
# matplotlib-inline
-transformers==4.37.2
+transformers==4.38.1
typeguard==2.13.3
# via jaxtyping
typer==0.9.0
2 changes: 1 addition & 1 deletion requirements.txt
@@ -93,7 +93,7 @@ tqdm==4.66.2
# captum
# huggingface-hub
# transformers
-transformers==4.37.2
+transformers==4.38.1
typeguard==2.13.3
# via jaxtyping
typing-extensions==4.9.0
