v4.39 deprecations 🧼 (#29492)
gante authored Mar 7, 2024
1 parent 979fccc commit ffe60fd
Showing 14 changed files with 9 additions and 400 deletions.
6 changes: 0 additions & 6 deletions docs/source/en/internal/generation_utils.md
@@ -336,12 +336,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- process
- finalize

## Utilities

[[autodoc]] top_k_top_p_filtering

[[autodoc]] tf_top_k_top_p_filtering

## Streamers

[[autodoc]] TextStreamer
6 changes: 0 additions & 6 deletions docs/source/ja/internal/generation_utils.md
@@ -335,12 +335,6 @@ generation_output[:2]
- process
- finalize

## Utilities

[[autodoc]] top_k_top_p_filtering

[[autodoc]] tf_top_k_top_p_filtering

## Streamers

[[autodoc]] TextStreamer
6 changes: 0 additions & 6 deletions docs/source/zh/internal/generation_utils.md
@@ -330,12 +330,6 @@ generation_output[:2]
- process
- finalize

## Utilities

[[autodoc]] top_k_top_p_filtering

[[autodoc]] tf_top_k_top_p_filtering

## Streamers

[[autodoc]] TextStreamer
4 changes: 0 additions & 4 deletions src/transformers/__init__.py
@@ -1409,7 +1409,6 @@
"TypicalLogitsWarper",
"UnbatchedClassifierFreeGuidanceLogitsProcessor",
"WhisperTimeStampLogitsProcessor",
"top_k_top_p_filtering",
]
)
_import_structure["generation_utils"] = []
@@ -3814,7 +3813,6 @@
"TFTemperatureLogitsWarper",
"TFTopKLogitsWarper",
"TFTopPLogitsWarper",
"tf_top_k_top_p_filtering",
]
)
_import_structure["generation_tf_utils"] = []
@@ -6206,7 +6204,6 @@
TypicalLogitsWarper,
UnbatchedClassifierFreeGuidanceLogitsProcessor,
WhisperTimeStampLogitsProcessor,
top_k_top_p_filtering,
)
from .modeling_utils import PreTrainedModel
from .models.albert import (
@@ -8178,7 +8175,6 @@
TFTemperatureLogitsWarper,
TFTopKLogitsWarper,
TFTopPLogitsWarper,
tf_top_k_top_p_filtering,
)
from .keras_callbacks import KerasMetricCallback, PushToHubCallback
from .modeling_tf_utils import (
9 changes: 0 additions & 9 deletions src/transformers/activations.py
@@ -13,7 +13,6 @@
# limitations under the License.

import math
import warnings
from collections import OrderedDict

import torch
@@ -138,14 +137,6 @@ def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))


class SiLUActivation(nn.SiLU):
def __init__(self, *args, **kwargs):
warnings.warn(
"The SiLUActivation class has been deprecated and will be removed in v4.39. Please use nn.SiLU instead.",
)
super().__init__(*args, **kwargs)


class MishActivation(nn.Module):
"""
See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
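Migration sketch (illustrative, not part of this commit): the removed `SiLUActivation` class was a thin deprecation shim around `torch.nn.SiLU`, so the replacement is a drop-in swap.

```python
import torch
from torch import nn

# nn.SiLU is the drop-in replacement for the removed SiLUActivation wrapper;
# silu(x) == x * sigmoid(x).
silu = nn.SiLU()
x = torch.randn(2, 3)
assert torch.allclose(silu(x), x * torch.sigmoid(x))
```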
4 changes: 0 additions & 4 deletions src/transformers/generation/__init__.py
@@ -88,7 +88,6 @@
]
_import_structure["utils"] = [
"GenerationMixin",
"top_k_top_p_filtering",
"GreedySearchEncoderDecoderOutput",
"GreedySearchDecoderOnlyOutput",
"SampleEncoderDecoderOutput",
@@ -130,7 +129,6 @@
]
_import_structure["tf_utils"] = [
"TFGenerationMixin",
"tf_top_k_top_p_filtering",
"TFGreedySearchDecoderOnlyOutput",
"TFGreedySearchEncoderDecoderOutput",
"TFSampleEncoderDecoderOutput",
@@ -241,7 +239,6 @@
GreedySearchEncoderDecoderOutput,
SampleDecoderOnlyOutput,
SampleEncoderDecoderOutput,
top_k_top_p_filtering,
)

try:
@@ -279,7 +276,6 @@
TFGreedySearchEncoderDecoderOutput,
TFSampleDecoderOnlyOutput,
TFSampleEncoderDecoderOutput,
tf_top_k_top_p_filtering,
)

try:
62 changes: 0 additions & 62 deletions src/transformers/generation/tf_utils.py
@@ -3088,68 +3088,6 @@ def contrastive_search_body_fn(
return generated


def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
"""
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
top_k (`int`, *optional*, defaults to 0):
If > 0, only keep the top k tokens with highest probability (top-k filtering)
top_p (`float`, *optional*, defaults to 1.0):
If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens we keep per batch example in the output.
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""

warnings.warn(
"`tf_top_k_top_p_filtering` is scheduled for deletion in v4.39. Use `TFTopKLogitsWarper` and "
"`TFTopPLogitsWarper` instead.",
DeprecationWarning,
)

logits_shape = shape_list(logits)

if top_k > 0:
top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
logits = tf.where(indices_to_remove, filter_value, logits)
if top_p < 1.0:
sorted_indices = tf.argsort(logits, direction="DESCENDING")
sorted_logits = tf.gather(
logits, sorted_indices, axis=-1, batch_dims=1
) # expects logits to be of dim (batch_size, vocab_size)

cumulative_probs = tf.math.cumsum(stable_softmax(sorted_logits, axis=-1), axis=-1)

# Remove tokens with cumulative probability above the threshold (tokens with 0 are kept)
sorted_indices_to_remove = cumulative_probs > top_p

if min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
sorted_indices_to_remove = tf.concat(
[
tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
sorted_indices_to_remove[:, min_tokens_to_keep:],
],
-1,
)

# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove = tf.concat(
[tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, :-1]],
-1,
)
# scatter sorted tensors to original indexing
indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
logits = tf.where(indices_to_remove, filter_value, logits)
return logits


def scatter_values_on_batch_indices(values, batch_indices):
shape = shape_list(batch_indices)
# broadcast batch dim to shape
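Migration sketch (illustrative, not part of this commit): as the deleted warning states, `TFTopKLogitsWarper` and `TFTopPLogitsWarper` replace `tf_top_k_top_p_filtering`. The sketch below assumes the TF warpers' `(input_ids, scores, cur_len)` call signature; neither of these two warpers uses `input_ids` or `cur_len`, so placeholder values are passed.

```python
import tensorflow as tf
from transformers import TFTopKLogitsWarper, TFTopPLogitsWarper

# Roughly equivalent to the removed tf_top_k_top_p_filtering(logits, top_k=50, top_p=0.9).
logits = tf.random.normal((2, 100))
logits = TFTopKLogitsWarper(top_k=50)(None, logits, 0)   # keep the 50 highest-scoring tokens
logits = TFTopPLogitsWarper(top_p=0.9)(None, logits, 0)  # then apply nucleus (top-p) filtering
```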
41 changes: 0 additions & 41 deletions src/transformers/generation/utils.py
@@ -4810,47 +4810,6 @@ def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_at
return outputs


def top_k_top_p_filtering(
logits: torch.FloatTensor,
top_k: int = 0,
top_p: float = 1.0,
filter_value: float = -float("Inf"),
min_tokens_to_keep: int = 1,
) -> torch.FloatTensor:
"""
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
top_k (`int`, *optional*, defaults to 0):
If > 0, only keep the top k tokens with highest probability (top-k filtering)
top_p (`float`, *optional*, defaults to 1.0):
If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
min_tokens_to_keep (`int`, *optional*, defaults to 1):
Minimum number of tokens we keep per batch example in the output.
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
warnings.warn(
"`top_k_top_p_filtering` is scheduled for deletion in v4.39. Use `TopKLogitsWarper` and `TopPLogitsWarper` "
"instead.",
DeprecationWarning,
)

if top_k > 0:
logits = TopKLogitsWarper(top_k=top_k, filter_value=filter_value, min_tokens_to_keep=min_tokens_to_keep)(
None, logits
)

if 0 <= top_p <= 1.0:
logits = TopPLogitsWarper(top_p=top_p, filter_value=filter_value, min_tokens_to_keep=min_tokens_to_keep)(
None, logits
)

return logits


def _ranking_fast(
context_hidden: torch.FloatTensor,
next_hidden: torch.FloatTensor,
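Migration sketch (illustrative, not part of this commit): the removed PyTorch helper was already a thin wrapper around the public warpers, so the replacement mirrors the deleted body; values below are illustrative.

```python
import torch
from transformers import TopKLogitsWarper, TopPLogitsWarper

# Roughly equivalent to the removed top_k_top_p_filtering(logits, top_k=50, top_p=0.9).
logits = torch.randn(2, 100)
logits = TopKLogitsWarper(top_k=50, filter_value=-float("Inf"))(None, logits)
logits = TopPLogitsWarper(top_p=0.9, filter_value=-float("Inf"))(None, logits)

# The filtered logits can be sampled from as before: masked entries are -inf,
# so they receive zero probability after the softmax.
probs = torch.softmax(logits, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1)
```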
13 changes: 5 additions & 8 deletions src/transformers/models/llama/modeling_llama.py
@@ -129,10 +129,7 @@ def cos_cached(self):
return self._cos_cached

@torch.no_grad()
def forward(self, x, position_ids, seq_len=None):
if seq_len is not None:
logger.warning_once("The `seq_len` argument is deprecated and unused. It will be removed in v4.39.")

def forward(self, x, position_ids):
# x: [bs, num_attention_heads, seq_len, head_size]
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
@@ -151,17 +148,17 @@ def forward(self, x, position_ids, seq_len=None):
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

def forward(self, x, position_ids, seq_len=None):
def forward(self, x, position_ids):
# difference to the original RoPE: a scaling factor is applied to the position ids
position_ids = position_ids.float() / self.scaling_factor
cos, sin = super().forward(x, position_ids, seq_len)
cos, sin = super().forward(x, position_ids)
return cos, sin


class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

def forward(self, x, position_ids, seq_len=None):
def forward(self, x, position_ids):
# difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
seq_len = torch.max(position_ids) + 1
if seq_len > self.max_position_embeddings:
@@ -173,7 +170,7 @@ def forward(self, x, position_ids, seq_len=None):
)
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation

cos, sin = super().forward(x, position_ids, seq_len)
cos, sin = super().forward(x, position_ids)
return cos, sin


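Migration sketch (illustrative, not part of this commit): the Llama rotary embeddings now derive everything they need from `position_ids`, so callers simply drop the `seq_len` argument. The constructor call below assumes the per-head dimension is still the first positional argument; sizes are illustrative.

```python
import torch
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

head_dim, seq_len = 128, 16
rope = LlamaRotaryEmbedding(head_dim)          # per-head dimension
x = torch.randn(1, 32, seq_len, head_dim)      # [bs, num_heads, seq_len, head_dim]
position_ids = torch.arange(seq_len)[None, :]  # [bs, seq_len]
cos, sin = rope(x, position_ids)               # no seq_len argument anymore
```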
25 changes: 4 additions & 21 deletions src/transformers/models/opt/modeling_opt.py
@@ -120,27 +120,10 @@ def __init__(
):
super().__init__()
self.config = config

def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs):
"""
If the deprecated argument `fn_arg_name` is passed, raise a deprecation
warning and return that value, otherwise take the equivalent config.config_arg_name
"""
val = None
if fn_arg_name in kwargs:
logging.warning(
"Passing in {fn_arg_name} to {self.__class__.__name__} is deprecated and won't be supported from "
"v4.39. Please set it in the config instead"
)
val = kwargs.pop(fn_arg_name)
else:
val = getattr(config, config_arg_name)
return val

self.embed_dim = _handle_deprecated_argument("hidden_size", config, "embed_dim", kwargs)
self.num_heads = _handle_deprecated_argument("num_attention_heads", config, "num_heads", kwargs)
self.dropout = _handle_deprecated_argument("attention_dropout", config, "dropout", kwargs)
self.enable_bias = _handle_deprecated_argument("enable_bias", config, "bias", kwargs)
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.dropout = config.attention_dropout
self.enable_bias = config.enable_bias

self.head_dim = self.embed_dim // self.num_heads
self.is_causal = True
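Migration sketch (illustrative, not part of this commit): the per-argument overrides (`embed_dim`, `num_heads`, `dropout`, `bias`) that `OPTAttention` used to accept are gone; the attention layers now read these values from the model config only, so they should be set there instead.

```python
from transformers import OPTConfig, OPTModel

config = OPTConfig(
    hidden_size=512,        # replaces the old `embed_dim` override
    num_attention_heads=8,  # replaces the old `num_heads` override
    attention_dropout=0.0,  # replaces the old `dropout` override
    enable_bias=True,       # replaces the old `bias` override
)
model = OPTModel(config)    # every attention layer picks these values up from the config
```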
4 changes: 0 additions & 4 deletions src/transformers/utils/dummy_pt_objects.py
@@ -408,10 +408,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])


def top_k_top_p_filtering(*args, **kwargs):
requires_backends(top_k_top_p_filtering, ["torch"])


class PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]

4 changes: 0 additions & 4 deletions src/transformers/utils/dummy_tf_objects.py
@@ -128,10 +128,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])


def tf_top_k_top_p_filtering(*args, **kwargs):
requires_backends(tf_top_k_top_p_filtering, ["tf"])


class KerasMetricCallback(metaclass=DummyObject):
_backends = ["tf"]
