huggingface · stevhliu · Nov 19, 2025 · Nov 19, 2025
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -204,9 +204,8 @@ def __call__(
         return_token_timestamps: Optional[bool] = None,
         **kwargs,
     ) -> BatchFeature:
-        """
-        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
-        the STFT computation if available, otherwise a slower NumPy based one.
+        """Main method to featurize and prepare one or several sequence(s) for the model. Implementation uses PyTorch
+        for the STFT computation if available, otherwise a slower NumPy-based one.
 
         Args:
             raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
@@ -220,6 +219,11 @@ def __call__(
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of a list of Python integers. Acceptable values are:
+
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
             return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific feature_extractor's default.
@@ -232,18 +236,24 @@ def __call__(
                 bugs.
 
                 </Tip>
-
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
+            padding (`str` or [`~utils.PaddingStrategy`], *optional*, defaults to `'max_length'`):
+                Activates and controls padding. Accepts the following values:
+
+                - `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence is
+                  provided).
+                - `'max_length'` (default): Pad to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided.
+                - `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet), truncation/padding to a maximum length will be deactivated.
             sampling_rate (`int`, *optional*):
                 The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                 `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                 pipeline.
-            padding_value (`float`, *optional*, defaults to 0.0):
-                The value that is used to fill the padding values / vectors.
             do_normalize (`bool`, *optional*, defaults to `False`):
                 Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                 improve the performance of the model.
@@ -255,6 +265,7 @@ def __call__(
 
                 Whether or not to return the number of frames of the input raw_speech.
                 These num_frames can be used by the model to compute word level timestamps.
+            **kwargs: Additional keyword arguments are accepted but ignored by `WhisperFeatureExtractor.__call__`.
         """
         if sampling_rate is not None:
             if sampling_rate != self.sampling_rate: