[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer #10324

Merged

Changes from 4 commits (34 commits total)
afe2a24  push to show (patrickvonplaten, Feb 22, 2021)
f70b70e  small improvement (patrickvonplaten, Feb 22, 2021)
1b9152e  small improvement (patrickvonplaten, Feb 22, 2021)
d135f74  Update src/transformers/feature_extraction_utils.py (patrickvonplaten, Feb 22, 2021)
5246685  Update src/transformers/feature_extraction_utils.py (patrickvonplaten, Feb 22, 2021)
b6e3d68  implement base (patrickvonplaten, Feb 23, 2021)
b315373  add common tests (patrickvonplaten, Feb 23, 2021)
3302c28  make all tests pass for wav2vec2 (patrickvonplaten, Feb 23, 2021)
8b883fe  make padding work & add more tests (patrickvonplaten, Feb 24, 2021)
93962ca  finalize feature extractor utils (patrickvonplaten, Feb 24, 2021)
55d2705  add call method to feature extraction (patrickvonplaten, Feb 24, 2021)
b496346  finalize feature processor (patrickvonplaten, Feb 24, 2021)
5239bf7  finish tokenizer (patrickvonplaten, Feb 24, 2021)
c17859e  finish general processor design (patrickvonplaten, Feb 24, 2021)
f64f25c  finish tests (patrickvonplaten, Feb 24, 2021)
08e3458  typo (patrickvonplaten, Feb 24, 2021)
ed9543a  remove bogus file (patrickvonplaten, Feb 24, 2021)
dca668a  finish docstring (patrickvonplaten, Feb 24, 2021)
4c7c013  add docs (patrickvonplaten, Feb 24, 2021)
80edb8b  finish docs (patrickvonplaten, Feb 24, 2021)
7189c24  small fix (patrickvonplaten, Feb 24, 2021)
6652130  correct docs (patrickvonplaten, Feb 24, 2021)
960c27c  save intermediate (patrickvonplaten, Feb 25, 2021)
d389a9e  load changes (patrickvonplaten, Feb 25, 2021)
900fee6  apply changes (patrickvonplaten, Feb 25, 2021)
7482eee  apply changes to doc (patrickvonplaten, Feb 25, 2021)
08b3ac6  change tests (patrickvonplaten, Feb 25, 2021)
e2ae501  apply surajs recommend (patrickvonplaten, Feb 25, 2021)
bfddc7f  final changes (patrickvonplaten, Feb 25, 2021)
bad66a8  Merge branch 'master' into speech_processor_design (patrickvonplaten, Feb 25, 2021)
d8ef58f  Apply suggestions from code review (patrickvonplaten, Feb 25, 2021)
6b5cc29  fix typo (patrickvonplaten, Feb 25, 2021)
d1aa8ea  fix import (patrickvonplaten, Feb 25, 2021)
791fbee  correct docstring (patrickvonplaten, Feb 25, 2021)
65 changes: 65 additions & 0 deletions src/transformers/feature_extraction_utils.py
@@ -0,0 +1,65 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
patrickvonplaten marked this conversation as resolved.
Show resolved Hide resolved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extraction classes for python tokenizers.
"""

Collaborator:

All the objects up until here are common objects for our internal use. This is not a modeling file so I would expect all of those to be defined in one place and shared. This is especially important for objects like TensorType that are in the main init of transformers and should only be defined in one place.

Contributor Author:

Agree very much actually!

Contributor Author:

Will move them to file_utils.py -> I think this is the cleanest option! The other option would be to just import them from tokenization_utils_base.py, but I think it's fair to move them since they have become more generic than just tokenization. I don't think there is a break in backward compatibility.
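
For concreteness, a minimal sketch of what that consolidation would look like (module paths follow the comment above; treat this as an assumption about the final layout, not the merged code):

# before: each utils module carries its own copy of the shared enum
#   tokenization_utils_base.py:   class TensorType(ExplicitEnum): ...
#   feature_extraction_utils.py:  class TensorType(ExplicitEnum): ...  <- duplicate

# after: defined once in file_utils.py and imported everywhere it is needed
from transformers.file_utils import TensorType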


class BatchFeature(UserDict):
Collaborator:

This class should have some "copied from BatchEncoding" statements (and I think there is a common parent class to write here).

Contributor Author:

I added some, but I think the only function that really shares more or less all the code is the to(...) function -> all other functions are quite different from each other (mainly because there is no _encodings attribute for BatchFeature), so I think it's ok to not have a common parent class for now?

""""""

def __init__(
self,
data: Optional[Dict[str, Any]] = None,
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
tensor_type: Union[None, str, TensorType] = None,
prepend_batch_axis: bool = False,
n_sequences: Optional[int] = None,
):
super().__init__(data)
# add similar functionality as BatchEncoding
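
To illustrate the one method that, per the comment above, shares "more or less all the code" with BatchEncoding, here is a minimal sketch of to(...) (assuming torch is available; the merged implementation may differ):

    def to(self, device) -> "BatchFeature":
        # Sketch: move every tensor value to `device`, mirroring BatchEncoding.to;
        # non-tensor values pass through unchanged.
        import torch  # a real implementation would guard this with is_torch_available()

        self.data = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()}
        return self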


class PreTrainedFeatureExtractor:
    """
    This is a general feature extraction class for speech recognition.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # IMPORTANT: feature extractors are always deterministic -> they are never trained
        # in any way like tokenizers are -> therefore all configuration params should be
        # stored in a json config
        self.sampling_rate = kwargs.get("sampling_rate", None)
        self.pad_vector = kwargs.get("pad_vector", None)
        self.feature_dim = kwargs.get("feature_dim", None)  # this will be 1 for Wav2Vec2, but 768 for Speech2TextTransformers

    def pad(self, feature: BatchFeature):
        """
        Implement general padding method
        """
        pass

    @classmethod
    def from_pretrained(cls, path):
        """
        General loading method
        """
        pass

    def save_pretrained(self, path):
        """
        General saving method
        """
        pass
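
Given the note above that all configuration params live in a json config, one way the two stubs could be filled in is a plain JSON round-trip over a feature_extractor_config.json file (the file name comes from the Wav2Vec2Processor comment below; Hub downloads and error handling are deliberately left out of this sketch):

    # Sketch only; assumes `import json, os` at the top of the module.
    @classmethod
    def from_pretrained(cls, path):
        # read all configuration params back from the json config
        with open(os.path.join(path, "feature_extractor_config.json"), encoding="utf-8") as f:
            return cls(**json.load(f))

    def save_pretrained(self, path):
        # write all configuration params to the json config
        os.makedirs(path, exist_ok=True)
        config = {"sampling_rate": self.sampling_rate, "pad_vector": self.pad_vector, "feature_dim": self.feature_dim}
        with open(os.path.join(path, "feature_extractor_config.json"), "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)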
86 changes: 86 additions & 0 deletions src/transformers/models/wav2vec2/feature_processing_wav2vec2.py
@@ -0,0 +1,86 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Speech processor class for Wav2Vec2
"""


# NOTE inheritance from feature extractor
class Wav2Vec2FeatureExtractor(PreTrainedFeatureExtractor):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __call__(self, raw_speech):
        """
        Implement the call method
        """
        pass


# NOTE inheritance from tokenizer
class Wav2Vec2Tokenizer(PreTrainedTokenizer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __call__(self, text):
        """
        Implement encoding functionality
        """
        pass

    def _decode(self, text):
        """
        Implement decoding functionality
        """
        pass


class Wav2Vec2Processor:
    def __init__(self, feature_extractor, tokenizer):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor

    def save_pretrained(self, pretrained_model_name_or_path):
        self.feature_extractor.save_pretrained(pretrained_model_name_or_path)
        self.tokenizer.save_pretrained(pretrained_model_name_or_path)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        # will look for a `feature_extractor_config.json` file
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path)
        # will look for the tokenizer files
        tokenizer = Wav2Vec2Tokenizer.from_pretrained(pretrained_model_name_or_path)

        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        # delegates to the feature extractor by default, or to the tokenizer
        # while inside the `as_target_tokenizer` context manager
        return self.current_processor(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_tokenizer(self):
        """
        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizers associated with
        sequence-to-sequence models that need a slightly different processing for the labels.
        """
        self.current_processor = self.tokenizer
        yield
        self.current_processor = self.feature_extractor
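
Putting the pieces together, a usage sketch of the processor design above (the checkpoint name and inputs are illustrative placeholders, not part of this PR):

# Illustrative only: checkpoint name and inputs are placeholder assumptions.
from transformers import Wav2Vec2Processor  # assumed export once this PR is merged

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")  # hypothetical checkpoint

raw_speech = [0.0] * 16_000  # one second of toy audio at 16 kHz
inputs = processor(raw_speech)  # __call__ routes to the feature extractor by default

with processor.as_target_tokenizer():
    labels = processor("HELLO WORLD")  # inside the context manager, __call__ routes to the tokenizer

text = processor.decode(labels["input_ids"])  # decoding always goes through the tokenizer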