Skip to content

Commit

Permalink
tokenization tests added
Browse files Browse the repository at this point in the history
  • Loading branch information
susnato committed Mar 23, 2023
1 parent 4d9fcc3 commit 5dd2d02
Show file tree
Hide file tree
Showing 11 changed files with 604 additions and 35 deletions.
2 changes: 1 addition & 1 deletion docs/source/en/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ Flax), PyTorch, and/or TensorFlow.
| Pix2Struct | | | | | |
| PLBart | | | | | |
| PoolFormer | | | | | |
| Pop2Piano | | | | | |
| Pop2Piano | | | | | |
| ProphetNet | | | | | |
| QDQBert | | | | | |
| RAG | | | | | |
Expand Down
33 changes: 32 additions & 1 deletion docs/source/en/model_doc/pop2piano.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,38 @@ Tips:

1. Pop2Piano is an Encoder-Decoder based model like T5.
2. Pop2Piano can be used to generate midi-audio files for a given audio sequence. This HuggingFace implementation allows saving the midi_output as well as the stereo-mix output of the audio sequence.
3. Choosing different composers in Pop2PianoForConditionalGeneration.generate can lead to variety of different results.
3. Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to a variety of different results.
4. Please note that HuggingFace implementation of Pop2Piano(both Pop2PianoForConditionalGeneration and Pop2PianoFeatureExtractor) can only work with one raw_audio sequence at a time. So if you want to process multiple files, please feed them one by one.

This model was contributed by [Susnato Dhar](https://huggingface.co/susnato).
The original code can be found [here](https://github.com/sweetcocoa/pop2piano).

Example:
```python
import librosa
from transformers import Pop2PianoFeatureExtractor, Pop2PianoForConditionalGeneration, Pop2PianoTokenizer

raw_audio, sr = librosa.load("audio.mp3", sr=44100)
model = Pop2PianoForConditionalGeneration.from_pretrained("susnato/pop2piano_dev")
feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("susnato/pop2piano_dev")
tokenizer = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev")

model.eval()

feature_extractor_outputs = feature_extractor(raw_audio=raw_audio, audio_sr=sr, return_tensors="pt")
model_outputs = model.generate(feature_extractor_outputs, composer="composer1")

opt_postprocess = tokenizer(relative_tokens=model_outputs,
beatsteps=feature_extractor_outputs["beatsteps"],
ext_beatstep=feature_extractor_outputs["ext_beatstep"],
raw_audio=raw_audio,
sampling_rate=sr,
save_path="./Music/Outputs/",
audio_file_name="filename",
save_midi=True
)
```


## Pop2PianoConfig

Expand All @@ -59,3 +85,8 @@ The original code can be found [here](https://github.com/sweetcocoa/pop2piano).
- forward
- generate

## Pop2PianoTokenizer

[[autodoc]] Pop2PianoTokenizer
- __call__

3 changes: 2 additions & 1 deletion src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3443,6 +3443,7 @@
]
else:
_import_structure["models.pop2piano"].append("Pop2PianoFeatureExtractor")
_import_structure["models.pop2piano"].append("Pop2PianoTokenizer")


# FLAX-backed objects
Expand Down Expand Up @@ -6554,7 +6555,7 @@
except OptionalDependencyNotAvailable:
from .utils.dummy_music_objects import *
else:
from .models.pop2piano import Pop2PianoFeatureExtractor
from .models.pop2piano import Pop2PianoFeatureExtractor, Pop2PianoTokenizer

try:
if not is_flax_available():
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/pop2piano/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
pass
else:
_import_structure["feature_extraction_pop2piano"] = ["Pop2PianoFeatureExtractor"]
_import_structure["tokenization_pop2piano"] = ["Pop2PianoTokenizer"]


if TYPE_CHECKING:
Expand Down Expand Up @@ -89,6 +90,7 @@
pass
else:
from .feature_extraction_pop2piano import Pop2PianoFeatureExtractor
from .tokenization_pop2piano import Pop2PianoTokenizer

else:
import sys
Expand Down
18 changes: 0 additions & 18 deletions src/transformers/models/pop2piano/feature_extraction_pop2piano.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,6 @@ class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
Whether to preprocess for `LogMelSpectrogram` or not. For the current implementation this must be `True`.
padding_value (`int`, *optional*, defaults to 0):
Padding value used to pad the audio. Should correspond to silences.
vocab_size_special (`int`, *optional*, defaults to 4):
Number of special values.
vocab_size_note (`int`, *optional*, defaults to 128):
This represents the number of Note Values. Note values indicate a pitch event for one of the MIDI pitches.
But only the 88 pitches corresponding to piano keys are actually used.
vocab_size_velocity (`int`, *optional*, defaults to 2):
Number of Velocity tokens.
vocab_size_time (`int`, *optional*, defaults to 100):
This represents the number of Beat Shifts. Beat Shift [100 values] Indicates the relative time shift within
the segment quantized into 8th-note beats(half-beats).
n_fft (`int`, *optional*, defaults to 4096):
Size of Fast Fourier Transform, creates n_fft // 2 + 1 bins.
hop_length (`int`, *optional*, defaults to 1024):
Expand All @@ -78,10 +68,6 @@ def __init__(
sampling_rate: int = 22050,
use_mel: int = True,
padding_value: int = 0,
vocab_size_special: int = 4,
vocab_size_note: int = 128,
vocab_size_velocity: int = 2,
vocab_size_time: int = 100,
n_fft: int = 4096,
hop_length: int = 1024,
f_min: float = 10.0,
Expand All @@ -99,10 +85,6 @@ def __init__(
self.sampling_rate = sampling_rate
self.use_mel = use_mel
self.padding_value = padding_value
self.vocab_size_special = vocab_size_special
self.vocab_size_note = vocab_size_note
self.vocab_size_velocity = vocab_size_velocity
self.vocab_size_time = vocab_size_time
self.n_fft = n_fft
self.hop_length = hop_length
self.f_min = f_min
Expand Down
Loading

0 comments on commit 5dd2d02

Please sign in to comment.