diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
index 643ef3d2d615..86f01ec58616 100644
--- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -47,7 +47,7 @@ def __call__(
         batch_size: int = 1,
         num_inference_steps: int = 100,
         generator: Optional[torch.Generator] = None,
-        sample_length_in_s: Optional[float] = None,
+        audio_length_in_s: Optional[float] = None,
         return_dict: bool = True,
     ) -> Union[AudioPipelineOutput, Tuple]:
         r"""
@@ -60,6 +60,9 @@ def __call__(
             generator (`torch.Generator`, *optional*):
                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                 deterministic.
+            audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
+                The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
+                `sample_size`, will be `audio_length_in_s` * `self.unet.sample_rate`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipeline_utils.AudioPipelineOutput`] instead of a plain tuple.

@@ -69,23 +72,23 @@ def __call__(
                 generated images.
         """

-        if sample_length_in_s is None:
-            sample_length_in_s = self.unet.sample_size / self.unet.sample_rate
+        if audio_length_in_s is None:
+            audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate

-        sample_size = sample_length_in_s * self.unet.sample_rate
+        sample_size = audio_length_in_s * self.unet.sample_rate

         down_scale_factor = 2 ** len(self.unet.up_blocks)
         if sample_size < 3 * down_scale_factor:
             raise ValueError(
-                f"{sample_length_in_s} is too small. Make sure it's bigger or equal to"
+                f"{audio_length_in_s} is too small. Make sure it's bigger or equal to"
                 f" {3 * down_scale_factor / self.unet.sample_rate}."
             )

         original_sample_size = int(sample_size)
         if sample_size % down_scale_factor != 0:
-            sample_size = ((sample_length_in_s * self.unet.sample_rate) // down_scale_factor + 1) * down_scale_factor
+            sample_size = ((audio_length_in_s * self.unet.sample_rate) // down_scale_factor + 1) * down_scale_factor
             logger.info(
-                f"{sample_length_in_s} is increased to {sample_size / self.unet.sample_rate} so that it can be handled"
+                f"{audio_length_in_s} is increased to {sample_size / self.unet.sample_rate} so that it can be handled"
                 f" by the model. It will be cut to {original_sample_size / self.unet.sample_rate} after the denoising"
                 " process."
             )
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index bdf57ec08f54..72e67e4479d2 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -91,7 +91,7 @@ def test_dance_diffusion(self):
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device=device).manual_seed(0)
-        output = pipe(generator=generator, num_inference_steps=100, sample_length_in_s=4.096)
+        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)

         audio = output.audios
         audio_slice = audio[0, -3:, -3:]
@@ -108,7 +108,7 @@ def test_dance_diffusion_fp16(self):
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device=device).manual_seed(0)
-        output = pipe(generator=generator, num_inference_steps=100, sample_length_in_s=4.096)
+        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)

         audio = output.audios
         audio_slice = audio[0, -3:, -3:]
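For context, a minimal usage sketch of the renamed keyword as called from user code. This is not part of the diff; the "harmonai/maestro-150k" checkpoint and the device selection are assumptions for illustration, any DanceDiffusion checkpoint would do.

import torch

from diffusers import DanceDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k").to(device)

generator = torch.Generator(device=device).manual_seed(0)

# `audio_length_in_s` replaces the old `sample_length_in_s` keyword. Per the diff above,
# it is converted to a sample count (audio_length_in_s * sample_rate), rounded up to a
# multiple of 2 ** len(unet.up_blocks) for denoising, and cut back to the requested
# length afterwards.
output = pipe(num_inference_steps=100, audio_length_in_s=4.096, generator=generator)
audio = output.audios[0]  # array of shape (channels, sample_size)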