In [1]:
from typing import Literal
import numpy as np
import soundfile
import IPython
from io import BytesIO
from transformers import AutoProcessor, AutoModel, Pipeline, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_audio_model(
    checkpoint: str = "suno/bark-small",
) -> tuple[AutoProcessor, AutoModel]:
    """Load the Bark processor and model used for text-to-speech generation.

    Both objects are loaded from the same checkpoint so that the processor's
    tokenization always matches the model it feeds.

    Args:
        checkpoint: Identifier passed to ``from_pretrained`` for both the
            processor and the model. Defaults to ``"suno/bark-small"``.

    Returns:
        A ``(processor, model)`` tuple loaded from ``checkpoint``.
    """
    # The processor prepares the input text prompt for the core model.
    processor = AutoProcessor.from_pretrained(checkpoint)

    # The model is what generates the output audio.
    model = AutoModel.from_pretrained(checkpoint)
    return processor, model

In [3]:
# Speaker voice presets accepted by generate_audio.
VoicePresets = Literal["v2/en_speaker_1", "v2/en_speaker_9"]


def generate_audio(
    processor: AutoProcessor,
    model: AutoModel,
    prompt: str,
    preset: VoicePresets,
) -> tuple[np.ndarray, int]:
    """Synthesize speech audio for a text prompt with a given speaker preset.

    Args:
        processor: Processor that turns the text prompt and voice preset into
            model inputs.
        model: Model whose ``generate`` method produces the audio.
        prompt: Text to be spoken.
        preset: Speaker voice preset applied by the processor.

    Returns:
        A ``(audio, sample_rate)`` tuple. ``audio`` is a NumPy array of the
        generated audio with singleton dimensions squeezed out; ``sample_rate``
        is read from ``model.generation_config.sample_rate``.
    """
    # Preprocess the prompt together with the speaker preset; return_tensors="pt"
    # asks the processor for PyTorch tensors.
    inputs = processor(text=[prompt], return_tensors="pt", voice_preset=preset)

    # Generate the audio (sampling enabled), move it to CPU and convert it to a
    # NumPy array, dropping size-1 dimensions.
    output = model.generate(**inputs, do_sample=True).cpu().numpy().squeeze()

    # The sampling rate needed to play back or save the audio comes from the
    # model's generation configuration.
    sample_rate = model.generation_config.sample_rate
    return output, sample_rate

In [7]:
# Text to synthesize and the speaker voice to use.
prompt = "What is Generative AI IN AI"
preset = "v2/en_speaker_9"

# Load the processor/model pair, then synthesize the prompt.
processor, model = load_audio_model()
output, sample_rate = generate_audio(
    processor=processor,
    model=model,
    prompt=prompt,
    preset=preset,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [8]:
# Quick sanity check of the generation result: the sampling rate, the Python
# type of the returned audio, and its shape.
print("sample_rate : ",sample_rate)
print("tyep_output :",type(output))
print(output.shape)

sample_rate :  24000
tyep_output : <class 'numpy.ndarray'>
(187840,)


In [9]:
import IPython.display

# Persist the generated audio as a WAV file, then embed a player for that
# file as the cell's output.
audio_array = output
output_file = "generated_audio.wav"
soundfile.write(
    file=output_file,
    data=audio_array,
    samplerate=sample_rate,
    format="wav",
)
IPython.display.Audio(output_file)