In [None]:
"""
Showcase Bark, a transformer-based text-to-audio model.

https://github.com/suno-ai/bark
"""
import datetime as dt
from time import perf_counter
from typing import Tuple

import nltk
import numpy as np
from bark import SAMPLE_RATE, semantic_to_waveform, preload_models
from bark.generation import generate_text_semantic
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav

# Download and load all Bark models once, up front, before any generation helper is called
preload_models()

# Print the sample rate that the helpers and the Audio() players in this notebook use
print(f"{SAMPLE_RATE=}")  # 24000

def generate_sentence(
    text: str,
    history_prompt=None,
    *,
    text_temp: float = 0.7,
    waveform_temp: float = 0.999,
    min_eos_p: float = 0.05,
    silent: bool = False,
) -> Tuple[dict, np.ndarray]:
    """
    Short text to speech that returns a tuple of full generation [0] and audio array [1].

    The text is first turned into semantic tokens, which are then rendered to a
    waveform. The same ``history_prompt`` is handed to both stages.

    Args:
        text: Text to speak.
        history_prompt: Speaker to imitate, forwarded unchanged to Bark. This notebook
            uses None (random speaker), a preset name such as "v2/en_speaker_9", or a
            full generation dict returned by an earlier call / loaded from an .npz file.
        text_temp: Generation temperature of the text-to-semantic stage
            (1.0 more diverse, 0.0 more conservative).
        waveform_temp: Generation temperature of the semantic-to-waveform stage
            (1.0 more diverse, 0.0 more conservative).
        min_eos_p: Controls how likely the generation is to end or to hallucinate.
        silent: True disables Bark's progress bars; the default False keeps them visible.

    Returns:
        ``(full_generation, audio)``: the full generation, reusable as a
        ``history_prompt`` for later calls, and the audio array.
    """
    semantic_tokens = generate_text_semantic(
        text=text,
        history_prompt=history_prompt,
        temp=text_temp,  # Bark's own default is 0.7
        top_k=None,
        top_p=None,
        silent=silent,
        min_eos_p=min_eos_p,  # Bark's own default is 0.2; this notebook defaults to 0.05
        max_gen_duration_s=None,
        allow_early_stop=True,
        use_kv_caching=True,  # Bark's default is False, but its text_to_semantic() uses True
    )
    # output_full=True makes Bark return (full generation, audio array) instead of only the audio
    full_generation, audio = semantic_to_waveform(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=waveform_temp,  # Bark's own default is 0.7; this notebook defaults to 0.999
        silent=silent,
        output_full=True,
    )
    return full_generation, audio


def gen_short_audio(text: str, speaker=None) -> np.ndarray:
    """
    Render a short text and return only the audio array.

    Thin wrapper around generate_sentence() that discards the returned full
    generation. Allegedly works best for up to about 13 seconds of speech.

    Args:
        text: Text to speak.
        speaker: Forwarded unchanged as ``history_prompt``; None means a random speaker.

    Returns:
        The audio array produced by generate_sentence().
    """
    _, audio = generate_sentence(text=text, history_prompt=speaker)
    return audio


def gen_long_audio(text: str, speaker=None) -> np.ndarray:
    """
    Render a long text sentence by sentence and join the pieces into one audio array.

    The text is split with ``nltk.sent_tokenize`` (which needs NLTK's sentence
    tokenizer data to be installed), each sentence is rendered with
    generate_sentence() using the same ``speaker``, and a quarter second of
    silence is appended after every sentence, including the last one.

    Args:
        text: Text to speak; may contain any number of sentences.
        speaker: Forwarded unchanged as ``history_prompt`` for every sentence;
            None means a random speaker.

    Returns:
        The concatenated audio array. Text without any sentence (empty or
        whitespace-only) yields an empty array instead of raising.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # np.concatenate() raises ValueError on an empty list, so handle "nothing to say" explicitly
        return np.zeros(0)

    silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

    audio_parts = []
    for sentence in sentences:
        _, audio_part = generate_sentence(text=sentence, history_prompt=speaker)
        # np.concatenate() copies its inputs, so the same silence array can be reused
        audio_parts += [audio_part, silence]

    return np.concatenate(audio_parts)


# Speaker selection: None means a random speaker; the commented-out lines below are
# alternative named presets that can be enabled instead.
speaker = None  # random speaker
# speaker = "v2/en_speaker_9"
# speaker = "v2/en_speaker_6"  # suno favorite male (presumably; the original note read "suo")

In [None]:
# Render one short sentence in order to determine if you like the speaker.
# Best use a sentence of the long text you want to render, because,
# apparently, Bark chooses random speakers according to the text's content.
# Alternative sample texts:
# text = """
#     Hello, my name is Suno. And, uh — and I like pizza. [laughs]
#     But I also have other interests such as playing tic tac toe.
# """
# text = "Wubbalubbadubdub"
text = "The quick brown fox jumps over the lazy dog."
start = perf_counter()
# Honour the `speaker` configured in the first cell (None = random speaker);
# previously the setting was ignored and this cell always used a random speaker.
full_generation, audio = generate_sentence(text, history_prompt=speaker)
duration_s = perf_counter() - start
print(f"Done. Took {round(duration_s)} seconds")
# Last expression of the cell: displays the generated audio
Audio(audio, rate=SAMPLE_RATE)

In [None]:
# Optionally, load a previously saved speaker from disk.
# A missing file is reported instead of raised, so that Restart Kernel -> Run All
# also works before any speaker has been saved.
speaker_file = "alicia.npz"
try:
    # np.load() keeps the .npz archive open; the context manager closes it
    # once all arrays have been copied into a plain dict.
    with np.load(speaker_file) as saved_speaker:
        full_generation = dict(saved_speaker)
except FileNotFoundError:
    print(f"{speaker_file} not found; no speaker loaded from disk.")


In [None]:
# Generate audio from a longer text, sentence by sentence.
# Once you have a speaker you are happy with, reuse it here: the `full_generation`
# currently held in the kernel is passed as the speaker for every sentence.

text = """
    Wubbalubbadubdub.
"""

# Time the whole generation
start = perf_counter()
audio = gen_long_audio(text, speaker=full_generation)
duration_s = perf_counter() - start
print(f"Done. Took {round(duration_s)} seconds")
# Last expression of the cell: displays the generated audio
Audio(audio, rate=SAMPLE_RATE)

In [None]:
# Found a speaker worth keeping? Persist the current full_generation for later use.
# An already existing file is never overwritten.
from pathlib import Path

filename = "alicia.npz"

if not Path(filename).exists():
    # each entry of full_generation becomes one named array in the archive
    np.savez(filename, **full_generation)
else:
    print("Meh. Won't save.")