In [88]:
"""
Showcase Bark, a transformer-based text-to-audio model.

https://github.com/suno-ai/bark
"""
import datetime as dt
from time import perf_counter
from typing import Tuple

import nltk
import numpy as np
from bark import SAMPLE_RATE, semantic_to_waveform, preload_models
from bark.generation import generate_text_semantic
from IPython.display import Audio
from scipy.io.wavfile import write as write_wav

# Download (if necessary) and load all Bark models.
preload_models()

# nltk.sent_tokenize() is used further down to split long texts into sentences and
# needs the Punkt tokenizer data. Fetch it when missing so the notebook also runs
# on a fresh environment. Older NLTK releases look for "punkt", newer ones for
# "punkt_tab", so make sure both are available.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_nltk_resource}")
    except LookupError:
        nltk.download(_nltk_resource, quiet=True)

print(f"{SAMPLE_RATE=}")  # 24000

def generate_sentence(
    text: str,
    history_prompt=None,
    *,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    min_eos_p: float = 0.05,
    silent: bool = False,
) -> Tuple[dict, np.ndarray]:
    """
    Turn a short text into speech.

    Args:
        text: The text to speak.
        history_prompt: Voice to condition on. ``None`` lets Bark pick a random
            speaker; otherwise a preset name or the ``full_generation`` returned
            by a previous call to this function.
        text_temp: Generation temperature for the text -> semantic-token stage
            (1.0 more diverse, 0.0 more conservative).
        waveform_temp: Generation temperature for the semantic-token -> waveform
            stage (1.0 more diverse, 0.0 more conservative).
        min_eos_p: Controls how likely the generation is to end or to
            hallucinate (Bark's own default is 0.2).
        silent: ``True`` hides Bark's progress bars, ``False`` shows them.

    Returns:
        A tuple ``(full_generation, audio)``: the full generation (reusable as
        ``history_prompt``) at index 0 and the audio array at index 1.
    """
    semantic_tokens = generate_text_semantic(
        text=text,
        history_prompt=history_prompt,
        temp=text_temp,
        top_k=None,
        top_p=None,
        silent=silent,
        min_eos_p=min_eos_p,
        max_gen_duration_s=None,
        allow_early_stop=True,
        use_kv_caching=True,  # default: False, text_to_semantic() uses True
    )
    # output_full=True makes Bark return (full_generation, audio_array) instead of
    # only the audio array.
    full_generation, audio = semantic_to_waveform(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=True,
    )
    return full_generation, audio


def gen_short_audio(text: str, speaker=None) -> np.ndarray:
    """
    Render a short text and return only the audio array.

    Allegedly works best for texts that take up to about 13 seconds to speak.

    Args:
        text: The text to speak.
        speaker: Forwarded to ``generate_sentence`` as ``history_prompt``;
            ``None`` means a random speaker.

    Returns:
        The audio array; the full generation returned alongside it is discarded.
    """
    _, audio = generate_sentence(text=text, history_prompt=speaker)
    return audio


def gen_long_audio(
    text: str,
    speaker=None,
    language: str = "english",
    pause_s: float = 0.25,
) -> np.ndarray:
    """
    Render a long text sentence by sentence and join the pieces into one array.

    The text is split with ``nltk.sent_tokenize`` (which needs NLTK's Punkt
    tokenizer data to be installed). Every sentence is rendered separately with
    the same ``speaker`` and followed by ``pause_s`` seconds of silence,
    including the last one.

    Args:
        text: The text to speak.
        speaker: Forwarded to ``generate_sentence`` as ``history_prompt``;
            ``None`` means a random speaker.
        language: Language model used for sentence splitting, e.g. ``"german"``.
        pause_s: Length in seconds of the silence inserted after each sentence;
            must not be negative.

    Returns:
        The concatenated audio array, or an empty array if ``text`` contains no
        sentences.
    """
    # np.concatenate() raises ValueError on an empty list, so handle blank input
    # up front instead of crashing after tokenization.
    if not text.strip():
        return np.zeros(0)

    sentences = nltk.sent_tokenize(text, language=language)
    if not sentences:
        return np.zeros(0)

    # np.concatenate() copies its inputs, so one silence buffer can be reused.
    silence = np.zeros(int(pause_s * SAMPLE_RATE))

    audio_parts = []
    for sentence in sentences:
        _, audio_part = generate_sentence(text=sentence, history_prompt=speaker)
        audio_parts.extend((audio_part, silence))

    return np.concatenate(audio_parts)


# Voice preset to use: None for a random speaker, or the name of a Bark preset
# such as the commented-out examples below.
speaker = None  # random speaker
# speaker = "v2/en_speaker_9"
# speaker = "v2/en_speaker_6"  # presumably the "Suno favorite" male voice ("suo" looked like a typo) - TODO confirm

SAMPLE_RATE=24000


In [None]:
# Render a single short sentence to find out whether you like the speaker.
# Best use a sentence from the long text you want to render because,
# apparently, Bark chooses random speakers according to the text's content.
# text = """
#     Hello, my name is Suno. And, uh — and I like pizza. [laughs]
#     But I also have other interests such as playing tic tac toe.
# """
text = "The quick brown fox jumps over the lazy dog."
start = perf_counter()
# Pass the configured `speaker` along; without it, selecting a preset has no
# effect. With speaker = None this still produces a random voice.
full_generation, audio = generate_sentence(text, history_prompt=speaker)
duration_s = perf_counter() - start
print(f"Done. Took {round(duration_s)} seconds")
Audio(audio, rate=SAMPLE_RATE)

100%|████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 39.86it/s]
  5%|████▊                                                                                      | 1/19 [00:00<00:07,  2.41it/s]

In [92]:
# When you have a speaker you are happy with, reuse it for long text generation:
# the `full_generation` of the audition sentence is passed as the speaker, so every
# sentence of the long text is generated with it as history prompt.

# Sample texts. Each one has its own name so that none is silently overwritten;
# choose the one to render via `text` below.
# text = """
#     Wubbalubbadubdub.
# """
text_en = """
    Talk is important, talk is good. Say Ukraine liberated Crimea and expelled the Russian inhabitants
    all at once, immediately, or that people had suspected they would do this, what do you think the
    reactions around the world would be? Probably not favourable to Ukraine, might even generate
    sympathy for Russia. Making it clear about what they are going to do, how they are going to do it
    after liberation, is a very useful thing to explain. So while yeah, it is just talks, Crimea will be
    liberated, and this explains what happens next.
"""
text_de = """
    Okay. Noch mal probiert, keine Ahung ob das funktioniert. Verschwende hier gerade meine Mittagspause drauf.
    Naja, ist auch cool. Hab ein bisschen in die codebase und spärliche Dokumentation geguckt. Es gibt ein paar
    Variablen an denen man schrauben kann, aber das mit dem "uh" und "ehh" war wohl eher die choice of speaker.
    Bin jetzt gespannt auf das Resultat. Hab ein kleines Jupyter Notebook gebaut wo ich interaktiv mit einem einfachen
    Satz austesten kann wie der Sprecher klingt, und wenn mir die Stimme gefällt, geb ich die Stimme auf ganze Dokument los.
    Wenn das klappt, rendere ich Homers Ilias into speech oder so. Als Punk Rock Version.
"""
text = text_de  # the German sample is the one that was effectively rendered before

start = perf_counter()
audio = gen_long_audio(text, speaker=full_generation)
duration_s = perf_counter() - start
print(f"Done. Took {round(duration_s)} seconds")
Audio(audio, rate=SAMPLE_RATE)

100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 152.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 55.07it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:06<00:00,  1.95it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 42.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:08<00:00,  1.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 72.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [

Done. Took 139 seconds
