In [None]:
"""
Showcase Bark, a transformer-based text-to-audio model.

https://github.com/suno-ai/bark
"""
import datetime as dt
from pathlib import Path
from time import perf_counter
from typing import Tuple

import nltk
import numpy as np
from bark import SAMPLE_RATE, semantic_to_waveform, preload_models
from bark.generation import generate_text_semantic
from IPython.display import Audio, display
from scipy.io.wavfile import write as write_wav

# Download and load all Bark models once, before any generation is requested.
preload_models()

# Echo the sample rate that every Audio(...) call below passes as `rate`.
print(f"{SAMPLE_RATE=}")  # prints SAMPLE_RATE=24000 at the time of writing

def generate_sentence(
    text: str,
    history_prompt=None,
    *,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    min_eos_p: float = 0.05,
    silent: bool = False,
) -> Tuple[dict, np.ndarray]:
    """
    Short text to speech that returns a tuple of history_prompt [0] and audio array [1].

    Args:
        text: The (short) text to speak.
        history_prompt: Speaker to imitate; passed unchanged to both Bark stages.
            None lets Bark choose a voice.
        text_temp: Temperature of the text-to-semantic stage (1.0 more diverse,
            0.0 more conservative). Low values tend to sound slow, like a person
            falling asleep.
        waveform_temp: Temperature of the semantic-to-waveform stage; apparently,
            higher is more lively.
        min_eos_p: Controls how likely the generation is to end or to hallucinate.
            This notebook uses 0.05; Bark's own default is reportedly 0.2.
        silent: Forwarded to both stages. False keeps the progress bars visible,
            True hides them.

    Returns:
        (full_generation, audio): the full generation, which can be reused as a
        history_prompt or saved with np.savez, and the audio array.
    """
    semantic_tokens = generate_text_semantic(
        text=text,
        history_prompt=history_prompt,
        temp=text_temp,
        top_k=None,
        top_p=None,
        silent=silent,
        min_eos_p=min_eos_p,
        max_gen_duration_s=None,
        allow_early_stop=True,
        use_kv_caching=True,  # Bark default is reportedly False; text_to_semantic() uses True
    )
    # output_full=True makes Bark return (full generation, audio array)
    # instead of only the audio array.
    full_generation, audio = semantic_to_waveform(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=True,
    )
    return full_generation, audio


def gen_long_audio(text: str, speaker=None, pause_s: float = 0.25) -> np.ndarray:
    """
    Long audio, broken down by sentences aka tokens.

    The text is split with nltk.sent_tokenize, every sentence is rendered on its
    own with generate_sentence, and a pause is appended after each sentence
    (including the last one).

    Args:
        text: The text to speak; may contain any number of sentences.
        speaker: Passed to generate_sentence as history_prompt for every sentence.
        pause_s: Length of the silence after each sentence, in seconds.

    Returns:
        One audio array holding all sentences and pauses. It is empty when the
        text contains no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # np.concatenate raises ValueError on an empty list, so blank text
        # yields empty audio instead of a crash.
        return np.zeros(0)

    silence = np.zeros(int(pause_s * SAMPLE_RATE))

    audio_parts = []
    for sentence in sentences:
        _, audio_part = generate_sentence(text=sentence, history_prompt=speaker)
        audio_parts += [audio_part, silence]

    return np.concatenate(audio_parts)


def beep(frequency_hz: float = 170, play_time_s: float = 2, framerate: int = 4410):
    """
    Play a beep sound.

    Args:
        frequency_hz: Pitch of the sine tone.
        play_time_s: Length of the tone in seconds.
        framerate: Samples per second of the generated tone.
    """
    num_samples = int(round(framerate * play_time_s))
    # endpoint=False puts sample k exactly at k / framerate seconds; including
    # the endpoint would stretch the time axis by one sample.
    t = np.linspace(0, play_time_s, num_samples, endpoint=False)
    audio = np.sin(2 * np.pi * frequency_hz * t)
    display(Audio(audio, rate=framerate, autoplay=True))


# Voice selection. The commented lines are alternative preset names to switch to.
# NOTE(review): the later cells visible here never read this variable; they call
# generate_sentence() without a speaker or pass `full_generation` instead.
speaker = None  # random speaker
# speaker = "v2/en_speaker_9"
# speaker = "v2/en_speaker_6"  # suno favorite male

In [None]:
# Render, play and save a random sentence n times in order to find a speaker that you like.
# Best use a sentence of the long text you want to render, because,
# apparently, bark chooses random speakers according to the text's content.

# text = """
#     Hello, my name is Suno. And, uh — and I like pizza. [laughs]
#     But I also have other interests such as playing tic tac toe.
# """

speaker_name = "Penny"
n_candidates = 100  # how many random voices to render

text = f"My name is {speaker_name}. The quick brown fox jumps over the lazy dog."

for i in range(n_candidates):
    # The spoken number makes it easy to match what you hear to the saved file.
    text_with_number = text + f" ~ Number {i} ~ [laughs]"

    start = perf_counter()
    full_generation, audio = generate_sentence(text_with_number)
    duration_s = perf_counter() - start
    print(f"Done. Took {round(duration_s)} seconds")

    display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))

    # np.savez appends ".npz" when the name lacks it, so the existence check
    # has to use the full file name; otherwise it never sees earlier saves
    # and they get overwritten.
    filename = f"{speaker_name}-{i}.npz"
    if Path(filename).exists():
        print(f"Meh. Won't save. {filename}. Exists already.")
    else:
        np.savez(filename, **full_generation)

In [None]:
# optionally, load a speaker from disk
# np.load keeps the .npz archive open; the with-block closes it once every
# array has been copied into the dict.
with np.load("voices/penny.npz") as speaker_archive:
    full_generation = dict(speaker_archive)

In [None]:
# generate audio from text
# when you have a speaker you are happy with, you may want to use this speaker for long text generation

# Sample text; replace it with the long text you want to render.
# Keep it non-empty: text without any sentence gives the generator nothing to render.
text = """
    The quick brown fox jumps over the lazy dog.
"""

start = perf_counter()
# `full_generation` comes from the speaker-search cell or the load-from-disk cell.
audio = gen_long_audio(text, speaker=full_generation)
duration_s = perf_counter() - start
print(f"Done. Took {round(duration_s)} seconds")
beep()  # audible signal that the long render has finished
display(Audio(audio, rate=SAMPLE_RATE, autoplay=False))
# display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))
