In [None]:
"""
Showcase Bark, a transformer-based text-to-audio model.

https://github.com/suno-ai/bark
"""
import datetime as dt
from pathlib import Path
from time import perf_counter
from typing import Tuple

import nltk
import numpy as np
from bark import SAMPLE_RATE, semantic_to_waveform, preload_models
from bark.generation import generate_text_semantic
from IPython.display import Audio, display
from scipy.io.wavfile import write as write_wav

# Download and load all Bark models before any generation call is made.
preload_models()

# Show the sample rate that the audio players below are given (24000 when this was written).
print(f"{SAMPLE_RATE=}")  # 24000

def generate_sentence(text: str, history_prompt=None) -> Tuple[dict, np.ndarray]:
    """
    Render one short piece of text to speech.

    Runs Bark's two stages explicitly (text -> semantic tokens -> waveform) so
    that the sampling settings of each stage can be set independently.

    Args:
        text: The text to speak.
        history_prompt: Voice to condition both stages on; passed through
            unchanged to Bark. ``None`` means a random speaker.

    Returns:
        A tuple ``(full_generation, audio)``: the full generation produced by
        ``semantic_to_waveform(..., output_full=True)`` -- it can be saved and
        passed back in as ``history_prompt`` -- and the audio array.
    """
    semantic_tokens = generate_text_semantic(
        text=text,
        history_prompt=history_prompt,
        temp=0.7,  # 0.7 is default; generation temperature (1.0 more diverse, 0.0 more conservative)
        top_k=None,
        top_p=None,
        silent=False,  # False keeps the progress bar visible; True would hide it
        min_eos_p=0.05,  # this controls how likely the generation is to end or to hallucinate; default 0.2
        max_gen_duration_s=None,
        allow_early_stop=True,
        use_kv_caching=True,  # default: False, text_to_semantic() uses True
    )
    full_generation, audio = semantic_to_waveform(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=0.999,  # 0.7 is default; generation temperature (1.0 more diverse, 0.0 more conservative)
        silent=False,  # False keeps the progress bar visible; True would hide it
        output_full=True,  # True: return full generation [0] and audio array [1]; False: return only audio array
    )
    return full_generation, audio


def gen_short_audio(text: str, speaker=None) -> np.ndarray:
    """
    Render a short text and return only its audio array.

    Allegedly works best for up to 13 seconds of audio.

    Args:
        text: The text to speak.
        speaker: Voice forwarded to ``generate_sentence`` as ``history_prompt``;
            ``None`` means a random speaker.

    Returns:
        The audio array; the full generation is discarded.
    """
    _, audio = generate_sentence(text=text, history_prompt=speaker)
    return audio


def gen_long_audio(text: str, speaker=None, pause_s: float = 0.25) -> np.ndarray:
    """
    Render long text sentence by sentence and join the pieces into one array.

    The text is split with ``nltk.sent_tokenize``; each sentence is rendered on
    its own with ``generate_sentence`` and followed by a short pause.

    Args:
        text: The text to speak.
        speaker: Voice forwarded to ``generate_sentence`` as ``history_prompt``
            for every sentence. With ``None`` each sentence is generated
            without a voice prompt (random speaker).
        pause_s: Seconds of silence appended after every sentence (must be
            non-negative). Defaults to a quarter second.

    Returns:
        The concatenated audio array, or an empty array when the text contains
        no sentences.
    """
    # Ignore blank fragments so the model is never asked to render nothing.
    sentences = [sentence for sentence in nltk.sent_tokenize(text) if sentence.strip()]
    if not sentences:
        # np.concatenate raises ValueError on an empty list; return empty audio instead.
        return np.zeros(0)

    silence = np.zeros(int(pause_s * SAMPLE_RATE))

    audio_parts = []
    for sentence in sentences:
        _, audio_part = generate_sentence(text=sentence, history_prompt=speaker)
        # np.concatenate copies its inputs, so one silence buffer can be shared.
        audio_parts += [audio_part, silence]

    return np.concatenate(audio_parts)


# Voice used when none is given explicitly: None means a random speaker; the
# commented-out lines are named voice presets that can be swapped in instead.
# NOTE(review): the cells visible below pass their own speaker value and do not
# read this variable -- confirm whether it is still needed.
speaker = None  # random speaker
# speaker = "v2/en_speaker_9"
# speaker = "v2/en_speaker_6"  # suno favorite male

In [None]:
# Render, play and save a random sentence n times in order to find a speaker that you like.
# Best use a sentence of the long text you want to render, because,
# apparently, bark chooses random speakers according to the text's content.

# text = """
#     Hello, my name is Suno. And, uh — and I like pizza. [laughs]
#     But I also have other interests such as playing tic tac toe.
# """
# text = "Wubbalubbadubdub"
text = "The quick brown fox jumps over the lazy dog."

# Prefix of the files the candidate voices are saved under: "<speaker_name>-<i>.npz".
speaker_name = "sarah"

for i in range(10):
    # Append the run number so each candidate can be told apart by ear.
    text_with_number = text + f" ~ Number {i}"

    start = perf_counter()
    full_generation, audio = generate_sentence(text_with_number)
    duration_s = perf_counter() - start
    print(f"Done. Took {round(duration_s)} seconds")

    display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))

    # np.savez appends ".npz" when the name lacks it, so the suffix has to be part
    # of the name that is checked; otherwise the check looks at a path that is
    # never written and earlier voices are overwritten on every re-run.
    filename = f"{speaker_name}-{i}.npz"
    if Path(filename).exists():
        print(f"Meh. Won't save. {filename}. Exists already.")
    else:
        np.savez(filename, **full_generation)

In [None]:
# optionally, load a speaker from disk
# np.load keeps the .npz archive open; the context manager closes it as soon as
# all arrays have been copied into a plain dict.
with np.load("voices/alicia.npz") as voice_archive:
    full_generation = dict(voice_archive)

In [None]:
# Generate long-form audio from text.
# Once you have found a speaker you are happy with, reuse that speaker here for long text generation.

# text = """
#     Wubbalubbadubdub.
# """
text = """
    The quick brown fox jumps over the lazy dog.
"""
text = """
    She impatiently tugged at the waistband of my underwear. I raised my hips and we pulled them down
    together. She pried my knees apart, exposing my vulva to her. My breath was heavy and short all at
    once in anticipation of more of her tongue on more of me.

    Her breath was hot and teasing as she kissed up the length of my inner thigh, ever closer to my
    aching core. She flicked me once, twice, between every fold. Sucking in the bud of my clit, she held
    it in the warmth of her mouth. My pulse tried to hit the roof of her mouth, but I felt it in mine.

    The subtle shock urged my eyes open. I admired her tongue as it moved in and out of my field of
    vision, and noticed that though one hand was squeezing my thigh, the other had made its way between
    her legs. I bit my lip to stifle a groan.

    She didn’t skip a beat while she rummaged blindly in the side pocket of her bag. When the rustling
    stopped, so did her lips abandon me. I didn’t make a note of what was happening until her face was
    hovering next to mine and the softest droning buzz spring from somewhere between us.

    “Don’t worry,” she whispered. “I just cleaned it.”

    Before I had a chance to react, the rounded edge of her tiny (surprisingly quiet) purple vibrator
    was where her tongue had just been. She reached down for my hand and encouraged me to take the
    reins. I obediently clutched the squished egg-like device and began to acquaint myself with it. She
    bit my earlobe approvingly and kissed me again, tongue darting in and out, as she teased at my
    entrance with one, then two fingers.

    The tips of her fingers massaged at my opening while I pushed my clit to the brink. My hips rose in
    anticipation. She slid deeper and deeper until she was as far as she could reach with every thrust,
    curling her fingers up to my belly button every time she slid out. I like to think we were the
    definition of stealth, but there’s no way someone wouldn’t have heard us if they set up shop in the
    next cubicle over in that moment.

    She nibbled at my neck as she dragged the orgasm right out of me.

    Palm pressed to the side wall, eyes scrunched, jaw clenched, my head tapped the wall behind me a
    little harder this time. My leg started to twitch violently as my hips fell down following the harsh
    yet extremely welcome release. She pressed her lips to mine to distract me from the loss of her
    fingers, breathing life back into me.

    She took the vibrator from my hands, pressing the little thing silent; drawing an end to our total
    fluke of an encounter.

    I pulled up my pants. She adjusted her dress. We picked up our bags. She opened the door to check:
    Yep, coast is clear. She turned back to steal one more kiss before we walked out.

    We selected side-by-side sinks to go through the motions of washing our hands and checking our faces
    for posterity, stealing side-eye glances as she put on new, dramatically darker lips.

    “Safe travels,” she said, and planted a soft peck on my cheek before walking away from me forever.

    I checked my face in the mirror. I smiled sheepishly at my reflection, rubbing the echo of her dark
    kiss, mussed my hair back into place, and slung my bag back over my shoulder.

    When I walked back out into the busy hallway, it seemed that her flight was boarding—she was already
    in line. We made brief eye contact without so much as another smile as I shoved my hands in my
    pockets and walked back to my own gate.
"""

# Time the whole long-form render; the speaker held in `full_generation` is
# passed on for every sentence of `text`.
start = perf_counter()
audio = gen_long_audio(text, speaker=full_generation)
duration_s = perf_counter() - start
print(f"Done. Took {round(duration_s)} seconds")
# Last expression of the cell, so the audio player is shown as the cell output.
Audio(audio, rate=SAMPLE_RATE)