# Bark Experiments - Using Transformers library

#### Author: Kenneth Leung
___

In [1]:
from transformers import AutoProcessor, BarkModel
from IPython.display import Audio
import torch
import scipy

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Checkpoint and numeric precision used by the loading cell below.
# Alternative setup kept for reference: model_type = "suno/bark-small"
# together with torch_dtype = torch.float16 (half precision).
model_type = "suno/bark"
torch_dtype = torch.float32  # single precision (default)

In [4]:
# Load the pretrained Bark weights and the matching processor for the
# configured checkpoint and precision. The processor is what later turns the
# text prompt and voice preset into model inputs.
model = BarkModel.from_pretrained(model_type, torch_dtype=torch_dtype)
processor = AutoProcessor.from_pretrained(model_type, torch_dtype=torch_dtype)

In [5]:
# Move the model onto the device selected earlier, then echo `model.device`
# as the cell output to confirm the placement.
model = model.to(device)
model.device

device(type='cuda', index=0)

In [6]:
# Echo the model to print its module tree (sub-models, layers, dimensions).
model

BarkModel(
  (semantic): BarkSemanticModel(
    (input_embeds_layer): Embedding(129600, 1024)
    (position_embeds_layer): Embedding(1024, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x BarkBlock(
        (layernorm_1): BarkLayerNorm()
        (layernorm_2): BarkLayerNorm()
        (attn): BarkSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (att_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): BarkMLP(
          (in_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
          (gelu): GELU(approximate='none')
        )
      )
    )
    (layernorm_final): BarkLayerNorm()
    (lm_head): Linear(in_feat

In [7]:
# Voice preset identifier handed to the processor together with the text prompt.
voice_preset = "v2/en_speaker_6"

In [8]:
# First example prompt (with bracketed non-speech tags).
# NOTE: the following cell reassigns `text_prompt`, so on a top-to-bottom run
# this value is replaced before the processor is called.
text_prompt = (
    "\n"
    "[clears throat] Is this the real life? Is this just fantasy? [laughs]\n"
    "Caught in a landslide, no escape from reality!\n"
)

In [17]:
# Prompt that is actually passed to the processor in the next cell.
# NOTE(review): Bark's README lists "[sighs]" among its known non-speech tags;
# "[sigh]" is kept exactly as written here -- confirm which one is intended.
text_prompt = (
    "\n"
    "Hello darkness, my old friend. [sigh]\n"
    "I've come to talk with you again\n"
)

In [18]:
# Turn the text prompt and the chosen voice preset into model inputs.
inputs = processor(text_prompt, voice_preset=voice_preset)

In [19]:
# Inspect the processor output. As the stored output shows, `input_ids` holds
# the prompt's token ids followed by zero padding (these are not audio samples).
inputs

{'input_ids': tensor([[31178, 25100, 14010,   117, 15127, 12898, 20104,   119,   164, 11546,
         10237,   166,   146,   112, 10323, 10678, 10114, 31311, 10169, 13028,
         13123,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [20]:
# Move the processor output to the model's device and generate the audio.
# This call emits the attention-mask / pad-token warning shown in the stored output.
audio_arrays = model.generate(**inputs.to(device))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [21]:
# Convert the generated waveform into a NumPy array for playback and export.
#
# The conversion is guarded so this cell can be re-run safely: after the first
# run `audio_arrays` is already a NumPy array, which has no `.cpu()` method, so
# an unguarded second run would raise AttributeError.
if torch.is_tensor(audio_arrays):
    # .cpu()   copies the tensor from GPU to host memory (no effect if already on CPU).
    # .float() upcasts to float32; this is a no-op for the default float32 setup and
    #          keeps a half-precision (float16) run compatible with WAV export.
    # .numpy() converts the PyTorch tensor into a NumPy array.
    audio_arrays = audio_arrays.cpu().float().numpy()
# .squeeze() removes dimensions of size 1, e.g. shape (1, 44100, 1) becomes (44100,).
audio_arrays = audio_arrays.squeeze()

In [22]:
# Read the sampling rate from the model's generation config so that playback
# and export use the rate the model reports rather than a hard-coded value.
sample_rate = model.generation_config.sample_rate

In [23]:
# Render the waveform as an inline audio player widget (IPython.display.Audio),
# using the sampling rate read from the model above.
Audio(audio_arrays, rate=sample_rate)

In [None]:
# Optional: uncomment the line below to save the audio output as a WAV file
# scipy.io.wavfile.write("bark_out.wav", rate=sample_rate, data=audio_arrays)