# Bark Experiments 
### Using the Hugging Face Transformers library

**Author**: Kenneth Leung  
**Last Updated**: 02 Oct 2023

___

## Initial Setup

In [None]:
# Import dependencies
from transformers import AutoProcessor, BarkModel
from IPython.display import Audio
import torch
import scipy

In [None]:
# Select the compute device: CUDA when torch.cuda.is_available() is True, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Define parameters - precision (dtype) and model checkpoint name,
# both passed to from_pretrained() when loading the processor and model
torch_dtype = torch.float32 # Default
model_type = "suno/bark"  # Default. Alternative model is "suno/bark-small"

In [None]:
# Instantiate processor and Bark model from the same checkpoint (model_type)
# NOTE(review): torch_dtype is also forwarded to the processor; presumably it only
# matters for the model weights - confirm whether the processor needs it
processor = AutoProcessor.from_pretrained(model_type, torch_dtype=torch_dtype)
model = BarkModel.from_pretrained(model_type, torch_dtype=torch_dtype)

In [None]:
# Move the model to the selected device (CUDA if available, otherwise CPU)
model = model.to(device)
# Echo the device the model reports, as the cell output
model.device

In [None]:
# View model architecture details (the bare expression is echoed as the cell output)
model

In [None]:
# Define voice preset (optional; passed to the processor as voice_preset)
voice_preset = "v2/en_speaker_6"

# Define text prompt (bracketed tags such as [laughs] are the non-verbal
# effects explored later in this notebook)
text_prompt = '''
[clears throat] Is this the real life? Is this just fantasy? [laughs]
Caught in a landslide, no escape from reality!
'''

___
## Generate Audio Output

In [None]:
# Pass voice preset and text prompt into processor object
inputs = processor(text=text_prompt, voice_preset=voice_preset)

# View the processor output that will be passed to model.generate()
inputs

In [None]:
# Move the processor output to the same device as the model, then
# generate the output audio tensor from the Bark model
audio_arrays = model.generate(**inputs.to(device))

In [None]:
# Move the output tensor to CPU, convert it into a NumPy array,
# then remove dimensions of size 1 from the array's shape
audio_arrays = audio_arrays.cpu().numpy().squeeze()

In [None]:
# Read the sampling rate from the model's generation config (Default is 24kHz)
sample_rate = model.generation_config.sample_rate

In [None]:
# Wrap the audio array in an IPython Audio object so it can be played in the notebook
Audio(audio_arrays, rate=sample_rate)

In [None]:
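# Save audio output as a WAV file (uncomment the line below to write bark_out.wav)
# NOTE(review): with only `import scipy`, older SciPy versions may need an explicit
# `import scipy.io.wavfile` before this call works - confirm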
# scipy.io.wavfile.write("bark_out.wav", rate=sample_rate, data=audio_arrays)

___
## Exploration of Bark's Capabilities with Prompt Engineering

In [None]:
# Helper that turns a text prompt into a waveform array
def generate_audio(text_prompt, voice_preset=None):
    """Synthesize audio for ``text_prompt`` with the notebook's Bark model.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        text_prompt: Text passed to the processor as ``text``.
        voice_preset: Voice preset forwarded to the processor; defaults to
            ``None``.

    Returns:
        The generated output moved to CPU, converted to a NumPy array, with
        dimensions of size 1 removed.
    """
    # Encode the prompt and place the encoded inputs on the model's device
    encoded = processor(text=text_prompt, voice_preset=voice_preset)
    encoded = encoded.to(device)

    # Generate, then bring the result back to CPU as a squeezed NumPy array
    generated = model.generate(**encoded)
    waveform = generated.cpu().numpy()
    return waveform.squeeze()

### (1) Multilingual Speech Synthesis

#### (1.1) Mandarin

In [None]:
# Mandarin prompt; no voice preset is passed, so generate_audio uses voice_preset=None
text_prompt = '千里之行，始于足下。所以，不要害怕迈出第一步'

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

In [None]:
# NOTE(review): this cell is identical to the previous Mandarin cell; presumably it is a
# second run to compare outputs for the same prompt - confirm, or vary the prompt/preset
text_prompt = '千里之行，始于足下。所以，不要害怕迈出第一步'

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

#### (1.2) Hindi

In [None]:
# Hindi prompt; no voice preset is passed, so generate_audio uses voice_preset=None
text_prompt = 'धैर्य और समय, दोनों ही बहुत महत्वपूर्ण हैं। जब तुम अपने लक्ष्य की ओर बढ़ रहे हो, तो इन दोनों की आवश्यकता होती है। धैर्य रखो और समय दो, सफलता जरूर मिलेगी।'

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

___
### (2) Code Switching

#### (2.1) French - English

In [None]:
# Single prompt that switches from a French sentence to an English quotation
text_prompt = '''
Mon père me disait toujours: 
"The best way to predict the future is to create it."
'''

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

___
### (3) Non-Verbal Effects

#### (3.1) Mix of Non-Verbal Effects

In [None]:
# Prompt combining bracketed non-verbal tags ([laughs], [sighs]) with capitalised
# words and an ellipsis (presumably for emphasis and hesitation - confirm)
text_prompt = '''
Remember when we tried cooking that 5-minute recipe, but it took us THREE HOURS? [laughs]
And the kitchen looked like ... a war zone afterwards! [sighs]
'''

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

#### (3.2) Speaker Prompts
(Does not consistently give the correct output)

In [None]:
# Each line is prefixed with a speaker label (WOMAN: / MAN:) to prompt a two-speaker dialogue
text_prompt = '''
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
'''

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

___
### (4) Music

In [None]:
# Lyrics wrapped in ♪ symbols (presumably the cue for sung output - confirm)
text_prompt = '♪ I want to break free! I want to break FREE!!  ♪'

audio_arrays = generate_audio(text_prompt)
Audio(audio_arrays, rate=model.generation_config.sample_rate)

___