# Create adversarial speaker

1. Use an audio file and its transcription to find the corresponding semantics
2. Use these semantics to construct a "first order" speaker and make some generations
3. Choose a good generation and create a "second order" speaker from it
4. Save the speaker

### Import stuff and load model

In [None]:
import torch
from transformers import BarkProcessor, BarkModel
from bark_tinkering.adversarial_speaker import find_semantics_by_wav, \
    create_voice_preset, save_voice_preset_safetensors, load_voice_preset_safetensors, load_voice_preset_numpy
from bark_tinkering.utils import make_text_generations
import IPython

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still loads instead of failing in `.to(device)` (generation will be
# much slower, and sampled outputs may differ from the GPU ones).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
transformers_cache_dir = '.cache'
model_id = 'suno/bark' # use 'suno/bark-small' if you don't have enough memory, but generations will differ

# Download (or reuse from the local cache directory) the processor and the model,
# then move the model to the selected device.
processor: BarkProcessor = BarkProcessor.from_pretrained(model_id, cache_dir=transformers_cache_dir)
model: BarkModel = BarkModel.from_pretrained(model_id, cache_dir=transformers_cache_dir).to(device)

# Set the sampling temperature of the semantic and coarse-acoustics stages to 1.0.
# NOTE(review): the original comment here read "default temperature is 0.7, I want
# model to be more conservative", but a higher temperature normally makes sampling
# *less* conservative. The value is left unchanged because the seeded results in
# this notebook depend on it -- confirm the intended value and the actual default.
model.generation_config.semantic_config['temperature'] = 1.0
model.generation_config.coarse_acoustics_config['temperature'] = 1.0

## 1. Find adversarial semantic for audio sample

**You can skip running the following cell if you just want to see the outputs**

In [None]:
# Fix torch's RNG seed so that re-running this cell is reproducible.
torch.manual_seed(952933090653967155)

# Search for semantics matching the audio sample and its transcription.
# The playback cell further down reads generations/lina/step_<N>/audio.wav
# for N = 1000, 1100, ..., 2000.
find_semantics_by_wav(
    model,
    processor,
    "One little spark and before you know it, the whole world is burning.",  # transcription of the sample
    './character_samples/lina.wav',  # source audio sample
    './generations/lina',  # presumably the output directory -- verify against find_semantics_by_wav
    lr=1e-1,
    save_every_n_steps=100,
    no_save_steps=999,
    steps=2000,
    device=device,
    perplexity_loss_weight=0.02,  # 0.02 to 0.2 usually ok
)

### Check generations on different steps. They should become increasingly close to the source audio sample

You can either do it from here or open generations/lina/all.m3u in your audio player. Usually a good generation appears somewhere around steps 1500-2000.

In [None]:
# Play back the audio stored for steps 1000, 1100, ..., 2000, labelling each
# player with its step number.
for step in range(1000, 2001, 100):
    print(f'step {step}')
    step_audio = IPython.display.Audio(f'generations/lina/step_{step}/audio.wav')
    IPython.display.display(step_audio)

I like the generation at step 1900, so let's make a speaker out of it

## 2. Create first order speaker and make some generations

In [None]:
# Fix torch's RNG seed so the generations below are reproducible.
torch.manual_seed(952933090653967155)

# Build the "first order" speaker from the original sample and the semantics
# chosen at step 1900.
voice_preset = create_voice_preset(
    model,
    'character_samples/lina.wav', # original audio sample
    'generations/lina/step_1900/semantic.pt' # semantic we found on step 1900
)

# Generate the same sentence 10 times with this speaker; the next cell plays
# back generations 0..9 from the output directory.
make_text_generations(model, processor, ["There are a lot of things I could talk about, but it would probably sound similar to this."] * 10,
                      'generations/lina_first_order_speaker_1900',  # plain literal: no placeholders, so no f-prefix
                      voice_preset=voice_preset)

Check the generations

In [None]:
# Play back each of the 10 first-order generations, labelled by index.
for gen in range(10):
    print(f'gen {gen}')
    first_order_audio = IPython.display.Audio(
        f'generations/lina_first_order_speaker_1900/{gen}/audio.wav'
    )
    IPython.display.display(first_order_audio)

Let's take generation 8 and make a second order speaker from it

## 3. Create second order speaker

In [None]:
# Fix torch's RNG seed so the generations below are reproducible.
torch.manual_seed(952933090653967155)

# Build the "second order" speaker from first-order generation 8: its audio
# and the semantics saved alongside it.
voice_preset = create_voice_preset(
    model,
    'generations/lina_first_order_speaker_1900/8/audio.wav',
    'generations/lina_first_order_speaker_1900/8/semantic.pt'
)

# Generate the same sentence 10 times with this speaker; the next cell plays
# back generations 0..9 from the output directory.
make_text_generations(model, processor, ["This is one bridge I don't mind burning."] * 10,
                      'generations/lina_second_order_speaker_8',  # plain literal: no placeholders, so no f-prefix
                      voice_preset=voice_preset)

Check the results

In [None]:
# Play back each of the 10 second-order generations, labelled by index.
for gen in range(10):
    print(f'gen {gen}')
    second_order_audio = IPython.display.Audio(
        f'generations/lina_second_order_speaker_8/{gen}/audio.wav'
    )
    IPython.display.display(second_order_audio)

## 4. Save speaker

In [None]:
# Rebuild the preset from first-order generation 8 (audio plus its saved
# semantics) so this cell can be run on its own, then write it to disk in
# safetensors format.
voice_preset = create_voice_preset(model,
                                   'generations/lina_first_order_speaker_1900/8/audio.wav',
                                   'generations/lina_first_order_speaker_1900/8/semantic.pt')
save_voice_preset_safetensors(
    voice_preset,
    'voice_presets/lina.safetensors',
)

## Load speaker and make some generations

In [None]:
# Load the speaker saved in the previous step.
voice_preset = load_voice_preset_safetensors('voice_presets/lina.safetensors')

# Fix torch's RNG seed so the generation below is reproducible.
torch.manual_seed(952933090653967155)
make_text_generations(model, processor, ["Hey, look at you, you got there!"],
                      'generations/lina_temp',  # plain literal: no placeholders, so no f-prefix
                      voice_preset=voice_preset)

# Keep this as the last expression of the cell so the notebook renders the player.
IPython.display.Audio('generations/lina_temp/0/audio.wav')