# SoundMime.com

Imitates the voice of a human voice.

- Soundstream for encoder/decoder.
- SoundStorm for text-to-speech generation

In [1]:
import torch
import torchaudio
import os
curr_dir = os.getcwd()

In [2]:
import soundstorm_pytorch
# torchaudio.set_audio_backend("soundfile") 
print(torchaudio.list_audio_backends())

['soundfile']


# Choose your pretrained models

In [3]:
from audiolm_pytorch import EncodecWrapper
from soundstream import from_pretrained
encodec = EncodecWrapper()
# Now you can use the encodec variable in the same way you'd use the soundstream variables below.
# or use audio_codec
ss_model = from_pretrained()  # downloads model from Hugging Face
ss_model.eval()
print(type(ss_model))


<class 'soundstream.soundstream.SoundStream'>


In [4]:
from soundstream import load

# Provide initial audio input
curr_dir = os.getcwd()
file_path = os.path.join(curr_dir, 'audio_samples', 'sample1.wav')
audio = load(file_path)
print(f"audio.shape: {audio.shape}")

encoded_features = ss_model.encoder(audio)  # Ensure audio is in the correct format
# Pass the features through the quantizer to get the codes
quantized, codes, _ = ss_model.quantizer(encoded_features.permute(0, 2, 1))  # Adjust dimensions if needed

# quantized = audio_codec(waveform, mode='encode')
# recovered = audio_codec(quantized, mode='decode')

# torchaudio.save('out.wav', recovered[0], 16000)


# # Identify soundstream to utilize.
# encodec = EncodecWrapper()

# # Use encodec as soundstream
# soundstream = encodec 
# # or train your own soundstream

# Save the codes to a file
torch.save(codes, "codes.pt")


audio.shape: torch.Size([1, 1, 1477632])


# AudioLM
## SoundStream & Encodec
Neural audio compression. It focuses on compressing audio data efficiently while preserving as much quality as possible at low bitrates, aimed at real-time audio streaming or storage.

### Train the Soundstream

In [4]:
from audiolm_pytorch import SoundStream, SoundStreamTrainer

soundstream = SoundStream(
    codebook_size = 4096,
    rq_num_quantizers = 8,
    rq_groups = 2,                       # this paper proposes using multi-headed residual vector quantization - https://arxiv.org/abs/2305.02765
    use_lookup_free_quantizer = True,    # whether to use residual lookup free quantization - there are now reports of successful usage of this unpublished technique
    use_finite_scalar_quantizer = False, # whether to use residual finite scalar quantization
    attn_window_size = 128,              # local attention receptive field at bottleneck
    attn_depth = 2                       # 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better
)

folder_path = str(os.path.join(curr_dir,'audio_samples'))

trainer = SoundStreamTrainer(
    soundstream,
    folder = folder_path,
    batch_size = 3,
    grad_accum_every = 8,         # effective batch size of 24
    data_max_length_seconds = 2,  # train on 2 second audio
    num_train_steps = 1_000_000
).cpu()

trainer.train()

# after a lot of training, you can test the autoencoding as so
soundstream.eval() # your soundstream must be in eval mode, to avoid having the residual dropout of the residual VQ necessary for training

# Provide audio input
# audio = torch.randn(10080).cpu() 

# Reconstruct audio using soundstream
recons = soundstream(audio, return_recons_only = True) # (1, 10080) - 1 channel # reconstructed audio signal

training with dataset of 44 samples and validating with randomly splitted 3 samples


KeyboardInterrupt: 

### Trained Soundstream as Tokenizer

In [6]:
# Your trained Soundstream can then be used as a generic tokenizer for audio
# audio = torch.randn(1, 512 * 320) # input
# codes = soundstream.tokenize(audio) # convert into codes
codes = torch.load('codes.pt') # (batch, seq, num residual VQ)
# # you can now train anything with the codebook ids
recon_audio_from_codes = ss_model.decode_from_codebook_indices(codes)

# # Sanity Check
# # Compares the reconstructed audio from codes and audio
# assert torch.allclose(
#     recon_audio_from_codes,
#     soundstream(audio, return_recons_only = True)
# )

AttributeError: 'SoundStream' object has no attribute 'decode_from_codebook_indices'

# SoundStorm

In [7]:
import torch
from soundstorm_pytorch import SoundStorm, ConformerWrapper

conformer = ConformerWrapper(
    codebook_size = 1024,
    num_quantizers = 16,
    conformer = dict(
        dim = 512,
        depth = 2
    ),
)

model = SoundStorm(
    conformer,
    steps = 18,          # 18 steps, as in original maskgit paper
    schedule = 'cosine'  # currently the best schedule is cosine
)

# get your pre-encoded codebook ids from the soundstream from a lot of raw audio
codes = torch.load('codes.pt') # (batch, seq, num residual VQ)

# do the below in a loop for a ton of data
loss, _ = model(codes)
loss.backward()

# model can now generate in 18 steps. ~2 seconds sounds reasonable

generated = model.generate(1024, batch_size = 2) # (2, 1024)

100%|██████████| 18/18 [01:06<00:00,  3.67s/it]
  6%|▌         | 1/18 [00:03<01:03,  3.72s/it]
  6%|▌         | 1/18 [00:03<00:59,  3.47s/it]
  6%|▌         | 1/18 [00:03<01:05,  3.88s/it]
  6%|▌         | 1/18 [00:03<01:00,  3.54s/it]
  6%|▌         | 1/18 [00:03<01:02,  3.69s/it]
  6%|▌         | 1/18 [00:03<01:05,  3.85s/it]
  6%|▌         | 1/18 [00:03<00:57,  3.40s/it]
  6%|▌         | 1/18 [00:03<01:01,  3.62s/it]
  6%|▌         | 1/18 [00:03<01:02,  3.68s/it]
  6%|▌         | 1/18 [00:03<00:57,  3.40s/it]
  6%|▌         | 1/18 [00:03<01:06,  3.90s/it]
  6%|▌         | 1/18 [00:03<01:01,  3.62s/it]
  6%|▌         | 1/18 [00:03<01:01,  3.60s/it]
  6%|▌         | 1/18 [00:04<01:09,  4.09s/it]
  6%|▌         | 1/18 [00:03<01:00,  3.58s/it]


In [23]:
from spear_tts_pytorch import TextToSemantic
from sentence_transformers import SentenceTransformer
text_to_semantic = SentenceTransformer('all-MiniLM-L6-v2')

text_to_semantic = TextToSemantic(
    dim = 512,
    source_depth = 12,
    target_depth = 12,
    num_text_token_ids = 50000,
    num_semantic_token_ids = 20000,
    use_openai_tokenizer = False
)

# # load the trained text-to-semantic transformer

# text_to_semantic.load(model)

# pass it into the soundstorm

model_tts = SoundStorm(
    conformer,
    soundstream = ss_model,
    spear_tts_text_to_semantic = text_to_semantic
).cpu()

# and now you can generate state-of-the-art speech

generated_speech = model_tts.generate(
    texts = [
        'the rain in spain stays mainly in the plain',
        'the quick brown fox jumps over the lazy dog'
    ],
    seconds=30,  # specify the number of seconds of audio to generate
    batch_size=2
) # (2, n) - raw waveform decoded from soundstream

BeartypeCallHintParamViolation: Method soundstorm_pytorch.soundstorm.SoundStorm.__init__() parameter soundstream="SoundStream(
  (encoder): Encoder(
    (layers): Sequential(
      (0): CausalConv1d(
    ...)" violates type hint audiolm_pytorch.soundstream.SoundStream | None, as <protocol "soundstream.soundstream.SoundStream"> "SoundStream(
  (encoder): Encoder(
    (layers): Sequential(
      (0): CausalConv1d(
    ...)" not <class "builtins.NoneType"> or <protocol "audiolm_pytorch.soundstream.SoundStream">.