# SoundMime.com

Imitates the voice of a human voice.

- Soundstream for encoder/decoder.
- SoundStorm for text-to-speech generation

In [None]:
import torch
import torchaudio
import os
curr_dir = os.getcwd()

In [2]:
from audiolm_pytorch import EncodecWrapper
encodec = EncodecWrapper()
# Now you can use the encodec variable in the same way you'd use the soundstream variables below.

In [3]:
import soundstorm_pytorch
# torchaudio.set_audio_backend("soundfile") 
print(torchaudio.list_audio_backends())


['soundfile']


In [4]:
from soundstream import from_pretrained, load

# Provide initial audio input
curr_dir = os.getcwd()
file_path = os.path.join(curr_dir, 'audio_samples', 'sample1.wav')

waveform, sample_rate = torchaudio.load(file_path)
print(f"waveform.shape: {waveform.shape}")
print(f"sample_rate: {sample_rate}")

waveform = load(file_path)
audio = waveform.mean(dim=1, keepdim=False)

audio_codec = from_pretrained()  # downloads model from Hugging Face

# quantized = audio_codec(waveform, mode='encode')
# recovered = audio_codec(quantized, mode='decode')

# torchaudio.save('out.wav', recovered[0], 16000)


# # Identify soundstream to utilize.
# encodec = EncodecWrapper()

# # Use encodec as soundstream
# soundstream = encodec 
# # or train your own soundstream


waveform.shape: torch.Size([2, 4432896])
sample_rate: 48000


# SoundStream  
Neural audio compression. It focuses on compressing audio data efficiently while preserving as much quality as possible at low bitrates, aimed at real-time audio streaming or storage.

### Train the Soundstream

In [None]:
from audiolm_pytorch import SoundStream, SoundStreamTrainer

soundstream = SoundStream(
    codebook_size = 4096,
    rq_num_quantizers = 8,
    rq_groups = 2,                       # this paper proposes using multi-headed residual vector quantization - https://arxiv.org/abs/2305.02765
    use_lookup_free_quantizer = True,    # whether to use residual lookup free quantization - there are now reports of successful usage of this unpublished technique
    use_finite_scalar_quantizer = False, # whether to use residual finite scalar quantization
    attn_window_size = 128,              # local attention receptive field at bottleneck
    attn_depth = 2                       # 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better
)

folder_path = str(os.path.join(curr_dir,'audio_samples'))

trainer = SoundStreamTrainer(
    soundstream,
    folder = folder_path,
    batch_size = 3,
    grad_accum_every = 8,         # effective batch size of 24
    data_max_length_seconds = 2,  # train on 2 second audio
    num_train_steps = 1_000_000
).cpu()

trainer.train()

# after a lot of training, you can test the autoencoding as so
soundstream.eval() # your soundstream must be in eval mode, to avoid having the residual dropout of the residual VQ necessary for training

# Provide audio input
# audio = torch.randn(10080).cpu() 

# Reconstruct audio using soundstream
recons = soundstream(audio, return_recons_only = True) # (1, 10080) - 1 channel # reconstructed audio signal

TypeError: cannot unpack non-iterable NoneType object

### Trained Soundstream as Tokenizer

In [None]:
# Your trained Soundstream can then be used as a generic tokenizer for audio
# audio = torch.randn(1, 512 * 320) # input
codes = soundstream.tokenize(audio) # convert into codes

# you can now train anything with the codebook ids
recon_audio_from_codes = soundstream.decode_from_codebook_indices(codes)

# Sanity Check
# Compares the reconstructed audio from codes and audio
assert torch.allclose(
    recon_audio_from_codes,
    soundstream(audio, return_recons_only = True)
)

NameError: name 'soundstream' is not defined

In [18]:
torchaudio.set_audio_backend("soundfile")

# SoundStorm

In [None]:
import torch
from soundstorm_pytorch import SoundStorm, ConformerWrapper

conformer = ConformerWrapper(
    codebook_size = 1024,
    num_quantizers = 12,
    conformer = dict(
        dim = 512,
        depth = 2
    ),
)

model = SoundStorm(
    conformer,
    steps = 18,          # 18 steps, as in original maskgit paper
    schedule = 'cosine'  # currently the best schedule is cosine
)

# get your pre-encoded codebook ids from the soundstream from a lot of raw audio
codes = torch.randint(0, 1024, (2, 1024, 12)) # (batch, seq, num residual VQ)

# do the below in a loop for a ton of data

loss, _ = model(codes)
loss.backward()

# model can now generate in 18 steps. ~2 seconds sounds reasonable

generated = model.generate(1024, batch_size = 2) # (2, 1024)

100%|██████████| 18/18 [00:47<00:00,  2.63s/it]
  6%|▌         | 1/18 [00:02<00:49,  2.91s/it]
  6%|▌         | 1/18 [00:02<00:49,  2.91s/it]
  6%|▌         | 1/18 [00:02<00:47,  2.78s/it]
  6%|▌         | 1/18 [00:02<00:49,  2.89s/it]
  6%|▌         | 1/18 [00:02<00:49,  2.89s/it]
  6%|▌         | 1/18 [00:03<00:54,  3.18s/it]
  6%|▌         | 1/18 [00:02<00:50,  2.96s/it]
  6%|▌         | 1/18 [00:03<00:53,  3.12s/it]
  6%|▌         | 1/18 [00:02<00:50,  2.94s/it]
  6%|▌         | 1/18 [00:03<00:52,  3.09s/it]
  6%|▌         | 1/18 [00:03<00:52,  3.09s/it]


In [None]:
import torch
from soundstorm_pytorch import SoundStorm, ConformerWrapper
from soundstream_pytorch import SoundStream

# Initialize SoundStream Codec
soundstream = SoundStream(
    codebook_size=1024,  # Must match SoundStorm's codebook size
    num_quantizers=12,   # Must match SoundStorm's num_quantizers
    channels=32,
    strides=(2, 4, 5),   # Compression strides
    multipliers=(2, 4, 8),
    use_residual=True
)

# Initialize SoundStorm with the same parameters
conformer = ConformerWrapper(
    codebook_size=1024,
    num_quantizers=12,
    conformer=dict(
        dim=512,
        depth=2
    ),
)

model = SoundStorm(
    conformer,
    steps=18,           # 18 steps, as in original maskgit paper
    schedule='cosine'   # currently the best schedule is cosine
)

# ====== Training Step ======

# Example raw audio data (batch of 2 mono audio signals, 16000 samples each)
raw_audio = torch.randn(2, 16000)  # Replace with actual audio data

# Step 1: Encode raw audio to quantized codes
codes = soundstream.encode(raw_audio)  # Shape: (batch, seq, num_quantizers)

# Step 2: Train SoundStorm with these codes
loss, _ = model(codes)
loss.backward()

# ====== Generation Step ======

# Step 3: Generate new codes using SoundStorm
generated_codes = model.generate(1024, batch_size=2)  # Shape: (2, 1024)

# Step 4: Decode generated codes back to audio
generated_audio = soundstream.decode(generated_codes)


ModuleNotFoundError: No module named 'soundstream_pytorch'

In [None]:
torch.randn(2, 16000)
print(type(torch.randn(2,16000)))

<class 'torch.Tensor'>


In [None]:
import torch
from soundstorm_pytorch import SoundStorm, ConformerWrapper, Conformer, SoundStream

conformer = ConformerWrapper(
    codebook_size = 1024,
    num_quantizers = 12,
    conformer = dict(
        dim = 512,
        depth = 2
    ),
)

soundstream = SoundStream(
    codebook_size = 1024,
    rq_num_quantizers = 12,
    attn_window_size = 128,
    attn_depth = 2
)

model = SoundStorm(
    conformer,
    soundstream = soundstream   # pass in the soundstream
)

# find as much audio you'd like the model to learn

audio = torch.randn(2, 10080)

# course it through the model and take a gazillion tiny steps

loss, _ = model(audio)
loss.backward()

# and now you can generate state-of-the-art speech

generated_audio = model.generate(seconds = 30, batch_size = 2)  # generate 30 seconds of audio (it will calculate the length in seconds based off the sampling frequency and cumulative downsamples in the soundstream passed in above)

100%|██████████| 18/18 [01:54<00:00,  6.36s/it]
  6%|▌         | 1/18 [00:05<01:39,  5.82s/it]
  6%|▌         | 1/18 [00:06<01:51,  6.56s/it]
  6%|▌         | 1/18 [00:05<01:36,  5.68s/it]
  6%|▌         | 1/18 [00:06<01:50,  6.48s/it]
  6%|▌         | 1/18 [00:05<01:38,  5.80s/it]
  6%|▌         | 1/18 [00:06<01:42,  6.00s/it]
  6%|▌         | 1/18 [00:05<01:27,  5.15s/it]
  6%|▌         | 1/18 [00:05<01:34,  5.59s/it]
  6%|▌         | 1/18 [00:05<01:37,  5.72s/it]
  6%|▌         | 1/18 [00:05<01:39,  5.83s/it]
  6%|▌         | 1/18 [00:06<01:43,  6.10s/it]


In [None]:
from spear_tts_pytorch import TextToSemantic
from sentence_transformers import SentenceTransformer

text_to_semantic = SentenceTransformer('all-MiniLM-L6-v2')


text_to_semantic = TextToSemantic(
    dim = 512,
    source_depth = 12,
    target_depth = 12,
    num_text_token_ids = 50000,
    num_semantic_token_ids = 20000,
    use_openai_tokenizer = False
)

# # load the trained text-to-semantic transformer

# text_to_semantic.load(model)

# pass it into the soundstorm

model = SoundStorm(
    conformer,
    soundstream = soundstream,
    spear_tts_text_to_semantic = text_to_semantic
).cpu()

# and now you can generate state-of-the-art speech

generated_speech = model.generate(
    texts = [
        'the rain in spain stays mainly in the plain',
        'the quick brown fox jumps over the lazy dog'
    ],
    seconds=30,  # specify the number of seconds of audio to generate
    batch_size=2
) # (2, n) - raw waveform decoded from soundstream

100%|██████████| 18/18 [01:39<00:00,  5.55s/it]
  6%|▌         | 1/18 [00:05<01:31,  5.40s/it]
  6%|▌         | 1/18 [00:05<01:34,  5.54s/it]
  6%|▌         | 1/18 [00:04<01:23,  4.89s/it]
  6%|▌         | 1/18 [00:05<01:26,  5.10s/it]
  6%|▌         | 1/18 [00:04<01:21,  4.77s/it]
  6%|▌         | 1/18 [00:05<01:26,  5.11s/it]
  6%|▌         | 1/18 [00:05<01:25,  5.05s/it]
  6%|▌         | 1/18 [00:04<01:22,  4.88s/it]
  6%|▌         | 1/18 [00:04<01:17,  4.56s/it]
  6%|▌         | 1/18 [00:04<01:13,  4.30s/it]
  6%|▌         | 1/18 [00:05<01:29,  5.26s/it]


In [None]:
generated_speech.shape


torch.Size([2, 480000])

In [None]:
# Convert the tensor to a NumPy array
audio_np = generated_speech.numpy()
# Define the sample rate (e.g., 44100 Hz)
sample_rate = 16000

# Save the audio file (in stereo)
sf.write('output_audio.wav', audio_np.T, sample_rate)  # `.T` transposes to [samples, channels]

In [None]:
if len(generated_speech.shape) == 1:  # If it's a 1D tensor (mono)
    generated_speech = generated_speech.unsqueeze(0)  # Add a channel dimension


In [None]:
torchaudio.set_audio_backend("soundfile")


In [16]:
import torchaudio
import soundfile as sf

# Ensure correct backend
torchaudio.set_audio_backend("sox_io")

# Prepare the tensor (ensure it's 2D for torchaudio)
if len(generated_speech.shape) == 1:  # If it's a 1D tensor (mono)
    generated_speech = generated_speech.unsqueeze(0)  # Add a channel dimension

# Normalize (if needed)
generated_speech = generated_speech / torch.max(torch.abs(generated_speech))

# Define the output file path
output_file = "generated_speech.wav"

try:
    # Save the tensor as a WAV file using torchaudio
    torchaudio.save(output_file, generated_speech, 22050)
except RuntimeError as e:
    print(f"Error with torchaudio save: {e}")
    # If torchaudio fails, use soundfile as a fallback
    generated_speech_np = generated_speech.numpy()
    sf.write(output_file, generated_speech_np.T, 22050)  # Transpose if necessary


ModuleNotFoundError: No module named 'soundfile'