# Bitwav Tokenizer Demo

This notebook demonstrates how to use the Bitwav tokenizer library for:

- Speech feature extraction (content tokens and global embeddings)
- Speech resynthesis from features
- Voice conversion


## 1. Imports

In [None]:
import IPython.display as ipd

## 2. Load Models and Audio Data

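Load the Bitwav model and its vocoder onto the GPU (the code below calls `.cuda()`, so a CUDA-capable device is required), then pick two example audio files: a source utterance and a reference utterance.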


In [None]:
from bitwav import BitwavModel, load_audio, load_vocoder, vocode

# Fetch the pretrained Bitwav checkpoint, put it in evaluation mode and move it to the GPU.
# "bitwav/bitwav-12.5hz" can be used here instead.
model = BitwavModel.from_pretrained("bitwav/bitwav-25hz-clean").eval().cuda()
print("Bitwav model loaded")

# The vocoder to load is named in the model config; it is moved to the GPU as well.
vocoder = load_vocoder(model.config.vocoder_name)
vocoder = vocoder.cuda()
print("Vocoder loaded")

In [None]:
import random
from glob import glob

# Find example audio files. Please change the path to your local LibriTTS dataset, or use your own audio files.
AUDIO_GLOB = "/path/to/LibriTTS/test-clean/**/*.wav"
EXAMPLE_SEED = 0  # change this to draw a different source/reference pair

# glob returns matches in arbitrary order, so sort first to make the seeded draw repeatable.
audio_paths = sorted(glob(AUDIO_GLOB, recursive=True))

# Two distinct files are needed (source + reference). Fail with an actionable message
# instead of a bare IndexError when the placeholder path has not been edited.
if len(audio_paths) < 2:
    raise FileNotFoundError(
        f"Need at least 2 audio files (source and reference) but found {len(audio_paths)} "
        f"matching {AUDIO_GLOB!r}. Point AUDIO_GLOB at your local dataset or your own audio files."
    )

# Shuffle with a dedicated seeded RNG: the same pair is selected on every
# Restart & Run All (for an unchanged file set) and the global `random` state is left untouched.
random.Random(EXAMPLE_SEED).shuffle(audio_paths)
example_audio = audio_paths[0]
reference_audio_path = audio_paths[1]
print("Using audio files:")
print(f"  Source: {example_audio}")
print(f"  Reference: {reference_audio_path}")

In [None]:
# Read the source utterance at the sample rate the model is configured for
sample_rate = model.config.sample_rate
waveform = load_audio(example_audio, sample_rate=sample_rate)

print(f"Sample rate: {sample_rate} Hz")
print(f"Source audio shape: {waveform.shape}")

# Show an inline audio player for the source utterance
source_player = ipd.Audio(waveform, rate=sample_rate)
ipd.display(source_player)

## 3. Extract Speech Features

Extract content tokens and global embeddings from the audio using the Bitwav model.


In [None]:
# Encode the source waveform (moved to the GPU first) into Bitwav features
features = model.encode(waveform.cuda())

# List every attribute of the returned feature object with its shape and dtype
print("Extracted features:")
for name, tensor in vars(features).items():
    print(f"  {name}: {tensor.shape} {tensor.dtype}")

# What the features hold:
# - content_token_indices: Discrete tokens representing linguistic content (seq_len,)
# - content_embedding: Continuous version of the content tokens (seq_len, dim_content)
# - global_embedding: Utterance-level global embedding (dim_global,)

## 4. Speech Resynthesis from Features

Resynthesize speech from the extracted features.


In [None]:
# Decode the discrete content tokens plus the global embedding back into a mel spectrogram
mel_spectrogram = model.decode(
    global_embedding=features.global_embedding,
    content_token_indices=features.content_token_indices,
)

# The vocoder is given the spectrogram with a leading batch dimension added
mel_batch = mel_spectrogram.unsqueeze(0)
resynthesized_waveform = vocode(vocoder, mel_batch)  # (1, samples)

# No target length is passed to decode here: it is estimated from the content token length,
# so the output may not match the original audio length
print(f"Resynthesized waveform shape: {resynthesized_waveform.shape}")
resynthesized_player = ipd.Audio(resynthesized_waveform.cpu(), rate=sample_rate)
ipd.display(resynthesized_player)

## 5. Voice Conversion

Convert the source utterance to the reference speaker's voice while preserving its linguistic content: the source's content embedding is decoded together with the reference's global embedding.

In [None]:
# Read the reference utterance at the model's sample rate and play it back
reference_audio = load_audio(reference_audio_path, sample_rate=sample_rate)
print(f"Reference audio shape: {reference_audio.shape}")
reference_player = ipd.Audio(reference_audio, rate=sample_rate)
ipd.display(reference_player)

# Request only what each side contributes: content from the source, global embedding from the reference
source_features = model.encode(waveform.cuda(), return_global=False, return_content=True)
reference_features = model.encode(reference_audio.cuda(), return_global=True, return_content=False)

# Decode source content with the reference global embedding, asking for the source's length
source_length = waveform.size(0)
converted_mel_spectrogram = model.decode(
    target_audio_length=source_length,
    global_embedding=reference_features.global_embedding,
    content_embedding=source_features.content_embedding,
)

# Alternative: the convenience method
# converted_mel_spectrogram = model.voice_conversion(source_waveform=waveform.cuda(), reference_waveform=reference_audio.cuda())

# Vocode the converted spectrogram (with a leading batch dimension) into a waveform
converted_mel_batch = converted_mel_spectrogram.unsqueeze(0)
converted_waveform = vocode(vocoder, converted_mel_batch)  # (1, samples)

print(f"Converted waveform shape: {converted_waveform.shape}")
converted_player = ipd.Audio(converted_waveform.cpu(), rate=sample_rate)
ipd.display(converted_player)