# Speech Resynthesis Using Conditional Flow Matching and Whisper Units

In [None]:
!pip install -r requirements.txt

In [None]:
!git clone https://huggingface.co/spaces/sarulab-speech/UTMOS-demo src/utmos

!patch src/utmos/lightning_module.py src/patch/utmos_lightning_module.patch

In [None]:
!wget -t 0 -c -P data https://www.openslr.org/resources/141/test_clean.tar.gz
!tar zxf data/test_clean.tar.gz -C data

In [None]:
import torchaudio
from IPython.display import Audio

from src.flow_matching.models import ConditionalFlowMatchingWithBigVGan
from src.flow_matching.utils.whisper import WhisperFeatureExtractor, WhisperEncoder

In [None]:
# Whisper front end: the feature extractor and the unit encoder are both loaded
# from the same pretrained identifier; the encoder is moved to the GPU.
whisper_repo = "ryota-komatsu/whisper-large-v3-tokenizer"
feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_repo)
encoder = WhisperEncoder.from_pretrained(whisper_repo)
encoder = encoder.cuda()

# Pretrained flow-matching decoder, downloaded from the Hugging Face Hub and
# likewise placed on the GPU.
decoder_repo = "ryota-komatsu/flow_matching_with_bigvgan"
decoder = ConditionalFlowMatchingWithBigVGan.from_pretrained(decoder_repo)
decoder = decoder.cuda()

In [None]:
# Input utterance to resynthesize. The path is relative to the working directory
# and points inside data/ (presumably created by extracting test_clean.tar.gz
# with `-C data` in the download cell — verify if the archive layout changes).
wav_path = "data/LibriTTS_R/test-clean/121/121726/121_121726_000004_000003.wav"

Load the waveform and resample it to 16 kHz.

In [None]:
# torchaudio.load yields the audio tensor together with the file's own sampling
# rate; the audio is then converted to the 16 kHz used throughout this notebook.
waveform, sr = torchaudio.load(wav_path)
waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)

Encode the waveform into pseudo-phonetic units with the Whisper encoder.

In [None]:
# Drop the leading axis and hand the samples to the feature extractor as a NumPy
# array (assumes `waveform` is single-channel with a leading axis of size 1 —
# TODO confirm for other inputs).
mono_samples = waveform.squeeze(0).numpy()
extracted = feature_extractor(
    mono_samples,
    sampling_rate=16000,
    return_tensors="pt",
    padding="do_not_pad",
    device="cuda",
)
input_features = extracted.input_features.to("cuda")

# Encode to discrete units, add a leading axis, and shift every id by one so
# that id 0 stays reserved for padding.
units = encoder.encode(input_features).unsqueeze(0) + 1  # 0: pad

Resynthesize speech from the units.

In [None]:
# Run the decoder on the unit sequence, keep the first item of its output, and
# bring it back to host memory so it can be played in the notebook.
audio_values = decoder(units)[0].cpu()

Original speech (resampled to 16 kHz):

In [None]:
# Player for the input recording; the rate matches the 16 kHz it was resampled to.
Audio(data=waveform, rate=16000)

Resynthesized speech (sampled from the decoder):

In [None]:
# Player for the decoder output (assumes the decoder emits 16 kHz audio — TODO confirm).
Audio(data=audio_values, rate=16000)