<a href="https://colab.research.google.com/github/hussainturii/TTS/blob/main/f5_tts2(with_tacotron).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mini F5-TTS starts from here.

In [None]:
# Colab: installs GPU builds (cu121). If you have different CUDA locally,
# install matching torch/torchaudio wheels for your system.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q librosa soundfile

In [None]:
import torch
import torchaudio
from IPython.display import Audio, display

# --- Text-to-speech demo: Tacotron2 + WaveRNN (pretrained torchaudio bundle) ---

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The bundle hands out every component of the pipeline: the text processor,
# the Tacotron2 model and the vocoder. Both models are moved to `device`.
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

# The vocoder reports the sample rate of the audio it produces.
sr = vocoder.sample_rate

# Sentence to synthesise.
text = "Hello sir, I love You."

# Encode the text, then place the encoded inputs on the same device as the models.
inputs, lengths = processor(text)
inputs, lengths = inputs.to(device), lengths.to(device)

# Text -> spectrogram -> waveform, with autograd bookkeeping switched off.
with torch.inference_mode():
    spec, spec_lengths, _ = tacotron2.infer(inputs, lengths)
    waveform, _ = vocoder(spec, spec_lengths)

# Drop the leading dimension (when it has size 1) and bring the audio to the CPU.
waveform = waveform.squeeze(0).cpu()

# Write the audio to disk (a leading dimension is re-added for the saved
# tensor), report where it went, then render an inline player.
torchaudio.save("tts_output.wav", waveform[None], sr)
print(f"Saved tts_output.wav at {sr} Hz")
display(Audio(waveform.numpy(), rate=sr))


Downloading: "https://download.pytorch.org/torchaudio/models/tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pth


100%|██████████| 107M/107M [00:00<00:00, 194MB/s]


Downloading: "https://download.pytorch.org/torchaudio/models/wavernn_10k_epochs_8bits_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/wavernn_10k_epochs_8bits_ljspeech.pth


100%|██████████| 16.7M/16.7M [00:00<00:00, 79.2MB/s]


Saved tts_output.wav at 22050 Hz


  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


In [None]:
import torch
import torchaudio
from IPython.display import Audio, display

# --- Text-to-speech demo: Tacotron2 spectrograms vocoded with HiFi-GAN ---

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Tacotron2 (spectrogram generator) and its text processor, both taken
#    from the Tacotron2/WaveRNN bundle; the bundle's own vocoder is not loaded.
tacotron_bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
processor = tacotron_bundle.get_text_processor()
tacotron2 = tacotron_bundle.get_tacotron2().to(device)

# 2. HiFi-GAN vocoder (used in place of WaveRNN), from torchaudio's prototype pipelines.
from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as hifigan_bundle
vocoder = hifigan_bundle.get_vocoder().to(device)
sr = hifigan_bundle.sample_rate   # sample rate of the vocoder's output

# 3. Encode the input text and place it on the same device as the models.
text = "Hello, david! I love you more naturally."
inputs, lengths = processor(text)
inputs = inputs.to(device)
lengths = lengths.to(device)

# 4-5. Tacotron2 produces the mel spectrogram, HiFi-GAN turns it into a
#      waveform; both run with autograd bookkeeping switched off.
with torch.inference_mode():
    spec, spec_lengths, _ = tacotron2.infer(inputs, lengths)
    waveform = vocoder(spec)

# 6. Bring the audio to the CPU and normalise its shape for torchaudio:
#    collapse every size-1 dimension to get 1D [time], then add a channel
#    dimension back → [1, time].
waveform = waveform.squeeze(0).cpu()
waveform = waveform.squeeze().unsqueeze(0)

torchaudio.save("tts_hifigan.wav", waveform, sr)
print(f"Saved tts_hifigan.wav at {sr} Hz")

# Inline playback; the player is given the audio with the channel dimension removed.
display(Audio(waveform.squeeze().numpy(), rate=sr))


  vocoder = hifigan_bundle.get_vocoder().to(device)


Saved tts_hifigan.wav at 22050 Hz
