<a href="https://colab.research.google.com/github/hirdeshkumar2407/NLP_Group_Assigment/blob/main/extension/3_TTS_working_maybe_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports:

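Run the installation cell once, restart the session so the newly installed packages are picked up, then run the notebook again from the top: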

In [1]:
# --- Step 1: Ensure GPU Runtime and Install Coqui TTS ---
# Make sure your Colab runtime is set to GPU (Runtime -> Change runtime type -> T4 GPU or similar)

print("Installing Coqui TTS...")
!pip install TTS soundfile==0.12.1 # soundfile version might need to be pinned for compatibility
print("Coqui TTS installed.")

# Install espeak-ng for phonemization
print("Installing espeak-ng...")
!apt-get update && apt-get install -y espeak-ng
print("espeak-ng installed.")


Installing Coqui TTS...
Coqui TTS installed.
Installing espeak-ng...
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
R

In [2]:
import torch
from TTS.api import TTS
import soundfile as sf
from IPython.display import Audio
import os # For checking file existence

Checking the device:

In [3]:
# Report the runtime environment before deciding where inference will run.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# --- Step 2: Determine Device ---
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

PyTorch version: 2.6.0+cu124
CUDA available: True
Using device: cuda


In [4]:
# --- Step 3: Load the VITS Model ---
# This will download the model (approx. 150MB) the first time.
# The 'tts_models/en/ljspeech/vits' model is a single-speaker model,
# so it doesn't require a speaker_wav for voice cloning.
VITS_MODEL_NAME = "tts_models/en/ljspeech/vits"

print("\nLoading VITS model for LJSpeech (this may take a moment)...")
try:
    # Place the model with .to(device); the `gpu=` constructor argument is
    # deprecated in recent Coqui TTS releases.
    tts_vits = TTS(VITS_MODEL_NAME).to(device)
    print("VITS model loaded successfully.")
except Exception as e:
    print(f"Error loading VITS model: {e}")
    if device != "cuda":
        # The failed attempt already targeted the CPU, so repeating the same
        # load cannot succeed; surface the original error instead.
        raise
    print("Please ensure GPU is available and you have enough memory.")
    print("Falling back to CPU if GPU failed, but generation will be much slower.")
    tts_vits = TTS(VITS_MODEL_NAME).to("cpu")

# --- Step 4: Synthesize Speech ---
# Sample sentence kept as a module-level value.
text_to_synthesize = "Start, did it cut the voice or not?."
# Path of the WAV file that synthesized speech is written to.
output_audio_file = "vits_output_ljspeech.wav"




Loading VITS model for LJSpeech (this may take a moment)...
 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--vits


 95%|█████████▍| 138M/146M [00:01<00:00, 83.1MiB/s]

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
VITS model loaded successfully.


In [8]:
def TTS_input(text_to_synthesize, file_path=None):
    """Synthesize ``text_to_synthesize`` with the loaded VITS model and play it.

    The audio is written via ``tts_vits.tts_to_file`` and, if the file exists
    afterwards, rendered inline with ``display(Audio(...))``.

    Args:
        text_to_synthesize: Text to speak. Non-string, empty, or
            whitespace-only input is reported and skipped without calling
            the model.
        file_path: Destination of the generated audio file. When omitted, the
            module-level ``output_audio_file`` is used, so existing
            one-argument calls keep writing to the same place.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        raised during synthesis are caught and printed, not propagated.
    """
    if not isinstance(text_to_synthesize, str) or not text_to_synthesize.strip():
        print("No text provided; nothing to synthesize.")
        return

    # Resolve the default at call time so later changes to output_audio_file apply.
    target_path = output_audio_file if file_path is None else file_path

    print(f"\nGenerating speech for: '{text_to_synthesize}'")
    try:
        tts_vits.tts_to_file(
            text=text_to_synthesize,
            file_path=target_path,
            # No speaker_wav or speaker_id needed for this single-speaker model!
        )
        print(f"Speech saved to {target_path}")

        # --- Step 5: Play the Generated Audio ---
        if os.path.exists(target_path):
            print("\nPlaying generated audio:")
            display(Audio(target_path))
        else:
            print(f"Error: Output file {target_path} was not created.")
    except Exception as e:
        # Best-effort: report the failure and keep the notebook running.
        print(f"An error occurred during speech generation: {e}")

**Input text goes here:**

In [9]:
# Text to synthesize; edit this string to hear a different sentence.
phrase = "Start! Hello there! This is a very natural sounding voice generated by a VITS model. It's quite fast too."

# Generate the audio file for `phrase` and play it inline.
TTS_input(phrase)


Generating speech for: 'Start! Hello there! This is a very natural sounding voice generated by a VITS model. It's quite fast too.'
 > Text splitted to sentences.
['Start!', 'Hello there!', 'This is a very natural sounding voice generated by a VITS model.', "It's quite fast too."]
 > Processing time: 0.668487548828125
 > Real-time factor: 0.0729537063057301
Speech saved to vits_output_ljspeech.wav

Playing generated audio:
