<a href="https://colab.research.google.com/github/kxlki/Text_To_Speech-Website/blob/main/Aiml_Project_Text_to_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
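# 🚀 STEP 1: CLEAN INSTALL — run this first on a fresh runtime (or after a forced runtime restart).
# NumPy is pinned to 1.23.5 before the other packages are installed; pip may report dependency conflicts with packages already present in the runtime.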
!pip uninstall -y numpy TTS elevenlabs gradio pydub python-dotenv -q
!pip install numpy==1.23.5 --no-cache-dir --force-reinstall -q
!pip install elevenlabs gradio pydub python-dotenv TTS -q
!apt install -y ffmpeg -qq



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m157.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xarray 2025.1.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
albumentations 2.0.5 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
chex 0.1.89 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
bigframes 1.38.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
langchain 0.3.19 requires numpy<2,>=1.26.4; python_version < "3.12", but you have numpy 1.23.5 which is incompatible.
blosc2 3.2.0 requires numpy>=1.26, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but y

In [None]:
# 🚀 STEP 2: MAIN TTS APP CODE

import os
import gradio as gr
from pydub import AudioSegment
from elevenlabs.client import ElevenLabs
from TTS.api import TTS

# 🔐 ELEVENLABS API KEY
# SECURITY: never hard-code the key in the notebook. A key committed to a
# repository must be treated as leaked and revoked. The key is taken from the
# ELEVEN_API_KEY environment variable; when that is missing, it is requested
# interactively without being echoed into the cell output.
from getpass import getpass

if not os.environ.get("ELEVEN_API_KEY"):
    os.environ["ELEVEN_API_KEY"] = getpass("Enter your ElevenLabs API key: ").strip()
if not os.environ["ELEVEN_API_KEY"]:
    raise RuntimeError("ELEVEN_API_KEY is not set; an ElevenLabs API key is required.")

client = ElevenLabs(api_key=os.environ["ELEVEN_API_KEY"])
# Tacotron 2 model used as the second engine; gpu=False keeps it on the CPU.
tacotron = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# 🔊 Get voices
# Fetched once at start-up; `voices` and `voice_names` are reused by the
# voice lookup helper and by the voice dropdown of the UI.
voices = client.voices.get_all().voices
voice_names = [v.name for v in voices]
# Choices for the accent dropdown; "None" means the text is left unchanged.
accents = ["None", "British", "American", "Indian", "Australian"]

def get_voice_id(name):
    """Return the voice ID of the first entry in ``voices`` named ``name``.

    When no voice carries that name, the ID of the first voice in the
    module-level ``voices`` list is returned instead.
    """
    try:
        # First match wins, in the order of the module-level ``voices`` list.
        return next(voice.voice_id for voice in voices if voice.name == name)
    except StopIteration:
        # No voice with that name: fall back to the first available voice.
        return voices[0].voice_id

def apply_accent_prompt(text, accent):
    """Prefix ``text`` with an accent hint, unless ``accent`` is ``"None"``.

    NOTE(review): the result is used as the text to synthesize, so the
    prefix is presumably spoken aloud as well — confirm this is intended.
    """
    if accent == "None":
        return text
    return f"This is a {accent} accent: {text}"

def change_speed(audio_path, speed=1.0):
    """Re-time an audio file by a speed factor and export the result as MP3.

    The raw samples are re-labelled with a frame rate of
    ``frame_rate * speed`` and then converted back to the original frame
    rate before export.
    NOTE(review): this technique presumably shifts pitch together with
    tempo — confirm that is acceptable.

    Args:
        audio_path: Path of the audio file to load.
        speed: Playback-rate multiplier; must be greater than 0.

    Returns:
        The path of the exported file. It is always ``"output_speed.mp3"``,
        so every call overwrites the previous result.

    Raises:
        ValueError: If ``speed`` is not a positive number, or is so small
            that the derived frame rate would be zero.
    """
    # `not speed > 0` (rather than `speed <= 0`) also rejects NaN.
    if not speed > 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    sound = AudioSegment.from_file(audio_path)
    new_frame_rate = int(sound.frame_rate * speed)
    if new_frame_rate <= 0:
        raise ValueError(f"speed {speed!r} is too small for this audio file")

    # Same samples, different declared frame rate, then back to the
    # original rate so the exported file keeps a standard sample rate.
    altered = sound._spawn(sound.raw_data, overrides={
        "frame_rate": new_frame_rate
    }).set_frame_rate(sound.frame_rate)
    output = "output_speed.mp3"
    altered.export(output, format="mp3")
    return output

def tts_engine(engine, text, voice_name, stability, similarity, speed, accent):
    """Synthesize ``text`` with the selected engine and return the audio path.

    Args:
        engine: ``"ElevenLabs"`` selects the ElevenLabs API; any other value
            selects the local Tacotron 2 model.
        text: The text to speak. Must contain non-whitespace characters.
        voice_name: ElevenLabs voice name (ignored by Tacotron 2).
        stability: ElevenLabs ``stability`` voice setting.
        similarity: ElevenLabs ``similarity_boost`` voice setting.
        speed: Playback-rate multiplier; ``1.0`` leaves the audio untouched.
        accent: Accent hint applied to the ElevenLabs text; ``"None"``
            disables it (ignored by Tacotron 2).

    Returns:
        Path of the generated audio file: ``"output_eleven.mp3"`` or
        ``"output_taco.wav"``, or the path returned by ``change_speed`` when
        ``speed`` differs from ``1.0``.

    Raises:
        gr.Error: If ``text`` is empty or only whitespace, so the UI shows a
            readable message instead of an engine failure.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    if engine == "ElevenLabs":
        voice_id = get_voice_id(voice_name)
        prompt = apply_accent_prompt(text, accent)
        audio = client.text_to_speech.convert(
            voice_id=voice_id,
            text=prompt,
            model_id="eleven_multilingual_v2",
            voice_settings={
                "stability": stability,
                "similarity_boost": similarity
            }
        )
        out_file = "output_eleven.mp3"
        # The response is iterated chunk by chunk and written straight to disk.
        with open(out_file, "wb") as f:
            for chunk in audio:
                f.write(chunk)
    else:
        out_file = "output_taco.wav"
        tacotron.tts_to_file(text=text, file_path=out_file)

    # Skip the re-encode entirely when no speed change was requested.
    if speed != 1.0:
        return change_speed(out_file, speed)
    return out_file

# 🎛️ Gradio Interface
# The inputs are listed in the same order as the parameters of `tts_engine`,
# because Gradio passes them positionally.
iface = gr.Interface(
    fn=tts_engine,
    inputs=[
        gr.Radio(["ElevenLabs", "Tacotron 2"], label="TTS Engine", value="ElevenLabs"),
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(
            choices=voice_names,
            label="Voice (ElevenLabs)",
            # Guarded so the app still launches when no voices were returned;
            # indexing an empty list here would raise IndexError.
            value=voice_names[0] if voice_names else None,
        ),
        gr.Slider(0, 1, value=0.5, step=0.1, label="Stability (ElevenLabs)"),
        gr.Slider(0, 1, value=0.5, step=0.1, label="Similarity Boost (ElevenLabs)"),
        gr.Slider(0.5, 1.5, value=1.0, step=0.1, label="Speed"),
        gr.Dropdown(choices=accents, label="Accent (Simulated)", value="None")
    ],
    outputs=gr.Audio(type="filepath", label="🎧 Output Audio"),
    title="🎙️ Text-to-Speech App: ElevenLabs + Tacotron 2",
    description="Switch between high-quality TTS engines. Adjust voice, speed, clarity, accent."
)

iface.launch()


 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--en--ljspeech--hifigan_v2
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linea

  return torch.load(f, map_location=map_location, **kwargs)


 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Generator Model: hifigan_generator
 > Discriminator Model: hifigan_discriminator
Removing weight norm...
Running Gradio in a Colab notebook requires sharing enabled. Auto

