# 🚀 General Environment Setup
This section handles the complete one-time setup. It installs all required dependencies, system libraries, and downloads the necessary models to run the project.
If you run this notebook locally, you can skip this section on subsequent runs, because everything stays installed (using a virtual environment is recommended).

In [1]:
#@title Setup Environment
import os

# Clone OpenVoice repository
!git clone -q https://github.com/myshell-ai/OpenVoice.git > /dev/null 2>&1
os.chdir('OpenVoice')

# Install dependencies
!pip install -q nltk flask pyngrok flask-cors waitress faster-whisper pydub torch torchvision torchaudio "google-generativeai" -e . "git+https://github.com/myshell-ai/MeloTTS.git" --extra-index-url https://download.pytorch.org/whl/cu118 > /dev/null 2>&1

# Download NLTK data
!python -m nltk.downloader -q punkt averaged_perceptron_tagger > /dev/null 2>&1
!python -m nltk.downloader -q averaged_perceptron_tagger_eng > /dev/null 2>&1


# Download and set up pre-trained models
!wget -q -O checkpoints_v2.zip https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip > /dev/null 2>&1
!unzip -q -o checkpoints_v2.zip -d checkpoints_v2 > /dev/null 2>&1
!mv /content/OpenVoice/checkpoints_v2/checkpoints_v2/* /content/OpenVoice/checkpoints_v2/ > /dev/null 2>&1
!rmdir /content/OpenVoice/checkpoints_v2/checkpoints_v2 > /dev/null 2>&1
!rm checkpoints_v2.zip > /dev/null 2>&1

# Download unidic dictionary for MeloTTS
!python -m unidic download > /dev/null 2>&1

print("✅ Done")

✅ Done


In [2]:
#@title Setup System Libraries (GPU bug correction)
import os

package_name = "libcudnn8"
required_version = "8.9.2.26-1+cuda11.8"

check_command = f"dpkg-query -W -f='{{Status}} {{Version}}' {package_name} 2>/dev/null | grep -q 'install ok installed {required_version}'"

if os.system(check_command) != 0:
    !apt-get update -qq > /dev/null
    !apt-get install -y -qq libcusparse-dev-11-8 libcublas-dev-11-8 libcudnn8={required_version} > /dev/null

os.environ['LD_LIBRARY_PATH'] = os.environ.get('LD_LIBRARY_PATH', '') + ':/usr/lib/x86_64-linux-gnu'

print("✅ System libraries OK")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
✅ System libraries OK


In [None]:
#@title Verify NumPy Version (Bug correction, Colab will restart)
import numpy as np
import os

required_version = "1.26.4"

if np.__version__ != required_version:
    !pip install numpy=={required_version} --quiet > /dev/null 2>&1
    os.kill(os.getpid(), 9)
else:
    print("✅ NumPy OK")

# ▶️ Run the Application
This is the final step. Execute the following cells to launch the main program. A public URL will be generated in the output below, allowing you to access the interface.

In [None]:
import os
import torch
import gc
import zipfile
from google.colab import files
import google.generativeai as genai
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from melo.api import TTS
from faster_whisper import WhisperModel
import nltk


# --- 1. UPLOAD & EXTRACT ASSETS ZIP ---
# The presence of the HTML client is used as the marker that the assets were
# already extracted, so re-running the cell does not trigger another upload.
if os.path.exists('/content/cliente_final.html'):
    print("✅ Assets already exist in /content/. Skipping ZIP upload.")
else:
    print("👇 Please upload your 'assets.zip' file.")
    print("   This ZIP should contain: cliente_final.html, reference.mp3, and all .vrm, .fbx, .hdr files.")
    uploaded_zip = files.upload()

    # Guard clause: nothing was uploaded, so there is nothing to extract.
    if not uploaded_zip:
        raise Exception("❌ No ZIP file was uploaded. Please run the cell again.")

    # Only the first uploaded entry is treated as the asset archive.
    zip_name = next(iter(uploaded_zip))
    print(f"\n✅ File '{zip_name}' uploaded. Extracting to /content/...")
    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall('/content/')
    # The archive is no longer needed once its contents are on disk.
    os.remove(zip_name)
    print("✅ Assets successfully extracted.")


# @title 🔑 API Key, AI Personality & Language Configuration
# @markdown Paste your API keys here. They are required for Google AI and Ngrok services.
GOOGLE_API_KEY = "" #@param {type:"string"}
NGROK_AUTHTOKEN = "" #@param {type:"string"}

# Prefer the value typed into the form; when a field is left blank fall back to the
# environment variable of the same name so secrets do not have to be saved in the notebook.
GOOGLE_API_KEY = GOOGLE_API_KEY.strip() or os.environ.get("GOOGLE_API_KEY", "").strip()
NGROK_AUTHTOKEN = NGROK_AUTHTOKEN.strip() or os.environ.get("NGROK_AUTHTOKEN", "").strip()


#  AI Personality & Language Configuration
# @markdown Define the AI's personality for each language and set the active audio language.
SYSTEM_PROMPT_ES = "Eres un asistente de IA conversacional. Tu personalidad es la de una chica de anime amigable y servicial. Responde siempre de forma breve y directa en espa\u00F1ol. Tus respuestas no deben exceder las dos frases. S\u00E9 muy concisa." #@param {type:"string"}
SYSTEM_PROMPT_EN = "You are a conversational AI assistant. Your personality is that of a friendly and helpful anime girl. Always respond briefly and directly in English. Your answers should not exceed two sentences. Be very concise." #@param {type:"string"}
TTS_LANGUAGE = "English" #@param ["Spanish", "English"]


# --- Select the active system prompt based on the chosen language ---
# Anything other than "English" selects the Spanish prompt.
ACTIVE_SYSTEM_PROMPT = SYSTEM_PROMPT_EN if TTS_LANGUAGE == "English" else SYSTEM_PROMPT_ES
print(f"✅ Active AI Personality Language: {TTS_LANGUAGE}")

# --- API Validation & Setup ---
# Fail fast: an empty key used to be reported as "configured" and only surfaced
# later as a Gemini error on the first chat request.
if not GOOGLE_API_KEY:
    raise ValueError("❌ GOOGLE_API_KEY is empty. Fill in the form field above or set the GOOGLE_API_KEY environment variable.")
genai.configure(api_key=GOOGLE_API_KEY)
print("\n✅ Google API Key configured.")

# --- 4. LOAD AI MODELS ---
print("\n🧠 Loading all AI models (this may take a moment)...")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"✅ Selected device: {device}")

# Gemini Model (now using the DYNAMIC system prompt)
print("🧠 Configuring Gemini...")
gemini_model = genai.GenerativeModel('gemini-2.5-flash', system_instruction=ACTIVE_SYSTEM_PROMPT)
chat = gemini_model.start_chat(history=[])
print("✅ Gemini model ready.")

# OpenVoice Model
print("🧠 Loading OpenVoice...")
# Change directory to ensure relative paths for models are correct
%cd /content/OpenVoice
ckpt_converter = '/content/OpenVoice/checkpoints_v2/converter'
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
print("✅ OpenVoice model loaded.")

# Faster Whisper Model
print("🧠 Loading Faster Whisper...")
compute_type = "float16" if "cuda" in device else "int8"
whisper_model = WhisperModel("medium", device=device.split(':')[0], compute_type=compute_type)
print("✅ Faster Whisper model loaded.")

# MeloTTS Model (using the configured language)
print(f"🧠 Loading MeloTTS in '{TTS_LANGUAGE}'...")
LANGUAGE_CODE_MAP = {
    "Spanish": "ES",
    "English": "EN"
}
selected_language = LANGUAGE_CODE_MAP[TTS_LANGUAGE]

melo_model = TTS(language=selected_language, device=device)
speaker_ids = melo_model.hps.data.spk2id
print("✅ MeloTTS model loaded.")

# --- 5. EXTRACT VOICE TIMBRE FROM REFERENCE FILE ---
print("\n🔊 Processing fixed voice sample from 'reference.mp3'...")
reference_speaker_path = "/content/reference.mp3"

# Guard clause: stop here when the reference recording is missing.
if not os.path.exists(reference_speaker_path):
    raise Exception(f"❌ File not found: '{reference_speaker_path}'. Ensure it was included in your ZIP file.")

# get_se returns two values; only the first is kept, as the target embedding.
target_se, _ = se_extractor.get_se(
    reference_speaker_path,
    tone_color_converter,
    target_dir='/content/OpenVoice/processed',
    vad=True,
)
print("✅ Voice timbre successfully extracted. System is ready to run!")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  WeightNorm.apply(module, name, dim)


✅ Assets already exist in /content/. Skipping ZIP upload.
✅ Active AI Personality Language: English

✅ Google API Key configured.

🧠 Loading all AI models (this may take a moment)...
✅ Selected device: cuda:0
🧠 Configuring Gemini...
✅ Gemini model ready.
🧠 Loading OpenVoice...
/content/OpenVoice
Loaded checkpoint '/content/OpenVoice/checkpoints_v2/converter/checkpoint.pth'
missing/unexpected keys: [] []
✅ OpenVoice model loaded.
🧠 Loading Faster Whisper...
✅ Faster Whisper model loaded.
🧠 Loading MeloTTS in 'English'...
✅ MeloTTS model loaded.

🔊 Processing fixed voice sample from 'reference.mp3'...
OpenVoice version: v2
[(1.582, 3.026), (10.03, 16.082), (17.006, 22.29)]
after vad: dur = 12.78
✅ Voice timbre successfully extracted. System is ready to run!


In [None]:
#@title ▶️ Run Web Server & Application


#@markdown ### ⚙️ Server & Ngrok Configuration
#@markdown Choose the server region closest to you for the best performance.
# Display name of the ngrok region. It is used as a key into REGION_MAP later in
# this cell, so every option listed here must also be a key of that dict.
SERVER_REGION = "South America (Sao Paulo)" #@param ["United States (Ohio)", "Europe (Frankfurt)", "Asia/Pacific (Singapore)", "Australia (Sydney)", "South America (Sao Paulo)", "India (Mumbai)", "Japan (Tokyo)"]

import os
import uuid
import traceback
import io
import gc
import random
from flask import Flask, request, jsonify, send_from_directory
from pyngrok import ngrok, conf
from flask_cors import CORS
from waitress import serve
from pydub import AudioSegment

# --- 1. SERVER CONFIGURATION ---
print("\n🚀 Configuring Flask server...")

# Form label -> ngrok region code.
REGION_MAP = dict([
    ("United States (Ohio)", "us"),
    ("Europe (Frankfurt)", "eu"),
    ("Asia/Pacific (Singapore)", "ap"),
    ("Australia (Sydney)", "au"),
    ("South America (Sao Paulo)", "sa"),
    ("India (Mumbai)", "in"),
    ("Japan (Tokyo)", "jp"),
])
selected_region_code = REGION_MAP[SERVER_REGION]

# Apply the user's token and region to pyngrok's default configuration object.
ngrok_config = conf.get_default()
ngrok_config.auth_token = NGROK_AUTHTOKEN
ngrok_config.region = selected_region_code
print(f"✅ Ngrok region set to: {SERVER_REGION} ({selected_region_code})")

# Directory that holds the client page and assets and receives generated audio.
ROOT_DIR = '/content/'
app = Flask(__name__)
# CORS is opened to every origin on every route.
CORS(app, resources={r"/*": {"origins": "*"}})
# Animation files the endpoint picks from at random: anim_1.fbx .. anim_3.fbx
ANIMATION_FILES = list(map('anim_{}.fbx'.format, range(1, 4)))
print("✅ Flask server initialized with CORS.")

# --- 2. AI PROCESSING FUNCTIONS ---
def reason_with_gemini(user_text):
    """Send the user's text to the shared Gemini chat session and return its reply.

    Args:
        user_text: The text to forward to the module-level ``chat`` session.

    Returns:
        The ``text`` of the reply on success. If sending the message or reading
        the reply raises, the error is logged and a short apology is returned
        instead, written in the language selected by ``TTS_LANGUAGE`` so that the
        configured TTS voice is not asked to read text in the wrong language.
    """
    print(f"🧠 Sending to Gemini: '{user_text}'")
    try:
        response = chat.send_message(user_text)
        return response.text
    except Exception as e:
        # Best effort: the caller still gets something to synthesise.
        print(f"🚨 Gemini API Error: {e}")
        if TTS_LANGUAGE == "English":
            return "Sorry, I encountered a problem while generating my response."
        return "Lo siento, tuve un problema al generar mi respuesta."

def generate_cloned_audio(text, output_filename, request_id):
    """Synthesise ``text`` with MeloTTS and convert it to the target voice with OpenVoice.

    Args:
        text: Text to speak.
        output_filename: File name (relative to ``ROOT_DIR``) for the final audio.
        request_id: Identifier used to name the intermediate MeloTTS file.

    Returns:
        ``output_filename``, once the converted audio has been written.
        Exceptions from loading, synthesis or conversion propagate to the caller.
    """
    # language code -> (MeloTTS speaker key, base-speaker embedding file stem);
    # any language other than 'ES' uses the US-English voice.
    voices = {'ES': ('ES', 'es')}
    speaker_id_key, embedding_file = voices.get(selected_language, ('EN-US', 'en-us'))

    print(f"   - Using voice: {speaker_id_key}")

    melo_wav_path = os.path.join(ROOT_DIR, f"{request_id}_tmp_melo.wav")
    final_wav_path = os.path.join(ROOT_DIR, output_filename)
    embedding_path = f'/content/OpenVoice/checkpoints_v2/base_speakers/ses/{embedding_file}.pth'

    try:
        # 1) embedding of the base (source) speaker
        source_se = torch.load(embedding_path, map_location=device)
        # 2) plain MeloTTS rendering into the intermediate file
        melo_model.tts_to_file(text, speaker_ids[speaker_id_key], melo_wav_path, speed=1.0)
        # 3) tone-colour conversion from the base speaker to the target speaker
        tone_color_converter.convert(
            audio_src_path=melo_wav_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=final_wav_path,
            message="@MyShell",
        )
        print(f"🔊 Audio generated and saved to: {final_wav_path}")
        return output_filename
    finally:
        # The intermediate file is removed whether or not the steps above succeeded.
        if os.path.exists(melo_wav_path):
            os.remove(melo_wav_path)

# --- 3. API ROUTES (ENDPOINTS) ---
@app.route('/')
def index():
    """Return the HTML client page stored in ``ROOT_DIR``."""
    client_page = 'cliente_final.html'
    return send_from_directory(ROOT_DIR, client_page)

@app.route('/assets/<path:filename>')
def serve_asset(filename):
    """Return the file ``filename`` from ``ROOT_DIR``.

    Files ending in ``.wav`` are sent with an explicit ``audio/wav`` MIME type;
    for everything else the MIME type is left to ``send_from_directory``.
    """
    # NOTE(review): any path under ROOT_DIR can be requested through this route,
    # not only the uploaded assets - confirm that this exposure is intended.
    mime_type = None
    if filename.endswith('.wav'):
        mime_type = 'audio/wav'
    return send_from_directory(ROOT_DIR, filename, mimetype=mime_type)

@app.route('/process_audio', methods=['POST'])
def process_audio_endpoint():
    """Transcribe the uploaded audio, ask Gemini for a reply and synthesise that reply.

    Expects a multipart POST with the recording in the ``audio`` file field.

    Returns:
        On success, JSON with ``audio_file`` (name of the generated WAV inside
        ``ROOT_DIR``) and ``animation_file`` (a random entry of ``ANIMATION_FILES``).
        A 400 JSON error when the ``audio`` field is missing or no text was
        transcribed, and a 500 JSON error for any other failure.
    """
    request_id = str(uuid.uuid4())
    print(f"\n🎤 Request {request_id} received...")

    try:
        if 'audio' not in request.files:
            return jsonify({"error": "No audio file found in the request"}), 400

        # --- IN-MEMORY PROCESSING ---
        audio_data = request.files['audio'].read()
        print(f"   - Step 1: Audio received in memory ({len(audio_data)} bytes).")

        sound = AudioSegment.from_file(io.BytesIO(audio_data))
        wav_buffer = io.BytesIO()
        sound.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        print("   - Step 2: Converted to WAV format in memory.")

        # --- TEMPORARY FILE FOR WHISPER ---
        input_path = os.path.join(ROOT_DIR, f"{request_id}_input.wav")
        try:
            with open(input_path, "wb") as f:
                f.write(wav_buffer.getvalue())
            print(f"   - Step 3: Temp WAV file saved for processing: {input_path}")

            # --- DYNAMIC LANGUAGE CONFIGURATION FOR WHISPER ---
            whisper_language_code = "en" if TTS_LANGUAGE == "English" else "es"
            print(f"   - Whisper language set to: '{whisper_language_code}'")

            # --- TRANSCRIPTION WITH FASTER-WHISPER  ---
            segments, _ = whisper_model.transcribe(
                input_path,                   # The audio file to process.
                beam_size=5,                  # Improves transcription accuracy.
                language=whisper_language_code, # Sets the language ('en', 'es', etc.).
                vad_filter=True               # Removes periods of silence/noise.
            )

            # The segments are consumed here, inside the try, so the audio file is
            # guaranteed to still exist while they are being read.
            transcribed_text = "".join(seg.text for seg in segments).strip()
            print(f"   - Step 4: Transcribed text: '{transcribed_text}'")
        finally:
            # --- CLEANUP INPUT FILE ---
            # Runs on failure too: previously an exception while writing or
            # transcribing left the temporary WAV behind in ROOT_DIR.
            if os.path.exists(input_path):
                os.remove(input_path)

        if not transcribed_text:
            return jsonify({"error": "Could not detect any text in the audio."}), 400

        # --- GENERATE RESPONSE ---
        response_text = reason_with_gemini(transcribed_text)
        output_filename = f"{request_id}_response.wav"
        generated_file = generate_cloned_audio(
            response_text, output_filename, request_id
        )
        if generated_file is None:
            raise Exception("Failed to generate the final audio file.")

        response_data = {
            "audio_file": generated_file,
            "animation_file": random.choice(ANIMATION_FILES)
        }
        print(f"✅ Sending response: {response_data}")
        return jsonify(response_data)

    except Exception as e:
        # Details go to the notebook log only; the client gets a generic message.
        print(f"🔥 Endpoint Error: {e}")
        traceback.print_exc()
        return jsonify({"error": "An internal server error occurred"}), 500

    finally:
        # --- MEMORY RELEASE ---
        print(f"🧹 Clearing memory for request {request_id}")
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# --- 4. WARM-UP CALL (TO PRE-COMPILE MODELS) ---
print("\n" + "="*50)
print("🔥 Performing a warm-up call to pre-compile AI models...")
# The warm-up sentence follows the configured language.
if TTS_LANGUAGE == "English":
    warmup_text = "Initializing systems."
else:
    warmup_text = "Inicializando sistemas."
print(f"   - Using warm-up text: '{warmup_text}'")
try:
    # Placeholder names for the throw-away warm-up request.
    warmup_output_file = "warmup_response.wav"
    warmup_request_id = "warmup_request"

    # Run the same generation function that real requests use.
    generated_file = generate_cloned_audio(warmup_text, warmup_output_file, warmup_request_id)

    # Discard the audio produced by the warm-up, if any.
    if generated_file:
        warmup_path = os.path.join(ROOT_DIR, generated_file)
        if os.path.exists(warmup_path):
            os.remove(warmup_path)
            print("   - Warm-up file cleaned up.")
    print("✅ Models are now warmed up and ready for real-time requests.")
except Exception as e:
    # A failed warm-up is not fatal; it is only reported.
    print(f"⚠️ Warning: Warm-up call failed, the first user request might be slow. Error: {e}")

# --- 5. START SERVER ---
# Single source of truth for the port shared by the ngrok tunnel and Waitress.
SERVER_PORT = 5000

print("\n" + "="*50)
print("🚇 Starting ngrok tunnel...")
# Kill any existing ngrok tunnels before starting a new one
ngrok.kill()
public_url = ngrok.connect(SERVER_PORT)
print("✅ SERVER IS READY AND RUNNING")
# ngrok.connect() returns a tunnel object whose repr is
# 'NgrokTunnel: "<url>" -> "<local>"'; print only its URL so the line can be
# copied directly. The getattr fallback covers a return value that is already a plain string.
print(f"🔗 Public URL: {getattr(public_url, 'public_url', public_url)}")
print("   Open this URL in your browser!")
print("="*50)
print("\n⚙️  Starting production server with Waitress...")
# Blocks this cell for as long as the server is running.
serve(app, host='0.0.0.0', port=SERVER_PORT)


🚀 Configuring Flask server...
✅ Ngrok region set to: South America (Sao Paulo) (sa)
✅ Flask server initialized with CORS.

🔥 Performing a warm-up call to pre-compile AI models...
   - Using warm-up text: 'Initializing systems.'
   - Using voice: EN-US
 > Text split to sentences.
Initializing systems.




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1/1 [00:12<00:00, 12.99s/it]


🔊 Audio generated and saved to: /content/warmup_response.wav
   - Warm-up file cleaned up.
✅ Models are now warmed up and ready for real-time requests.

🚇 Starting ngrok tunnel...
✅ SERVER IS READY AND RUNNING
🔗 Public URL: NgrokTunnel: "https://1c23886c7fe7.ngrok-free.app" -> "http://localhost:5000"
   Open this URL in your browser!

⚙️  Starting production server with Waitress...

🎤 Request d2feda9c-f19c-435c-981c-004136966ee9 received...
   - Step 1: Audio received in memory (66815 bytes).
   - Step 2: Converted to WAV format in memory.
   - Step 3: Temp WAV file saved for processing: /content/d2feda9c-f19c-435c-981c-004136966ee9_input.wav
   - Whisper language set to: 'en'
   - Step 4: Transcribed text: 'Good morning, Vietnam!'
🧠 Sending to Gemini: 'Good morning, Vietnam!'
   - Using voice: EN-US
 > Text split to sentences.
Good morning to you too! Have a nice day!


100%|██████████| 1/1 [00:00<00:00,  5.33it/s]


🔊 Audio generated and saved to: /content/d2feda9c-f19c-435c-981c-004136966ee9_response.wav
✅ Sending response: {'audio_file': 'd2feda9c-f19c-435c-981c-004136966ee9_response.wav', 'animation_file': 'anim_3.fbx'}
🧹 Clearing memory for request d2feda9c-f19c-435c-981c-004136966ee9

🎤 Request 54c9c4ca-5e8f-43de-baf1-677e11bf5a59 received...
   - Step 1: Audio received in memory (89033 bytes).
   - Step 2: Converted to WAV format in memory.
   - Step 3: Temp WAV file saved for processing: /content/54c9c4ca-5e8f-43de-baf1-677e11bf5a59_input.wav
   - Whisper language set to: 'en'
   - Step 4: Transcribed text: 'Well, well seems like this works'
🧠 Sending to Gemini: 'Well, well seems like this works'
   - Using voice: EN-US
 > Text split to sentences.
Great! Is there anything I can help you with today?


100%|██████████| 1/1 [00:00<00:00,  6.86it/s]


🔊 Audio generated and saved to: /content/54c9c4ca-5e8f-43de-baf1-677e11bf5a59_response.wav
✅ Sending response: {'audio_file': '54c9c4ca-5e8f-43de-baf1-677e11bf5a59_response.wav', 'animation_file': 'anim_3.fbx'}
🧹 Clearing memory for request 54c9c4ca-5e8f-43de-baf1-677e11bf5a59
