# Qwen3-TTS Voice Clone Demo

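This notebook demonstrates how to run Qwen3-TTS voice cloning end to end: it loads the 1.7B base model (one instance per GPU, up to two), scrapes blog posts, synthesizes each post in a cloned reference voice, and uploads the resulting MP3 and WebVTT caption files to S3-compatible storage.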

In [None]:
# 1. Install Dependencies
# Install system dependencies first (fixes 'sox: not found' errors)
!sudo apt-get update && sudo apt-get install -y sox libsox-dev ffmpeg

!pip install -U qwen-tts
# flash-attn is recommended for performance
!pip install -U flash-attn --no-build-isolation
!pip install pyngrok
!pip install modelscope
!pip install boto3 requests beautifulsoup4 pysbd


In [None]:
# 2. Imports
import torch
import soundfile as sf
from IPython.display import Audio
from qwen_tts import Qwen3TTSModel
import os
import threading
import time

In [None]:
# 3. Load Model (Voice Clone Base 1.7B) - Dual GPU Support
import torch
from qwen_tts import Qwen3TTSModel

# Checkpoint shared by every model instance in the pool.
MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
# Devices used in dual-GPU mode: one full model copy per GPU.
DUAL_GPU_DEVICES = ("cuda:0", "cuda:1")


def _load_tts_model(device):
    """Load one Qwen3-TTS voice-clone model onto `device` (e.g. "cuda:0")."""
    return Qwen3TTSModel.from_pretrained(
        MODEL_ID,
        device_map=device,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )


# Pool of loaded model instances; the generation cell hands these out to
# worker threads, one segment per model at a time.
models_pool = []
gpu_count = torch.cuda.device_count()

print(f"Detected {gpu_count} GPU(s).")

if gpu_count < 1:
    # Fail fast with a clear message instead of announcing "Single-GPU Mode"
    # and then failing inside the loader when cuda:0 does not exist.
    raise RuntimeError("No CUDA GPU detected; this notebook loads the model on cuda:0.")

if gpu_count >= 2:
    print("🚀 Dual-GPU Mode Activated!")
    for gpu_index, device in enumerate(DUAL_GPU_DEVICES):
        print(f"Loading Model {gpu_index} on {device} ...")
        models_pool.append(_load_tts_model(device))
    # Keep the per-GPU names in case another cell refers to them directly.
    model_0, model_1 = models_pool
else:
    print("Single-GPU Mode.")
    print("Loading Model on cuda:0 ...")
    models_pool.append(_load_tts_model("cuda:0"))

# For backward compatibility if a single model is referenced.
model = models_pool[0]

print(f"✅ Loaded {len(models_pool)} model instance(s).")


In [None]:
# 5. Batch Blog Scraper
import requests
from bs4 import BeautifulSoup
import re
import pysbd
from urllib.parse import urljoin

# --- BATCH CONFIGURATION ---
# Archive index page passed to get_post_links() to discover post URLs.
# NOTE(review): no cell visible in this notebook reads these three constants or
# builds the `batch_queue` that the generation cell expects — confirm where the
# queue is assembled and how the offset/limit are applied.
ARCHIVES_URL = "https://www.hung-truong.com/blog/archives/"
POST_OFFSET = 0    # Skip the first N posts
BATCH_LIMIT = 5    # Max posts to process in one run

def get_post_links(archives_url, timeout=30):
    """Collect the unique blog-post URLs linked from an archives page.

    A link counts as a post when its href contains a dated path of the form
    ``/blog/YYYY/MM/DD/<slug>``. Relative hrefs are resolved against
    ``archives_url`` itself, so the function is not tied to one site.

    Args:
        archives_url: URL of the archive index page to scan.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled server would hang the notebook indefinitely.

    Returns:
        Absolute post URLs in page order with duplicates removed. Returns an
        empty list (after printing the error) if the page cannot be fetched
        or parsed.
    """
    post_href = re.compile(r'.*/blog/\d{4}/\d{2}/\d{2}/.+')
    try:
        response = requests.get(archives_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        seen = set()  # O(1) duplicate check; `links` preserves page order
        for a in soup.find_all('a', href=True):
            href = a['href']
            if not post_href.match(href):
                continue
            # urljoin leaves absolute hrefs untouched and resolves relative ones.
            full_url = urljoin(archives_url, href)
            if full_url not in seen:
                seen.add(full_url)
                links.append(full_url)
        return links
    except Exception as e:
        print(f"Error fetching archives: {e}")
        return []

def _merge_short_sentences(sentences, min_words=3):
    """Merge adjacent sentences so that every chunk has at least `min_words` words.

    Sentences are accumulated left to right until the running chunk is long
    enough. A too-short remainder at the end is appended to the previous chunk,
    or kept on its own if there is no previous chunk.
    """
    chunks = []
    pending = ""
    for sent in sentences:
        sent = sent.strip()
        if not sent:
            continue
        pending = f"{pending} {sent}" if pending else sent
        if len(pending.split()) >= min_words:
            chunks.append(pending)
            pending = ""
    if pending:
        if chunks:
            chunks[-1] += " " + pending
        else:
            chunks.append(pending)
    return chunks


def parse_post_content(url, timeout=30):
    """Fetch a blog post and split it into ordered text segments.

    Args:
        url: Absolute URL of the post.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with keys 'url', 'slug' (last path component of the URL),
        'title' (text of ``h1.postTitle`` or None) and 'segments', a list of
        ``{'text': ..., 'type': ...}`` dicts where type is 'header',
        'sentence' or 'paragraph_end'.
        Returns None when the page already has a ``div.audio-header``, has no
        ``div.content``, or when fetching/parsing raises (the error is printed).
    """
    print(f"Checking {url}...")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup.find('div', class_='audio-header'):
            print(f"⏩ Skipping {url} (Audio header found)")
            return None
        content_div = soup.find('div', class_='content')
        if not content_div:
            return None

        slug = url.rstrip('/').split('/')[-1]
        post_data = {'url': url, 'slug': slug, 'title': None, 'segments': []}
        seg = pysbd.Segmenter(language="en", clean=False)

        # Captions and code blocks are removed from the tree before text extraction.
        for unwanted in content_div.find_all(['figcaption', 'pre']):
            unwanted.decompose()

        title_tag = soup.find('h1', class_='postTitle')
        if title_tag:
            post_data['title'] = title_tag.get_text().strip()
            post_data['segments'].append({'text': post_data['title'], 'type': 'header'})

        meta_tag = soup.find('p', class_='meta')
        if meta_tag:
            meta_text = meta_tag.get_text().strip()
            # Only the part before the first '|' is used as the date.
            date_text = meta_text.split('|')[0].strip() if "|" in meta_text else meta_text
            post_data['segments'].append({'text': f"Published on {date_text}.", 'type': 'paragraph_end'})

        tags_to_find = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote']
        all_elements = content_div.find_all(tags_to_find)
        # Keep only outermost matches so nested text (e.g. a <p> inside a
        # <blockquote>) is not emitted twice. Membership is checked by object
        # identity: `parent in all_elements` would compare tags structurally
        # and rescan the whole list for every ancestor.
        candidate_ids = {id(el) for el in all_elements}
        elements = [
            el for el in all_elements
            if not any(id(parent) in candidate_ids for parent in el.parents)
        ]

        for el in elements:
            if 'meta' in el.get('class', []):
                continue
            text = el.get_text(separator=' ', strip=True)
            text = re.sub(r'\s+', ' ', text)
            if not text:
                continue

            if el.name.startswith('h'):
                # The comment-form heading marks the end of the article body.
                if "Leave a Comment" in text:
                    break
                # The title was already added from h1.postTitle above.
                if post_data['title'] and text == post_data['title']:
                    continue
                post_data['segments'].append({'text': text, 'type': 'header'})
            else:
                chunks = _merge_short_sentences(seg.segment(text))
                for i, chunk in enumerate(chunks):
                    t = 'paragraph_end' if i == len(chunks) - 1 else 'sentence'
                    post_data['segments'].append({'text': chunk, 'type': t})
        return post_data
    except Exception as e:
        print(f"❌ Error parsing {url}: {e}")
        return None


In [None]:
from kaggle_secrets import UserSecretsClient
import requests
import boto3
import os
import numpy as np
import soundfile as sf
import subprocess
from IPython.display import Audio
from datetime import datetime
import time
import concurrent.futures
import threading
import queue

# --- SECRETS & CONFIG ---
# Credentials are read from Kaggle user secrets so that nothing sensitive is
# stored in the notebook itself.
user_secrets = UserSecretsClient()
S3_ACCESS_KEY = user_secrets.get_secret("S3_ACCESS_KEY")
S3_BUCKET_NAME = user_secrets.get_secret("S3_BUCKET_NAME")
S3_ENDPOINT_URL = user_secrets.get_secret("S3_ENDPOINT_URL")
S3_SECRET_KEY = user_secrets.get_secret("S3_SECRET_KEY")

# Repository that receives the "audio-ready" dispatch after each upload.
GITHUB_REPO = "hungtruong/jekyll-blog"
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
# Base URL prepended to uploaded file names in the dispatch payload.
PUBLIC_URL_BASE = "https://pub-2289fc0aae4245debaa2fd741bdf5605.r2.dev/blogaudio/"

# Check Models: generation needs the pool created by the model-loading cell.
if 'models_pool' not in locals() or not models_pool:
    raise ValueError("❌ No models found! Please run Cell 3 first.")

print(f"⚡ Using {len(models_pool)} model instance(s) for generation.")

# Helper for VTT Time
def format_vtt_time(seconds):
    """Format a duration in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Args:
        seconds: Non-negative offset in seconds (int or float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    # Round once to whole milliseconds and split with integer arithmetic.
    # Formatting the float seconds field directly lets 59.9996 round up to
    # "60.000", producing an invalid timestamp such as "00:00:60.000".
    total_ms = max(0, int(round(seconds * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

def get_audio_duration(filename):
    """Return the duration of an audio file in seconds, as reported by ffprobe.

    Args:
        filename: Path of the media file to inspect.

    Returns:
        The duration as a float, or 0.0 if ffprobe cannot be run or its
        output cannot be parsed (the reason is printed).
    """
    command = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        filename,
    ]
    try:
        # stderr is captured separately: if it were merged into stdout, any
        # diagnostic line from ffprobe would corrupt the number being parsed.
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
    except Exception as e:
        print(f"Error checking duration: {e}")
        return 0.0

    try:
        return float(result.stdout.strip())
    except ValueError:
        detail = (result.stderr or "").strip()
        if detail:
            print(f"Error checking duration: {detail}")
        return 0.0

# Reference Audio
# Voice sample to clone and its transcript; both are passed to
# generate_voice_clone() as ref_audio / ref_text. The transcript's apostrophes
# and quotation marks were mis-decoded (e.g. "don‚Äôt"); they are restored here
# so the text matches what is spoken in the recording.
local_ref_audio = "/kaggle/input/voice-cloning-dataset/longblog2.wav"
ref_text_content = "I was at Whole Foods today getting some groceries when I came across this mini food testing area at the end of an aisle. There were two nice sales people (one lady and one dude) who were hawking cereal. The type of cereal was super organic and it came in a pouch. The lady bragged that all of the ingredients were on the front of the bag in large type. The cereal was available for testing in cereal form, baked into a cookie, and blended into a smoothie (which was apparently made with apple cider and yogurt or something). Sidenote: While I was deciding what to taste test (I eventually went with the smoothie and it was not bad, and followed up with a chunk of cookie), an old Asian lady walked up to me and started talking in Chinese. I tried to tell her that I don’t really speak Chinese, but I forgot how to say “I don’t know Chinese” in Chinese. It’s kind of absurd, anyway, to say you don’t speak a language in that very language you’re saying you don't speak. Anyway, she mumbled some more stuff and then said “Chinese.” Like, yeah, lady, we're both Chinese. I guess she walked away after that. So anyway, here's the real part of the story. I'm tasting the cookie and am about to leave when another woman walks up to the food tasting area. The sales guy asks if she wants to buy some cereal and she's like “oh, I already have some at home! I love it! I'm just going to have some samples.”"

# --- WORKER FUNCTION ---
def generate_segment(task):
    """
    Synthesize one text segment using a model borrowed from `model_queue`.

    Args:
        task: tuple of (index, text, item_type).

    Returns:
        (index, audio_array, silence_array, sr, text). The silence array is
        the pause that follows the segment: 1.5 s after a 'header', 1.0 s
        after a 'paragraph_end', 0.5 s otherwise. If anything fails, the
        error is printed and audio_array and silence_array are None, with
        24000 as a placeholder sample rate.
    """
    idx, text, item_type = task

    try:
        # Blocks until one of the pooled models is free.
        borrowed_model = model_queue.get()
        try:
            wavs, sr = borrowed_model.generate_voice_clone(
                text=text,
                language="English",
                ref_audio=local_ref_audio,
                ref_text=ref_text_content,
            )
            speech = wavs[0]

            # Length of the trailing pause depends on the segment type.
            pause_seconds = (
                1.5 if item_type == 'header'
                else 1.0 if item_type == 'paragraph_end'
                else 0.5
            )
            pause = np.zeros(int(pause_seconds * sr), dtype=np.float32)

            return (idx, speech, pause, sr, text)
        finally:
            # Hand the model back whether or not generation succeeded.
            model_queue.put(borrowed_model)
    except Exception as e:
        print(f"‚ùå Error in segment {idx}: {e}")
        return (idx, None, None, 24000, text)  # placeholder on failure

# --- BATCH EXECUTION ---
# For every post in `batch_queue`: synthesize all segments in parallel (one
# worker thread per loaded model), stitch them into one track with matching
# WebVTT cues, encode to MP3, upload both files to S3 and notify GitHub.
# Each post is a dict with at least 'slug' and 'segments'.
# NOTE(review): no cell visible in this notebook builds `batch_queue` — confirm
# it is populated before this cell runs, otherwise nothing is generated.
if 'batch_queue' not in locals() or not batch_queue:
    print("⚠️ No posts in batch_queue.")
elif S3_ACCESS_KEY == "YOUR_ACCESS_KEY":
    print("⚠️ PLEASE SET YOUR S3 CONFIGURATIONS IN SECRETS ⚠️")
else:
    print(f"🚀 Starting Dual-GPU Batch Generation for {len(batch_queue)} posts...")

    # Init Model Queue: workers borrow a model and put it back when done, so
    # each model instance handles one segment at a time.
    model_queue = queue.Queue()
    for m in models_pool:
        model_queue.put(m)

    # Init S3 Client
    try:
        s3 = boto3.client(
            's3',
            endpoint_url=S3_ENDPOINT_URL,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY
        )
    except Exception as e:
        print(f"❌ Failed to init S3 client: {e}")
        s3 = None

    if s3:
        for idx_post, post in enumerate(batch_queue):
            print(f"\n[{idx_post+1}/{len(batch_queue)}] Processing: {post['slug']}")

            lines_to_process = post['segments']
            tasks = []

            # Prepare Tasks: skip empty/one-character segments and number the
            # rest consecutively so results can be put back in reading order.
            for item in lines_to_process:
                text = item.get('text', '').strip()
                if not text or len(text) < 2:
                    continue
                tasks.append((len(tasks), text, item.get('type', 'sentence')))

            print(f"   Generating {len(tasks)} segments with {len(models_pool)} threads...")

            # Parallel Execution
            results = []
            with concurrent.futures.ThreadPoolExecutor(max_workers=len(models_pool)) as executor:
                # Submit all
                futures = {executor.submit(generate_segment, task): task for task in tasks}

                # Collect as they complete. The counter starts at 1 so the
                # progress line reports how many segments are actually done.
                for done, future in enumerate(concurrent.futures.as_completed(futures), start=1):
                    results.append(future.result())
                    if done % 10 == 0 or done == len(tasks):
                        print(f"   Completed {done}/{len(tasks)} segments...")

            # Sort by original index (completion order is arbitrary)
            results.sort(key=lambda x: x[0])

            # Segments whose generation failed carry no audio and are left out
            # of the track; say so instead of dropping them silently.
            failed_count = sum(1 for res in results if res[1] is None)
            if failed_count:
                print(f"⚠️ {failed_count} segment(s) failed and will be missing from the audio.")

            # Reassemble
            all_wavs = []
            vtt_lines = ["WEBVTT\n"]
            total_samples = 0
            sr = None  # Will determine from first valid segment

            for res in results:
                idx, audio_chunk, silence_chunk, res_sr, text = res
                if audio_chunk is None:
                    continue

                # Check SR consistency
                if sr is None:
                    sr = res_sr
                    print(f"   Detected Sample Rate: {sr} Hz")
                elif sr != res_sr:
                    print(f"⚠️ Warning: Sample Rate Mismatch at segment {idx}. Expected {sr}, got {res_sr}. Timing will be off!")

                # Audio Length
                audio_len = len(audio_chunk)

                # VTT Times: the cue covers the spoken audio only, not the
                # silence appended after it.
                start_time_str = format_vtt_time(total_samples / sr)
                end_time_str = format_vtt_time((total_samples + audio_len) / sr)

                vtt_lines.append(f"{start_time_str} --> {end_time_str}")
                vtt_lines.append(f"{text}\n")

                # Append Audio + Silence
                all_wavs.append(audio_chunk)
                all_wavs.append(silence_chunk)

                total_samples += audio_len + len(silence_chunk)

            # --- SAVING ---
            if all_wavs and sr:
                BASE_FILENAME = post['slug']
                OUTPUT_FILENAME_MP3 = f"{BASE_FILENAME}.mp3"
                OUTPUT_FILENAME_VTT = f"{BASE_FILENAME}.vtt"

                # WAV
                temp_wav = "temp_output.wav"
                final_wav = np.concatenate(all_wavs)
                sf.write(temp_wav, final_wav, sr)
                final_duration_wav = len(final_wav) / sr

                print(f"   WAV Duration: {final_duration_wav:.3f}s")

                # MP3 (Use CBR 192k and explicit AR to avoid drift)
                subprocess.run(
                    [
                        "ffmpeg", "-y", "-i", temp_wav,
                        "-codec:a", "libmp3lame",
                        "-b:a", "192k",       # Constant Bitrate for better timing consistency
                        "-ar", str(sr),       # Enforce same sample rate
                        "-map_metadata", "-1",
                        OUTPUT_FILENAME_MP3
                    ],
                    check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
                )
                # The intermediate WAV is only needed for encoding. It is
                # removed after a successful encode and kept if ffmpeg fails,
                # so the generated audio can still be inspected.
                if os.path.exists(temp_wav):
                    os.remove(temp_wav)

                # Check Drift from MP3 encoding
                final_duration_mp3 = get_audio_duration(OUTPUT_FILENAME_MP3)
                drift = final_duration_mp3 - final_duration_wav
                print(f"   MP3 Duration: {final_duration_mp3:.3f}s (Drift: {drift:.4f}s)")

                if abs(drift) > 0.5:
                    print("⚠️ Significant MP3 drift detected! VTT might be desynchronized.")

                # VTT
                with open(OUTPUT_FILENAME_VTT, "w", encoding="utf-8") as f:
                    f.write("\n".join(vtt_lines))

                # Upload
                print(f"   Uploading...")
                with open(OUTPUT_FILENAME_MP3, "rb") as f:
                    s3.upload_fileobj(f, S3_BUCKET_NAME, os.path.basename(OUTPUT_FILENAME_MP3))
                with open(OUTPUT_FILENAME_VTT, "rb") as f:
                    s3.upload_fileobj(f, S3_BUCKET_NAME, os.path.basename(OUTPUT_FILENAME_VTT))

                # GitHub Dispatch
                if GITHUB_TOKEN and "YOUR_GITHUB" not in GITHUB_TOKEN:
                    print(f"   Triggering Workflow...")
                    dispatch_url = f"https://api.github.com/repos/{GITHUB_REPO}/dispatches"
                    headers = {
                        "Accept": "application/vnd.github.v3+json",
                        "Authorization": f"token {GITHUB_TOKEN}"
                    }
                    payload = {
                        "event_type": "audio-ready",
                        "client_payload": {
                            "slug": BASE_FILENAME,
                            "mp3_url": f"{PUBLIC_URL_BASE}{os.path.basename(OUTPUT_FILENAME_MP3)}",
                            "vtt_url": f"{PUBLIC_URL_BASE}{os.path.basename(OUTPUT_FILENAME_VTT)}"
                        }
                    }
                    # A timeout keeps a stalled API call from hanging the batch.
                    r = requests.post(dispatch_url, headers=headers, json=payload, timeout=30)
                    if r.status_code == 204:
                        print("   ✅ Dispatch Sent.")
                    else:
                        print(f"   ❌ Dispatch Failed: {r.status_code}")

                print(f"✅ Finished: {post['slug']}")

    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
    print("\n🏁 Dual-GPU Batch Processing Complete.")
