## Original pipeline (one TTS call per transcript line)

In [None]:
import os
import time
import google.generativeai as genai

from TTS.api import TTS
from pydub import AudioSegment
from tqdm import tqdm

# Initialize genai
# The API key is read from the environment instead of being embedded in the
# notebook, so the secret is never saved or shared together with the file.
# Any key that was previously stored in this cell should be treated as leaked
# and revoked.
GENAI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GENAI_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )
genai.configure(api_key=GENAI_API_KEY)
model = genai.GenerativeModel("gemini-2.5-flash")

def detect_gender(name):
    """Ask the Gemini model whether ``name`` is a male or a female name.

    Returns "female" when the lower-cased reply contains the word "female";
    every other reply is treated as "male".
    """
    question = f"Determine the gender of this person from their name: {name}. Answer 'male' or 'female'."
    reply = model.generate_content(question).text.lower()
    return "female" if "female" in reply else "male"
    
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name)

# Get available speakers
all_speakers = tts.speakers

# Name fragments used to sort speakers into pools: a speaker joins a pool when
# its lower-cased name contains any fragment listed for that pool.
male_name_keys = [
    "david", "andrew", "badr", "damien", "gilberto", "ilkin", "kazuhiko",
    "ludvig", "suad", "torcull", "viktor", "zacharie", "xavier", "luis", "marcos",
]
female_name_keys = [
    "claribel", "daisy", "tammie", "alison", "ana", "annmarie", "asya", "brenda",
    "gitta", "henriette", "sofia", "tammy", "tanja", "nova", "maja", "uta",
    "lidiya", "chandra", "szofi", "camilla", "lilya", "zofija", "narelle",
    "barbora", "alexandra", "alma", "rosemary", "ige", "filip", "damjan", "vjollca",
]

# Split speakers into male/female (a speaker may land in both pools if it
# matches fragments from both lists).
male_speakers = []
female_speakers = []
for candidate in all_speakers:
    lowered = candidate.lower()
    if any(key in lowered for key in male_name_keys):
        male_speakers.append(candidate)
    if any(key in lowered for key in female_name_keys):
        female_speakers.append(candidate)

# Voice used for every line that has no explicit speaker.
narrator_voice = "Gracie Wise"

transcript_file = "set/set1/transcripts.txt"
with open(transcript_file, "r", encoding="utf-8") as f:
    # Keep only non-blank lines, with surrounding whitespace removed.
    lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]


# Group the lines into sections: a line starting with "section" (any case)
# closes the section collected so far and becomes the first line of the next.
sections = []
current_section = []
for line in lines:
    opens_section = line.lower().startswith("section")
    if opens_section and current_section:
        sections.append(current_section)
        current_section = []
    current_section.append(line)
if current_section:
    sections.append(current_section)

import zlib  # stdlib; provides a hash that is stable across interpreter runs

# Resolve the output path first and make sure its folder exists, so a missing
# directory is detected before the long TTS run instead of at export time.
output_file = "set/Set1/audio.wav"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

final_audio = AudioSegment.silent(duration=0)

for section_idx, section_lines in enumerate(sections):
    section_audio = AudioSegment.silent(duration=0)

    # Parse lines into (speaker, text); lines without a colon go to the narrator.
    speaker_lines = []
    for line in section_lines:
        if ':' in line:
            speaker, text = line.split(':', 1)
            speaker_lines.append((speaker.strip(), text.strip()))
        else:
            speaker_lines.append(('NARRATOR', line.strip()))

    # Cache speaker -> assigned voice (reset for every section)
    speaker_voice_map = {}

    # Generate TTS for the section once
    print(f"Processing Section {section_idx+1}...")
    for speaker, text in tqdm(speaker_lines, desc=f"Section {section_idx+1} lines"):
        # Assign voice
        if speaker == "NARRATOR":
            voice_name = narrator_voice
        elif speaker in speaker_voice_map:
            voice_name = speaker_voice_map[speaker]
        else:
            gender = detect_gender(speaker)
            voice_pool = male_speakers if gender == "male" else female_speakers
            if voice_pool:
                # crc32 instead of the built-in hash(): str hashes are salted
                # per process, which would pick a different voice on every run.
                voice_name = voice_pool[zlib.crc32(speaker.encode("utf-8")) % len(voice_pool)]
            else:
                # No voice matched this gender; fall back to the narrator
                # rather than failing with a modulo-by-zero error.
                voice_name = narrator_voice
            speaker_voice_map[speaker] = voice_name

        # Generate TTS for this line. The temp name only has to be unique
        # within this process, so the built-in hash() is fine here.
        temp_wav = f"temp_{section_idx}_{hash(speaker+text)}.wav"
        try:
            tts.tts_to_file(language="en", text=text, speaker=voice_name, file_path=temp_wav)
            line_audio = AudioSegment.from_wav(temp_wav)
        finally:
            # Always clean up the temp file, even when synthesis or loading fails.
            if os.path.exists(temp_wav):
                os.remove(temp_wav)

        # Add small pause after line (0.5 sec)
        line_audio += AudioSegment.silent(duration=500)

        section_audio += line_audio

    # Repeat the section audio twice
    section_audio = section_audio + section_audio

    # Add section pause (30 sec)
    section_audio += AudioSegment.silent(duration=30000)

    final_audio += section_audio

final_audio.export(output_file, format="wav")
print(f"Audio generated successfully: {output_file}")


## Modified version (sentence-level chunking before TTS)

In [1]:
import os
import time
import nltk
import google.generativeai as genai

from TTS.api import TTS
from pydub import AudioSegment
from tqdm import tqdm

# ----------------------------------------
# INITIAL SETUP
# ----------------------------------------

nltk.download("punkt")

# The Gemini API key is read from the environment instead of being embedded in
# the notebook, so the secret is never saved or shared together with the file.
# Any key that was previously stored in this cell should be treated as leaked
# and revoked.
GENAI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GENAI_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )
genai.configure(api_key=GENAI_API_KEY)
model = genai.GenerativeModel("gemini-2.5-flash")

model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name)

all_speakers = tts.speakers

# Name fragments used to sort the model's speakers into voice pools: a speaker
# joins a pool when its lower-cased name contains any fragment of that pool.
MALE_NAME_KEYS = (
    "david", "andrew", "badr", "damien", "gilberto", "ilkin", "kazuhiko",
    "ludvig", "suad", "torcull", "viktor", "zacharie", "xavier", "luis", "marcos",
)
# NOTE(review): "filip" and "damjan" are conventionally male given names —
# confirm that these voices really belong in the female pool.
FEMALE_NAME_KEYS = (
    "claribel", "daisy", "tammie", "alison", "ana", "annmarie", "asya", "brenda",
    "gitta", "henriette", "sofia", "tammy", "tanja", "nova", "maja", "uta",
    "lidiya", "chandra", "szofi", "camilla", "lilya", "zofija", "narelle",
    "barbora", "alexandra", "alma", "rosemary", "ige", "filip", "damjan", "vjollca",
)


def _speakers_matching(name_keys):
    """Return the speakers whose lower-cased name contains any of ``name_keys``."""
    return [s for s in all_speakers if any(key in s.lower() for key in name_keys)]


male_speakers = _speakers_matching(MALE_NAME_KEYS)
female_speakers = _speakers_matching(FEMALE_NAME_KEYS)

# Voice used for every line that has no explicit speaker.
narrator_voice = "Gracie Wise"


# ----------------------------------------
# FUNCTIONS
# ----------------------------------------

# Answers already obtained from the model, keyed by the exact name queried.
_GENDER_CACHE = {}


def detect_gender(name):
    """Use the Gemini API to classify ``name`` as "male" or "female".

    The answer for each name is cached for the lifetime of the kernel, so a
    speaker who appears in several sections triggers only one API call and
    always receives the same classification.

    Args:
        name: Speaker name exactly as it appears in the transcript.

    Returns:
        "female" when the lower-cased model reply contains "female",
        otherwise "male".
    """
    if name not in _GENDER_CACHE:
        prompt = f"Determine the gender of this person from their name: {name}. Answer only 'male' or 'female'."
        response = model.generate_content(prompt)
        text = response.text.lower()
        # "female" is tested because "male" is a substring of "female".
        _GENDER_CACHE[name] = "female" if "female" in text else "male"
    return _GENDER_CACHE[name]


def split_into_sentences(text):
    """Tokenize ``text`` into a list of sentences with NLTK's ``sent_tokenize``."""
    sentences = nltk.sent_tokenize(text)
    return sentences


def split_long_sentence(sentence, max_len=230):
    """Break a sentence into chunks of at most ``max_len`` characters.

    A sentence that already fits is returned unchanged as a single chunk.
    Otherwise it is cut at commas first: consecutive clauses are packed
    together (joined by a space, each keeping its comma) for as long as the
    result still fits. A clause that is too long on its own is packed word by
    word, and a single word longer than ``max_len`` is sliced.

    Args:
        sentence: Text to split.
        max_len: Maximum number of characters allowed per chunk.

    Returns:
        A list of non-empty chunks, each no longer than ``max_len``. A blank
        sentence yields an empty list.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    if len(sentence) <= max_len:
        # Nothing to split; a blank sentence produces no chunk at all.
        return [sentence] if sentence.strip() else []

    # Comma-delimited clauses; every clause that was followed by a comma keeps
    # it, and empty fields (", ," or a trailing comma) are dropped.
    raw_pieces = sentence.split(",")
    last_index = len(raw_pieces) - 1
    clauses = []
    for index, raw in enumerate(raw_pieces):
        piece = raw.strip()
        if not piece:
            continue
        clauses.append(piece + "," if index < last_index else piece)

    parts = []
    current = ""
    for clause in clauses:
        if len(clause) > max_len:
            # Too long even alone: flush what is pending, then split by words.
            if current:
                parts.append(current)
                current = ""
            parts.extend(_pack_words(clause.split(), max_len))
        elif current and len(current) + 1 + len(clause) <= max_len:
            current += " " + clause
        else:
            if current:
                parts.append(current)
            current = clause
    if current:
        parts.append(current)
    return parts


def _pack_words(words, max_len):
    """Greedily join ``words`` with spaces into chunks of at most ``max_len`` characters."""
    chunks = []
    current = ""
    for word in words:
        # A word longer than the limit is sliced so no chunk can exceed it.
        for piece in (word[i:i + max_len] for i in range(0, len(word), max_len)):
            if not current:
                current = piece
            elif len(current) + 1 + len(piece) <= max_len:
                current += " " + piece
            else:
                chunks.append(current)
                current = piece
    if current:
        chunks.append(current)
    return chunks


def safe_split_text(text):
    """Split ``text`` into sentences, then each sentence into TTS-sized chunks.

    Returns one flat list holding the chunks of every sentence in order.
    """
    return [
        piece
        for sentence in split_into_sentences(text)
        for piece in split_long_sentence(sentence)
    ]


def load_transcript_sections(filepath):
    """Read a UTF-8 transcript file and group its lines into sections.

    Blank lines are dropped and the remaining lines are stripped. A line that
    starts with "section" (case-insensitive) begins a new section and is kept
    as that section's first line; lines before the first such header form a
    section of their own.

    Returns:
        A list of sections, each a list of stripped, non-empty lines.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        content = [entry for entry in stripped if entry]

    grouped = []
    for entry in content:
        # Open a new group for the very first line and for every header line.
        if not grouped or entry.lower().startswith("section"):
            grouped.append([])
        grouped[-1].append(entry)
    return grouped


def parse_speaker_lines(section_lines):
    """Turn raw section lines into ``(speaker, text)`` pairs.

    A line is cut at its first colon: the part before it is the speaker and
    the rest is the spoken text, both stripped. A line without any colon is
    attributed to "NARRATOR".
    """
    pairs = []
    for entry in section_lines:
        name, colon, spoken = entry.partition(":")
        if colon:
            pairs.append((name.strip(), spoken.strip()))
        else:
            pairs.append(("NARRATOR", entry.strip()))
    return pairs


def assign_voice(speaker, speaker_voice_map):
    """Return the TTS voice for ``speaker``, assigning one on first use.

    "NARRATOR" always gets ``narrator_voice`` and is never cached. Any other
    speaker is looked up in ``speaker_voice_map``; on a miss the gender is
    detected, a voice is picked from the matching pool and the choice is
    stored in the map.

    Args:
        speaker: Speaker name as parsed from the transcript.
        speaker_voice_map: Dict of speaker -> voice; updated in place.

    Returns:
        The name of the voice to use for this speaker.
    """
    import zlib  # stdlib; imported locally so the function is self-contained

    if speaker == "NARRATOR":
        return narrator_voice

    if speaker in speaker_voice_map:
        return speaker_voice_map[speaker]

    gender = detect_gender(speaker)
    pool = male_speakers if gender == "male" else female_speakers
    if pool:
        # crc32 instead of the built-in hash(): str hashes are salted per
        # process, which would hand the speaker a different voice on every run.
        voice = pool[zlib.crc32(speaker.encode("utf-8")) % len(pool)]
    else:
        # No voice matched this gender; fall back to the narrator rather than
        # failing with a modulo-by-zero error.
        voice = narrator_voice

    speaker_voice_map[speaker] = voice
    return voice


def generate_tts_chunk(text, voice, temp_name):
    """Synthesize one chunk of text and return it as an ``AudioSegment``.

    The audio is written to ``temp_name`` by the TTS engine, loaded back with
    pydub, and the temporary file is removed afterwards — also when synthesis
    or loading raises, so failed runs do not leave stray WAV files behind.

    Args:
        text: Text to speak (English).
        voice: Name of the TTS speaker to use.
        temp_name: Path of the temporary WAV file.

    Returns:
        The audio loaded from the generated WAV file.
    """
    try:
        tts.tts_to_file(
            language="en",
            text=text,
            speaker=voice,
            file_path=temp_name
        )
        return AudioSegment.from_wav(temp_name)
    finally:
        # The file may not exist if synthesis failed before writing it.
        if os.path.exists(temp_name):
            os.remove(temp_name)


# ----------------------------------------
# MAIN AUDIO GENERATION
# ----------------------------------------

transcript_file = "set/set1/transcripts.txt"
output_file = "set/Set1/audio.wav"

# Make sure the output folder exists before the long synthesis run starts, so
# a missing directory cannot make the final export fail after all the work.
# NOTE(review): the input path uses "set/set1" while the output uses
# "set/Set1" — these are the same folder only on case-insensitive
# filesystems; confirm which spelling is intended.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

sections = load_transcript_sections(transcript_file)

final_audio = AudioSegment.silent(duration=0)

for section_idx, section_lines in enumerate(sections):

    print(f"\n--- Processing Section {section_idx + 1} ---")
    speaker_lines = parse_speaker_lines(section_lines)
    # Voice assignments are cached per section.
    speaker_voice_map = {}
    section_audio = AudioSegment.silent(duration=0)

    for speaker, text in tqdm(speaker_lines, desc=f"Section {section_idx+1} lines"):

        voice_name = assign_voice(speaker, speaker_voice_map)

        # Split text into safe chunks
        chunks = safe_split_text(text)

        for chunk in chunks:
            # Skip blank chunks instead of sending empty text to the TTS engine.
            if not chunk.strip():
                continue

            # The temp name only has to be unique within this process.
            temp_wav = f"temp_{section_idx}_{hash(speaker + chunk)}.wav"
            line_audio = generate_tts_chunk(chunk, voice_name, temp_wav)

            # Add pause (0.4 sec) after every chunk
            line_audio += AudioSegment.silent(duration=400)
            section_audio += line_audio

    # Repeat section twice
    section_audio = section_audio + section_audio

    # Add 30 sec break
    section_audio += AudioSegment.silent(duration=30000)

    final_audio += section_audio


final_audio.export(output_file, format="wav")

print(f"\nAudio generated successfully: {output_file}")


  from .autonotebook import tqdm as notebook_tqdm
Skipping import of cpp extensions due to incompatible torch version 2.8.0+cpu for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1130 21:15:14.613000 9472 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



--- Processing Section 1 ---


Section 1 lines: 100%|██████████| 20/20 [23:10<00:00, 69.53s/it] 



--- Processing Section 2 ---


Section 2 lines: 100%|██████████| 30/30 [10:53<00:00, 21.77s/it]



--- Processing Section 3 ---


Section 3 lines: 100%|██████████| 16/16 [16:42<00:00, 62.64s/it] 



--- Processing Section 4 ---


Section 4 lines: 100%|██████████| 13/13 [18:15<00:00, 84.31s/it] 



Audio generated successfully: set/Set1/audio.wav


## Slowed-down export

In [2]:
# Slow playback down by 5% (0.95x speed).
# The raw samples are re-labelled with a lower frame rate, which stretches the
# audio (and lowers its pitch accordingly); set_frame_rate then converts the
# result back to the original frame rate for export.
SPEED_FACTOR = 0.95
slower_audio = final_audio._spawn(final_audio.raw_data, overrides={
    "frame_rate": int(final_audio.frame_rate * SPEED_FACTOR)
}).set_frame_rate(final_audio.frame_rate)

# Export the slower audio
output_file = "set/Set1/audio_slow.wav"
slower_audio.export(output_file, format="wav")
print(f"Slower audio generated successfully: {output_file}")


Slower audio generated successfully: set/Set1/audio_slow.wav
