# Audio Generation

In [1]:
# Import necessary libraries
import os
import io
import nltk
import google.generativeai as genai
import soundfile as sf
import numpy as np

from TTS.api import TTS
from pydub import AudioSegment
from IPython.display import Audio

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
Skipping import of cpp extensions due to incompatible torch version 2.8.0+cpu for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1208 23:41:16.228000 20400 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Initialize GenAi
genai.configure(api_key="AIzaSyBS-2pbdjYouOkcqHaX4ZI5HHPpSSmq3iw")
model = genai.GenerativeModel("gemini-2.5-flash")


In [3]:
# Initialize Coqui-TTS
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name)

In [4]:
# Determine Speaker
all_speakers = tts.speakers

male_speakers = [s for s in all_speakers if any(n in s.lower() for n in [
    "david","andrew","badr","damien","gilberto","ilkin","kazuhiko",
    "ludvig","torcull","viktor","zacharie","xavier","luis","marcos"
])]

female_speakers = [s for s in all_speakers if any(n in s.lower() for n in [
    "claribel","daisy","tammie","alison","ana","annmarie","asya","brenda",
    "gitta","henriette","sofia","tammy","tanja","nova","maja","uta",
    "lidiya","chandra","szofi","camilla","lilya","zofija"
])]

male_index = 0
female_index = 0

In [5]:
# Utility Function
def detect_gender(name):
    try:
        r = model.generate_content(
            f"Determine gender (male or female). Name: {name}. Answer only male or female."
        )
        t = r.text.lower()
        if "male" in t: return "male"
        if "female" in t: return "female"
    except:
        pass
    return None

def narrator_voice():
    for s in female_speakers:
        if "daisy" in s.lower():
            return s
    return female_speakers[0]

def assign_voice(name):

    global male_index, female_index
    gender = detect_gender(name)

    if gender == "male":
        v = male_speakers[male_index % len(male_speakers)]
        male_index += 1
        return v

    elif gender == "female":
        v = female_speakers[female_index % len(female_speakers)]
        female_index += 1
        return v

    if male_index <= female_index:
        v = male_speakers[male_index % len(male_speakers)]
        male_index += 1
        return v
    else:
        v = female_speakers[female_index % len(female_speakers)]
        female_index += 1
        return v

In [6]:
def create_audio(file_path):

    folder = os.path.dirname(file_path)
    final_path = os.path.join(folder, "audio.wav")
    os.makedirs(folder, exist_ok=True)

    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    lines = lines[3:]
    voices = {}
    final_audio = AudioSegment.silent(1500)

    current_line = 0
    total_line = len(lines)

    for raw_line in lines:
        current_line += 1

        line = raw_line.strip()
        if not line:
            continue

        if line.lower().startswith("section"):
            final_audio += AudioSegment.silent(1500)
            continue

        if line.lower().startswith("narrator:"):
            spoken = line.split(":", 1)[1].strip()
            speaker = "narrator"
            voices[speaker] = narrator_voice()

        else:
            if ":" in line:
                speaker, spoken = line.split(":", 1)
                speaker = speaker.strip()
                spoken = spoken.strip()
            else:
                speaker = "unknown"
                spoken = line

            if speaker not in voices:
                voices[speaker] = narrator_voice() if speaker == "unknown" else assign_voice(speaker)

        print(f"Converting -> {current_line} / {total_line}")

        audio_np = np.asarray(
            tts.tts(
                text=spoken,
                speaker=voices[speaker],
                language="en"
            )
        )

        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, audio_np, 22050, format="wav")
        wav_buffer.seek(0)
        audio_seg = AudioSegment.from_wav(wav_buffer)

        final_audio += audio_seg

        final_audio += AudioSegment.silent(300)

        if len(spoken.split()) > 18:
            final_audio += AudioSegment.silent(400)

    final_audio.export(final_path, format="wav")
    print("Saved:", final_path)

In [7]:
create_audio("sets/set1/transcript.txt")
create_audio("sets/set2/transcript.txt")


Converting -> 2 / 193
Converting -> 3 / 193
Converting -> 5 / 193
Converting -> 7 / 193
Converting -> 9 / 193
Converting -> 11 / 193
Converting -> 13 / 193
Converting -> 15 / 193
Converting -> 17 / 193
Converting -> 19 / 193
Converting -> 21 / 193
Converting -> 23 / 193
Converting -> 25 / 193
Converting -> 27 / 193
Converting -> 29 / 193
Converting -> 31 / 193
Converting -> 33 / 193
Converting -> 35 / 193
Converting -> 37 / 193
Converting -> 39 / 193
Converting -> 41 / 193
Converting -> 43 / 193
Converting -> 45 / 193
Converting -> 47 / 193
Converting -> 49 / 193
Converting -> 51 / 193
Converting -> 52 / 193
Converting -> 54 / 193
Converting -> 56 / 193
Converting -> 57 / 193
Converting -> 59 / 193
Converting -> 61 / 193
Converting -> 62 / 193
Converting -> 63 / 193
Converting -> 64 / 193
Converting -> 65 / 193
Converting -> 66 / 193
Converting -> 67 / 193
Converting -> 68 / 193
Converting -> 69 / 193
Converting -> 70 / 193
Converting -> 71 / 193
Converting -> 72 / 193
Converting -> 74

The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.


Converting -> 185 / 193
Converting -> 187 / 193
Converting -> 189 / 193
Converting -> 191 / 193
Converting -> 193 / 193
Saved: sets/set1\audio.wav
Converting -> 2 / 253
Converting -> 3 / 253
Converting -> 7 / 253
Converting -> 11 / 253
Converting -> 13 / 253
Converting -> 15 / 253
Converting -> 17 / 253
Converting -> 19 / 253
Converting -> 23 / 253
Converting -> 27 / 253
Converting -> 29 / 253
Converting -> 31 / 253
Converting -> 33 / 253
Converting -> 35 / 253
Converting -> 37 / 253
Converting -> 39 / 253
Converting -> 41 / 253
Converting -> 43 / 253
Converting -> 45 / 253
Converting -> 47 / 253
Converting -> 49 / 253
Converting -> 51 / 253
Converting -> 53 / 253
Converting -> 55 / 253
Converting -> 57 / 253
Converting -> 59 / 253
Converting -> 61 / 253
Converting -> 63 / 253
Converting -> 65 / 253
Converting -> 67 / 253
Converting -> 69 / 253
Converting -> 71 / 253
Converting -> 73 / 253
Converting -> 75 / 253
Converting -> 77 / 253
Converting -> 79 / 253
Converting -> 81 / 253
Conve

The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.


Converting -> 237 / 253
Converting -> 239 / 253
Converting -> 241 / 253
Converting -> 243 / 253


The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.


Converting -> 245 / 253
Converting -> 247 / 253
Converting -> 249 / 253
Converting -> 251 / 253


The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.


Converting -> 253 / 253
Saved: sets/set2\audio.wav
