<a href="https://colab.research.google.com/github/jaikumar-j/Face-Detect/blob/master/TTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Environment Setup
!pip install -q kokoro>=0.9.4 soundfile pandas
!apt-get -qq -y install espeak-ng > /dev/null 2>&1


In [None]:
# Import libraries
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import pandas as pd
import torch
import os
import numpy as np
from scipy.spatial.distance import cosine

In [None]:
# Make sure the folder that will receive the generated .wav files exists.
os.makedirs("outputs", exist_ok=True)

# 2. Data Preparation

# Location of the metadata CSV -- point this at your actual file.
csv_path = "/metadata.csv"
df = pd.read_csv(csv_path)
print(df.columns)

# Keep only the rows whose 'wav_file' entry is present and is not an
# empty / whitespace-only string; this column is what gets synthesized later.
df = (
    df.dropna(subset=['wav_file'])
      .loc[lambda frame: frame['wav_file'].str.strip().ne('')]
)
print(f"Loaded {len(df)} text samples from CSV.")

# 3. Model Selection & Setup

# Kokoro language code 'a' selects American English (it is not an
# auto-detection mode); it matches the American 'af_heart' voice used below.
pipeline = KPipeline(lang_code='a')

# Kokoro emits audio sampled at 24 kHz. The WAV header and the notebook player
# must both be told this rate; declaring 48 kHz makes every clip play at
# double speed and pitch.
SAMPLE_RATE = 24000

# 4. Audio Synthesis

# Paths of every .wav file written, in generation order.
synthesized_audio_paths = []

for idx, row in df.iterrows():
    text = row['wav_file']
    print(f"Generating audio for row {idx}: {text}")

    # The pipeline yields one (graphemes, phonemes, audio) tuple per chunk of
    # the input, so a single row may produce several audio segments.
    generator = pipeline(text, voice='af_heart')

    for i, (_, _, audio) in enumerate(generator):
        # File name encodes the DataFrame index label and the chunk number.
        filename = f'outputs/sample_{idx}_{i}.wav'
        sf.write(filename, audio, SAMPLE_RATE)
        synthesized_audio_paths.append(filename)
        # Preview only the first chunk of each row inline.
        if i == 0:
            display(Audio(data=audio, rate=SAMPLE_RATE, autoplay=True))

# 5. Evaluation: Audio Similarity Metrics (cosine similarity of time-averaged MFCC vectors)
from python_speech_features import mfcc
import scipy.io.wavfile as wav

def compute_audio_similarity(file1, file2):
    """Return the cosine similarity between the mean MFCC vectors of two WAV files.

    Each file is read with ``scipy.io.wavfile``, converted to MFCC frames, and
    the frames are averaged over time into a single vector per file. The result
    is ``1 - cosine_distance`` of those two vectors.

    Args:
        file1: Path to the first WAV file.
        file2: Path to the second WAV file.

    Returns:
        The similarity score as a float (1.0 means identical direction).
    """
    window_seconds = 0.025  # analysis window length handed to mfcc()

    def _mean_mfcc(path):
        # Time-averaged MFCC vector for one file.
        rate, signal = wav.read(path)
        # mfcc() truncates any frame longer than `nfft` samples (512 by default
        # in the released package). A 25 ms window is 600 samples at 24 kHz,
        # so grow the FFT size to the next power of two that fits the frame.
        frame_len = int(round(window_seconds * rate))
        nfft = 512
        while nfft < frame_len:
            nfft *= 2
        frames = mfcc(signal, rate, winlen=window_seconds, nfft=nfft)
        return np.mean(frames, axis=0)

    return 1 - cosine(_mean_mfcc(file1), _mean_mfcc(file2))

# Sanity check: score the first two generated clips against each other,
# provided at least two clips were produced.
first_pair = synthesized_audio_paths[:2]
if len(first_pair) == 2:
    sim_score = compute_audio_similarity(*first_pair)
    print(f"Similarity between first two audio samples: {sim_score:.4f}")

In [None]:

# 6. Report
# sim_score is only assigned when two or more clips were generated, so it is
# read only in that case; otherwise the report shows a placeholder.
if len(synthesized_audio_paths) >= 2:
    similarity_entry = sim_score
else:
    similarity_entry = "N/A"

# Summary of the run: rows kept from the CSV, clips written, and the score.
report = dict(
    total_inputs=len(df),
    total_outputs=len(synthesized_audio_paths),
    sample_similarity=similarity_entry,
)

print("\n--- PROJECT REPORT ---")
for label, value in report.items():
    print(f"{label}: {value}")