# Training the model on specific emotional datasets (Cloning)








In [None]:
!pip install TTS
!pip install TTS
!pip install coqui-tts
!pip uninstall torchvision -y

In [1]:
import kagglehub
import os
from glob import glob

# 1. Download the full dataset
data_path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

# The actual audio files are usually nested inside a specific folder within the download path
# Use glob to find all .wav files in the downloaded structure
all_files = glob(os.path.join(data_path, '**', '*.wav'), recursive=True)

# 2. Define target emotions and actor
# We will choose a single actor (e.g., Actor 12 - female)
# And a set of target emotions we want to test
TARGET_ACTOR = '12'

# We will use the 'strong' intensity (02) for maximum style transfer
# Emotion IDs: 01=Neutral, 03=Happy, 04=Sad, 05=Angry, 06=Fearful
TARGET_EMOTIONS = ['01', '03', '04', '05', '06']

print(f"Total files downloaded: {len(all_files)}")

Using Colab cache for faster access to the 'ravdess-emotional-speech-audio' dataset.
Total files downloaded: 2880


In [2]:
# 3. Create a dictionary to store the best reference clip found for each emotion
reference_clips = {}
EMOTION_MAP = {
    '01': 'Neutral', '02': 'Calm', '03': 'Happy', '04': 'Sad',
    '05': 'Angry', '06': 'Fearful', '07': 'Disgust', '08': 'Surprised'
}

# 4. Loop through all files and apply the filters
for file_path in all_files:
    filename = os.path.basename(file_path)

    # Split the filename identifier (e.g., ['03', '01', '06', '01', '02', '01', '12'])
    parts = filename.split('.')[0].split('-')

    # Sanity check: ensure it's a speech file
    if parts[1] != '01': # Vocal channel (01 = speech)
        continue

    emotion_id = parts[2]
    intensity_id = parts[3]
    actor_id = parts[6]

    # Filter 1: Check if the file is from the target actor
    if actor_id != TARGET_ACTOR:
        continue

    # Filter 2: Check if the file is a target emotion
    if emotion_id not in TARGET_EMOTIONS:
        continue

    # Filter 3: Prioritize Strong intensity (02), except for Neutral (which only has 01)
    # We will take the first one found that meets the criteria
    if (emotion_id == '01' and intensity_id == '01') or \
       (emotion_id != '01' and intensity_id == '02'):

        emotion_name = EMOTION_MAP[emotion_id]

        # We only need one example per emotion
        if emotion_name not in reference_clips:
            reference_clips[emotion_name] = file_path

# 5. Print the selected files and create a clean directory for them
output_dir = 'ravdess_reference_clips'
os.makedirs(output_dir, exist_ok=True)
print("\n--- Selected Reference Clips ---")

for emotion, source_path in reference_clips.items():

    # Create a clean, easy-to-use filename for the Coqui model
    target_filename = f"{emotion.lower()}_ref_actor{TARGET_ACTOR}.wav"
    target_path = os.path.join(output_dir, target_filename)

    # Copy the file to the new directory
    os.system(f"cp \"{source_path}\" \"{target_path}\"")

    print(f"[{emotion}]: {os.path.basename(source_path)} copied to {target_path}")

print(f"\nPreparation complete. {len(reference_clips)} clips ready in the '{output_dir}' directory.")


--- Selected Reference Clips ---
[Sad]: 03-01-04-02-02-02-12.wav copied to ravdess_reference_clips/sad_ref_actor12.wav
[Neutral]: 03-01-01-01-02-02-12.wav copied to ravdess_reference_clips/neutral_ref_actor12.wav
[Happy]: 03-01-03-02-02-01-12.wav copied to ravdess_reference_clips/happy_ref_actor12.wav
[Angry]: 03-01-05-02-01-01-12.wav copied to ravdess_reference_clips/angry_ref_actor12.wav
[Fearful]: 03-01-06-02-02-01-12.wav copied to ravdess_reference_clips/fearful_ref_actor12.wav

Preparation complete. 5 clips ready in the 'ravdess_reference_clips' directory.


In [3]:

from TTS.api import TTS
import torch
import os

# 1. Define Model and Target Text
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
TARGET_TEXT = "The fox jumps over the lazy dog."
LANGUAGE = "en" # English

# 2. Get device (use CUDA if available for speed)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 3. Initialize XTTS model
# This will download the model weights if not already present (can take a few minutes the first time)
print("Loading XTTS-v2 model...")
tts = TTS(MODEL_NAME).to(device)
print("Model loaded successfully.")

# 4. Define your reference and output directories
REF_DIR = 'ravdess_reference_clips'
OUTPUT_DIR_XTTS = 'generated_xtts_cloning'
os.makedirs(OUTPUT_DIR_XTTS, exist_ok=True)

# List of your prepared reference files:
# This list corresponds to the files you generated in the previous step's output.
reference_files = [f for f in os.listdir(REF_DIR) if f.endswith('.wav')]

Using device: cpu
Loading XTTS-v2 model...
 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 | | > y


100%|██████████| 1.87G/1.87G [00:39<00:00, 47.8MiB/s]
4.37kiB [00:00, 3.70MiB/s]
361kiB [00:00, 27.8MiB/s]
100%|██████████| 32.0/32.0 [00:00<00:00, 53.2kiB/s]
100%|██████████| 7.75M/7.75M [00:00<00:00, 49.8MiB/s]


Model loaded successfully.


In [4]:
# 5. Define target text, directories, and list of references
TARGET_TEXT = "Mert is amazing, I love Karim."
REF_DIR = 'ravdess_reference_clips'
OUTPUT_DIR_XTTS = 'generated_xtts_cloning'
# Note: You already created the output directory and defined reference_files earlier,
# but we'll redefine the files list here in case the kernel was reset.

import os
# Assumes files are in the ravdess_reference_clips directory
reference_files = [f for f in os.listdir(REF_DIR) if f.endswith('.wav')]
os.makedirs(OUTPUT_DIR_XTTS, exist_ok=True)

# 6. Loop through each reference clip and generate the emotional output
print("\n--- Starting XTTS Generation (Voice Cloning) ---")
for ref_file in reference_files:
    # Example ref_file: sad_ref_actor12.wav
    emotion_name = ref_file.split('_')[0].capitalize()
    ref_path = os.path.join(REF_DIR, ref_file)

    output_filename = f"{emotion_name}_XTTS_Clone.wav"
    output_path = os.path.join(OUTPUT_DIR_XTTS, output_filename)

    print(f"Generating [{emotion_name}] using {ref_file}...")

    try:
        # XTTS-v2 Generation command: uses the ref_path as the speaker_wav
        tts.tts_to_file(
            text=TARGET_TEXT,
            speaker_wav=ref_path,
            language="en", # Must match the language used for the model and text
            file_path=output_path,
        )
        print(f"SUCCESS: {output_filename}")

    except Exception as e:
        print(f"ERROR generating {emotion_name}: {e}")

print(f"\nXTTS generation complete. {len(reference_files)} samples saved in '{OUTPUT_DIR_XTTS}'.")


--- Starting XTTS Generation (Voice Cloning) ---
Generating [Neutral] using neutral_ref_actor12.wav...


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


SUCCESS: Neutral_XTTS_Clone.wav
Generating [Angry] using angry_ref_actor12.wav...
SUCCESS: Angry_XTTS_Clone.wav
Generating [Happy] using happy_ref_actor12.wav...
SUCCESS: Happy_XTTS_Clone.wav
Generating [Fearful] using fearful_ref_actor12.wav...
SUCCESS: Fearful_XTTS_Clone.wav
Generating [Sad] using sad_ref_actor12.wav...
SUCCESS: Sad_XTTS_Clone.wav

XTTS generation complete. 5 samples saved in 'generated_xtts_cloning'.


In [5]:
import os
import torch
import torchaudio
import numpy as np
from transformers import pipeline

# --- NEW CONFIGURATION ---
# The folder containing your synthesized emotional audio files (from the XTTS step)
output_folder_xtts = "generated_xtts_cloning"
generated_files_xtts = [os.path.join(output_folder_xtts, f) for f in os.listdir(output_folder_xtts) if f.endswith(".wav")]

# Define your expected emotions for comparison (using the XTTS naming convention)
EXPECTED_EMOTIONS_XTTS = {
    'Angry_XTTS_Clone.wav': 'Angry',
    'Fearful_XTTS_Clone.wav': 'Fearful',
    'Happy_XTTS_Clone.wav': 'Happy',
    'Neutral_XTTS_Clone.wav': 'Neutral',
    'Sad_XTTS_Clone.wav': 'Sad',
}
# -------------------------

# --- 1. Load the Speech Emotion Recognition (SER) Pipeline (REUSED) ---
# NOTE: If the kernel has been restarted, you must re-run the SER model loading part first!
print("Loading Speech Emotion Recognition model...")
try:
    emotion_classifier = pipeline(
        "audio-classification",
        model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
        device=-1 # Run on CPU
    )
    TARGET_SR = emotion_classifier.feature_extractor.sampling_rate
    print(f"SER model loaded successfully. Target Sample Rate: {TARGET_SR} Hz")
except Exception as e:
    print(f"Failed to load SER model. Error: {e}")
    exit()

# --- 2. Predict Emotion for Each XTTS Generated File ---

print("\n=== XTTS Emotion Recognition Results ===")
print("{:<25} {:<10} {:<10} {:<15}".format("Audio File", "Target", "Predicted", "Confidence"))
print("-" * 60)

for file in generated_files_xtts:
    filename = os.path.basename(file)
    target_emotion = EXPECTED_EMOTIONS_XTTS.get(filename, "N/A")

    try:
        # Load audio using torchaudio (the safe, pure-Python way)
        audio_data, sr = torchaudio.load(file)

        # Resample if needed to match the SER model's requirement
        if sr != TARGET_SR:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
            audio_data = resampler(audio_data)

        # Convert to a 1D NumPy array for the classifier
        raw_audio_array = audio_data.mean(dim=0).cpu().numpy()

    except Exception as e:
        print(f"Error loading {filename} with torchaudio: {e}")
        continue

    # Pass the raw NumPy array (not the file path) to the classifier
    prediction = emotion_classifier(raw_audio_array)

    # Process the top prediction
    if prediction:
        predicted_emotion = prediction[0]['label'].capitalize()
        confidence = prediction[0]['score']

        match_status = "✅ Match" if predicted_emotion == target_emotion else "❌ Mismatch"

        print("{:<25} {:<10} {:<10} {:<15.3f} ({})".format(
            filename,
            target_emotion,
            predicted_emotion,
            confidence,
            match_status
        ))

print("\nXTTS Evaluation complete. Compare these results to the Chatterbox results!")

Loading Speech Emotion Recognition model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

Device set to use cpu


SER model loaded successfully. Target Sample Rate: 16000 Hz

=== XTTS Emotion Recognition Results ===
Audio File                Target     Predicted  Confidence     
------------------------------------------------------------
Angry_XTTS_Clone.wav      Angry      Sad        0.140           (❌ Mismatch)
Fearful_XTTS_Clone.wav    Fearful    Fearful    0.131           (✅ Match)
Neutral_XTTS_Clone.wav    Neutral    Neutral    0.133           (✅ Match)
Happy_XTTS_Clone.wav      Happy      Sad        0.131           (❌ Mismatch)
Sad_XTTS_Clone.wav        Sad        Sad        0.139           (✅ Match)

XTTS Evaluation complete. Compare these results to the Chatterbox results!


In [7]:
!pip install openai-whisper jiwer

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hd

In [8]:

import whisper
import os
from jiwer import wer
import torchaudio
import torch

# Load Whisper
model = whisper.load_model("base")

# Your target text
TARGET_TEXT = "Mert is amazing, I love Karim." # Corrected case/punctuation for consistency

# Folder with your generated audios
# VVVV --- CHANGE THIS LINE --- VVVV
output_folder = "generated_xtts_cloning"
# ^^^^ ------------------------ ^^^^
generated_files = [os.path.join(output_folder, f) for f in os.listdir(output_folder) if f.endswith(".wav")]

print("=== XTTS WER Evaluation (Using torchaudio to bypass FFmpeg) ===")
for file in generated_files:
    try:
        # Load audio data and its sample rate (sr)
        audio_data, sr = torchaudio.load(file)

        # Resample to 16000 Hz, which is Whisper's required sample rate
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            audio_data = resampler(audio_data)

        # Take the mean across channels (if stereo) and convert to 1D tensor/array
        audio_tensor = audio_data.mean(dim=0)

    except Exception as e:
        print(f"Error loading {os.path.basename(file)} with torchaudio: {e}")
        continue

    # Transcribe using the loaded audio tensor (bypasses whisper's internal audio loader)
    # Whisper expects a numpy array
    result = model.transcribe(audio_tensor.cpu().numpy())

    transcription = result["text"]
    # Calculate Word Error Rate
    score = wer(TARGET_TEXT.lower(), transcription.lower())

    print(f"{os.path.basename(file)} -> WER: {score:.3f}, Transcription: {transcription}")

print("\nEvaluation complete. XTTS WER results are ready!")

100%|███████████████████████████████████████| 139M/139M [00:05<00:00, 26.5MiB/s]


=== XTTS WER Evaluation (Using torchaudio to bypass FFmpeg) ===


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Angry_XTTS_Clone.wav -> WER: 0.333, Transcription:  MIRT is amazing. I love Karim.
Fearful_XTTS_Clone.wav -> WER: 0.500, Transcription:  Merism-bazing, I love Karim.
Neutral_XTTS_Clone.wav -> WER: 0.333, Transcription:  Merch is amazing, I love Kareem.
Happy_XTTS_Clone.wav -> WER: 0.500, Transcription:  MIRT is amazing. I love KERIP.
Sad_XTTS_Clone.wav -> WER: 0.500, Transcription:  Merit is amazing. I love current.

Evaluation complete. XTTS WER results are ready!
