# Training the model on specific emotional datasets (Cloning)








In [3]:
!pip install kagglehub

import kagglehub
import os
from glob import glob

# 1. Download the full dataset
data_path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

# The actual audio files are usually nested inside a specific folder within the download path
# Use glob to find all .wav files in the downloaded structure
all_files = glob(os.path.join(data_path, '**', '*.wav'), recursive=True)

# 2. Define target emotions and actor
# We will choose a single actor (e.g., Actor 12 - female)
# And a set of target emotions we want to test
TARGET_ACTOR = '12'

# We will use the 'strong' intensity (02) for maximum style transfer
# Emotion IDs: 01=Neutral, 03=Happy, 04=Sad, 05=Angry, 06=Fearful
TARGET_EMOTIONS = ['01', '03', '04', '05', '06']

print(f"Total files downloaded: {len(all_files)}")


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Karim\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Total files downloaded: 2880


In [4]:
# 3. Create a dictionary to store the best reference clip found for each emotion
reference_clips = {}
EMOTION_MAP = {
    '01': 'Neutral', '02': 'Calm', '03': 'Happy', '04': 'Sad',
    '05': 'Angry', '06': 'Fearful', '07': 'Disgust', '08': 'Surprised'
}

# 4. Loop through all files and apply the filters
for file_path in all_files:
    filename = os.path.basename(file_path)

    # Split the filename identifier (e.g., ['03', '01', '06', '01', '02', '01', '12'])
    parts = filename.split('.')[0].split('-')

    # Sanity check: ensure it's a speech file
    if parts[1] != '01': # Vocal channel (01 = speech)
        continue

    emotion_id = parts[2]
    intensity_id = parts[3]
    actor_id = parts[6]

    # Filter 1: Check if the file is from the target actor
    if actor_id != TARGET_ACTOR:
        continue

    # Filter 2: Check if the file is a target emotion
    if emotion_id not in TARGET_EMOTIONS:
        continue

    # Filter 3: Prioritize Strong intensity (02), except for Neutral (which only has 01)
    # We will take the first one found that meets the criteria
    if (emotion_id == '01' and intensity_id == '01') or \
       (emotion_id != '01' and intensity_id == '02'):

        emotion_name = EMOTION_MAP[emotion_id]

        # We only need one example per emotion
        if emotion_name not in reference_clips:
            reference_clips[emotion_name] = file_path

# 5. Print the selected files and create a clean directory for them
output_dir = 'ravdess_reference_clips'
os.makedirs(output_dir, exist_ok=True)
print("\n--- Selected Reference Clips ---")

for emotion, source_path in reference_clips.items():

    # Create a clean, easy-to-use filename for the Coqui model
    target_filename = f"{emotion.lower()}_ref_actor{TARGET_ACTOR}.wav"
    target_path = os.path.join(output_dir, target_filename)

    # Copy the file to the new directory
    os.system(f"cp \"{source_path}\" \"{target_path}\"")

    print(f"[{emotion}]: {os.path.basename(source_path)} copied to {target_path}")

print(f"\nPreparation complete. {len(reference_clips)} clips ready in the '{output_dir}' directory.")


--- Selected Reference Clips ---
[Neutral]: 03-01-01-01-01-01-12.wav copied to ravdess_reference_clips\neutral_ref_actor12.wav
[Happy]: 03-01-03-02-01-01-12.wav copied to ravdess_reference_clips\happy_ref_actor12.wav
[Sad]: 03-01-04-02-01-01-12.wav copied to ravdess_reference_clips\sad_ref_actor12.wav
[Angry]: 03-01-05-02-01-01-12.wav copied to ravdess_reference_clips\angry_ref_actor12.wav
[Fearful]: 03-01-06-02-01-01-12.wav copied to ravdess_reference_clips\fearful_ref_actor12.wav

Preparation complete. 5 clips ready in the 'ravdess_reference_clips' directory.


In [5]:
!pip install TTS
import torch
import os
!pip install coqui-tts
from TTS.api import TTS

# 1. Define Model and Target Text
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
TARGET_TEXT = "The fox jumps over the lazy dog."
LANGUAGE = "en" # English

# 2. Get device (use CUDA if available for speed)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 3. Initialize XTTS model
# This will download the model weights if not already present (can take a few minutes the first time)
print("Loading XTTS-v2 model...")
tts = TTS(MODEL_NAME).to(device)
print("Model loaded successfully.")

# 4. Define your reference and output directories
REF_DIR = 'ravdess_reference_clips'
OUTPUT_DIR_XTTS = 'generated_xtts_cloning'
os.makedirs(OUTPUT_DIR_XTTS, exist_ok=True)

# List of your prepared reference files:
# This list corresponds to the files you generated in the previous step's output.
reference_files = [f for f in os.listdir(REF_DIR) if f.endswith('.wav')]


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Karim\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting TTS
  Downloading TTS-0.22.0.tar.gz (1.7 MB)
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
      --------------------------------------- 0.0/1.7 MB 217.9 kB/s eta 0:00:08
     -- ------------------------------------- 0.1/1.7 MB 599.1 kB/s eta 0:00:03
     ---------- ----------------------------- 0.4/1.7 MB 1.8 MB/s eta 0:00:01
     ------------------- -------------------- 0.8/1.7 MB 2.9 MB/s eta 0:00:01
     --------------------------------- ------ 1.4/1.7 MB 4.2 MB/s eta 0:00:01
     ---------------------------------------  1.6/1.7 MB 4.6 MB/s eta 0:00:01
     ---------------------------------------  1.6/1.7 MB 4.6 MB/s eta 0:00:01
     ---------------------------------------- 1.7/1.7 MB 3.9 MB/s eta 0:00:00
  Installing build dependencies:

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tts 0.22.0 requires gruut[de,es,fr]==2.2.3, but you have gruut 2.4.0 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Karim\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


ImportError: coqui-tts switched to a forked version of Coqpit, but you still have the original package installed. Run the following to avoid conflicts:
  pip uninstall coqpit
  pip install coqpit-config

In [6]:
# 5. Define target text, directories, and list of references
TARGET_TEXT = "Karim is amazing, I love Karim."
REF_DIR = 'ravdess_reference_clips'
OUTPUT_DIR_XTTS = 'generated_xtts_cloning'
# Note: You already created the output directory and defined reference_files earlier,
# but we'll redefine the files list here in case the kernel was reset.

import os
# Assumes files are in the ravdess_reference_clips directory
reference_files = [f for f in os.listdir(REF_DIR) if f.endswith('.wav')]
os.makedirs(OUTPUT_DIR_XTTS, exist_ok=True)

# 6. Loop through each reference clip and generate the emotional output
print("\n--- Starting XTTS Generation (Voice Cloning) ---")
for ref_file in reference_files:
    # Example ref_file: sad_ref_actor12.wav
    emotion_name = ref_file.split('_')[0].capitalize()
    ref_path = os.path.join(REF_DIR, ref_file)

    output_filename = f"{emotion_name}_XTTS_Clone.wav"
    output_path = os.path.join(OUTPUT_DIR_XTTS, output_filename)

    print(f"Generating [{emotion_name}] using {ref_file}...")

    try:
        # XTTS-v2 Generation command: uses the ref_path as the speaker_wav
        tts.tts_to_file(
            text=TARGET_TEXT,
            speaker_wav=ref_path,
            language="en", # Must match the language used for the model and text
            file_path=output_path,
        )
        print(f"SUCCESS: {output_filename}")

    except Exception as e:
        print(f"ERROR generating {emotion_name}: {e}")

print(f"\nXTTS generation complete. {len(reference_files)} samples saved in '{OUTPUT_DIR_XTTS}'.")


--- Starting XTTS Generation (Voice Cloning) ---

XTTS generation complete. 0 samples saved in 'generated_xtts_cloning'.


# Suno Bark Generation (Text Prompting)