<a href="https://colab.research.google.com/github/hussainturii/TTS/blob/main/clone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Check GPU availability
!nvidia-smi

# Clone the repository
!git clone https://github.com/SWivid/F5-TTS.git
%cd F5-TTS

Tue Oct  7 21:11:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   60C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Install required packages
!pip install -e .
!pip install cached_path
!pip install librosa soundfile torchaudio
!pip install gradio

Obtaining file:///content/F5-TTS
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bitsandbytes>0.37.0 (from f5-tts==1.1.9)
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting cached_path (from f5-tts==1.1.9)
  Downloading cached_path-1.8.0-py3-none-any.whl.metadata (19 kB)
Collecting ema_pytorch>=0.5.2 (from f5-tts==1.1.9)
  Downloading ema_pytorch-0.7.7-py3-none-any.whl.metadata (689 bytes)
Collecting hydra-core>=1.3.0 (from f5-tts==1.1.9)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting pydantic<=2.10.6 (from f5-tts==1.1.9)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting pypinyin (from f5-tts==1.1.9)
  Downloading pypinyin-0.55.0-py2.py3-none-any.whl.metadata (12 kB)
Coll

In [18]:
# Create a proper dataset directory
!mkdir -p /content/dataset/audio
!mkdir -p /content/dataset/metadata

# Move TSV files to metadata folder
!mv /content/F5-TTS/*.tsv /content/dataset/metadata/

In [3]:
from huggingface_hub import login
import os

In [4]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # 1. Gather raw audio features
        input_feats = [feat["input_features"] for feat in features]
        # 2. Pad them (this returns both 'input_features' and 'attention_mask')
        batch_inputs = self.processor.feature_extractor.pad(
            {"input_features": input_feats},
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # 3. Gather label sequences
        label_ids = [feat["labels"] for feat in features]
        # 4. Pad them (this returns 'input_ids' and its 'attention_mask')
        label_batch = self.processor.tokenizer.pad(
            {"input_ids": label_ids},
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # 5. Replace pad token ids in labels with -100 so they're ignored in loss
        labels = label_batch["input_ids"].masked_fill(
            label_batch["attention_mask"].ne(1), -100
        )

        # 6. If a bos token was prepended earlier, drop it here
        if labels.size(1) > 0 and torch.all(labels[:, 0] == self.decoder_start_token_id):
            labels = labels[:, 1:]

        # 7. Package everything up
        batch_inputs["labels"] = labels
        # and give the decoder its own attention mask
        batch_inputs["decoder_attention_mask"] = label_batch["attention_mask"]

        return batch_inputs

In [18]:
# ----------------------------------
# Configuration – edit as you like
# ----------------------------------
MODEL_ID = "openai/whisper-base"
HF_USERNAME = "newModl"               #  <-- CHANGE
PUSH_MODEL_ID = f"{HF_USERNAME}/whisper-base-urdu-full"
LANG_ID = "ur"
SAMPLING_RATE = 24_000
SEED = 666

In [7]:
pip install datasets==3.6.0

Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0


In [8]:
import os, re, random, torch, numpy as np
from datasets import load_dataset, Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

In [9]:
# Load Common Voice 17.0 Urdu subset
dataset = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    LANG_ID,
    split={
        "train": "train+validation",
        "test":  "test[:600]"
    },
    cache_dir="./hf_cache",
    trust_remote_code=True
)

# Remove imports with missing audio
dataset = dataset.remove_columns(
    [col for col in dataset["train"].column_names if col not in ("audio","sentence")]
)

print(dataset)

README.md:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

common_voice_17_0.py:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/132k [00:00<?, ?B/s]

n_shards.json:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

audio/ur/train/ur_train_0.tar:   0%|          | 0.00/140M [00:00<?, ?B/s]

audio/ur/dev/ur_dev_0.tar:   0%|          | 0.00/102M [00:00<?, ?B/s]

audio/ur/test/ur_test_0.tar:   0%|          | 0.00/116M [00:00<?, ?B/s]

audio/ur/other/ur_other_0.tar:   0%|          | 0.00/982M [00:00<?, ?B/s]

audio/ur/other/ur_other_1.tar:   0%|          | 0.00/953M [00:00<?, ?B/s]

audio/ur/other/ur_other_2.tar:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

audio/ur/other/ur_other_3.tar:   0%|          | 0.00/465M [00:00<?, ?B/s]

audio/ur/invalidated/ur_invalidated_0.ta(…):   0%|          | 0.00/177M [00:00<?, ?B/s]

audio/ur/validated/ur_validated_0.tar:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

audio/ur/validated/ur_validated_1.tar:   0%|          | 0.00/323M [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

transcript/ur/other.tsv:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

invalidated.tsv:   0%|          | 0.00/2.30M [00:00<?, ?B/s]

transcript/ur/validated.tsv:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5368it [00:00, 122443.45it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4057it [00:00, 98167.14it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4056it [00:00, 71524.48it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 13043it [00:00, 130413.02it/s][A
Reading metadata...: 26085it [00:00, 116598.30it/s][A
Reading metadata...: 37841it [00:00, 94765.29it/s] [A
Reading metadata...: 47698it [00:00, 86036.35it/s][A
Reading metadata...: 56546it [00:00, 82936.38it/s][A
Reading metadata...: 64967it [00:00, 80214.95it/s][A
Reading metadata...: 73051it [00:00, 79040.38it/s][A
Reading metadata...: 80986it [00:00, 77193.20it/s][A
Reading metadata...: 88717it [00:01, 75696.08it/s][A
Reading metadata...: 96288it [00:01, 75594.70it/s][A
Reading metadata...: 103848it [00:01, 75375.11it/s][A
Reading metadata...: 111772it [00:01, 76498.06it/s][A
Reading metadata...: 119533it [00:01, 76821.02it/s][A
Reading metadata...: 127219it [00:01, 76799.96it/s][A
Reading metadata...: 135861it [00:01, 80540.29it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 6818it [00:00, 104080.96it/s]


Generating validated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 7872it [00:00, 78708.07it/s][A
Reading metadata...: 16216it [00:00, 81484.23it/s][A
Reading metadata...: 24456it [00:00, 81895.97it/s][A
Reading metadata...: 32646it [00:00, 78402.42it/s][A
Reading metadata...: 40508it [00:00, 75994.21it/s][A
Reading metadata...: 53858it [00:00, 77949.19it/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 9425
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 600
    })
})


In [10]:
import os
import pandas as pd
from datasets import load_dataset
import soundfile as sf
from tqdm import tqdm

print("=" * 70)
print("Preparing Urdu Common Voice Dataset for F5-TTS Training")
print("=" * 70)

# Step 1: Load the dataset from HuggingFace
print("\n[1/4] Loading dataset from HuggingFace...")
dataset = load_dataset("mozilla-foundation/common_voice_17_0", "ur", split="train")
test_dataset = load_dataset("mozilla-foundation/common_voice_17_0", "ur", split="test")

print(f"✓ Train samples: {len(dataset)}")
print(f"✓ Test samples: {len(test_dataset)}")

# Step 2: Create output directories
print("\n[2/4] Creating directories...")
OUTPUT_DIR = "/content/f5_tts_dataset"
AUDIO_DIR = f"{OUTPUT_DIR}/audio"
os.makedirs(AUDIO_DIR, exist_ok=True)
print(f"✓ Created: {OUTPUT_DIR}")

# Step 3: Extract and save audio files + metadata
print("\n[3/4] Extracting audio files and creating metadata...")
print("This may take a few minutes...")

metadata = []
error_count = 0

# Process training data
for idx, item in enumerate(tqdm(dataset, desc="Processing train")):
    try:
        # Get audio data
        audio_data = item['audio']
        audio_array = audio_data['array']
        sample_rate = audio_data['sampling_rate']

        # Get transcription
        transcription = item['sentence']

        # Skip if transcription is empty
        if not transcription or len(transcription.strip()) == 0:
            continue

        # Create filename
        audio_filename = f"train_{idx:06d}.wav"
        audio_path = os.path.join(AUDIO_DIR, audio_filename)

        # Save audio file
        sf.write(audio_path, audio_array, sample_rate)

        # Add to metadata
        metadata.append({
            'audio_path': f"audio/{audio_filename}",
            'text': transcription,
            'duration': len(audio_array) / sample_rate
        })

    except Exception as e:
        error_count += 1
        if error_count <= 5:
            print(f"Error processing sample {idx}: {e}")

print(f"✓ Processed {len(metadata)} training samples")
print(f"✓ Errors: {error_count}")

# Step 4: Filter by duration (optional but recommended)
print("\n[4/4] Filtering by duration...")
df = pd.DataFrame(metadata)

print(f"Original samples: {len(df)}")
print(f"Duration range: {df['duration'].min():.2f}s - {df['duration'].max():.2f}s")

# Filter: keep only 1-15 second clips (recommended for TTS)
df_filtered = df[(df['duration'] >= 1.0) & (df['duration'] <= 15.0)]
print(f"After filtering (1-15s): {len(df_filtered)} samples")

# Calculate total duration
total_hours = df_filtered['duration'].sum() / 3600
print(f"Total audio duration: {total_hours:.2f} hours")

# Save metadata
metadata_path = f"{OUTPUT_DIR}/metadata.csv"
df_filtered[['audio_path', 'text']].to_csv(metadata_path, index=False)
print(f"✓ Metadata saved to: {metadata_path}")

# Step 5: Create train/validation split
print("\n[5/5] Creating train/validation split...")
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df_filtered, test_size=0.1, random_state=42)

train_metadata_path = f"{OUTPUT_DIR}/train_metadata.csv"
val_metadata_path = f"{OUTPUT_DIR}/val_metadata.csv"

train_df[['audio_path', 'text']].to_csv(train_metadata_path, index=False)
val_df[['audio_path', 'text']].to_csv(val_metadata_path, index=False)

print(f"✓ Train samples: {len(train_df)} ({train_df['duration'].sum()/3600:.2f} hours)")
print(f"✓ Val samples: {len(val_df)} ({val_df['duration'].sum()/3600:.2f} hours)")
print(f"✓ Train metadata: {train_metadata_path}")
print(f"✓ Val metadata: {val_metadata_path}")

# Display sample data
print("\n" + "=" * 70)
print("Sample Data:")
print("=" * 70)
print(train_df[['audio_path', 'text', 'duration']].head(3))

# Verify files exist
print("\n" + "=" * 70)
print("Verification:")
print("=" * 70)
sample_path = os.path.join(OUTPUT_DIR, train_df.iloc[0]['audio_path'])
if os.path.exists(sample_path):
    print(f"✓ Audio files verified: {sample_path}")
else:
    print(f"✗ Error: Sample file not found: {sample_path}")

print("\n" + "=" * 70)
print("Dataset Preparation Complete!")
print("=" * 70)
print(f"\nYour dataset is ready at: {OUTPUT_DIR}")
print(f"- Audio files: {AUDIO_DIR}")
print(f"- Train metadata: {train_metadata_path}")
print(f"- Val metadata: {val_metadata_path}")
print(f"\nTotal training data: {len(train_df)} samples, {train_df['duration'].sum()/3600:.2f} hours")
print("\n✓ You can now proceed with F5-TTS fine-tuning!")

Preparing Urdu Common Voice Dataset for F5-TTS Training

[1/4] Loading dataset from HuggingFace...


Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5368it [00:00, 66082.27it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4057it [00:00, 61865.95it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4056it [00:00, 109236.77it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 12737it [00:00, 127354.33it/s][A
Reading metadata...: 25473it [00:00, 126343.54it/s][A
Reading metadata...: 38527it [00:00, 128246.27it/s][A
Reading metadata...: 51353it [00:00, 120074.70it/s][A
Reading metadata...: 63479it [00:00, 120481.04it/s][A
Reading metadata...: 75716it [00:00, 121101.59it/s][A
Reading metadata...: 87860it [00:00, 119769.57it/s][A
Reading metadata...: 99861it [00:00, 119168.41it/s][A
Reading metadata...: 111794it [00:00, 116087.19it/s][A
Reading metadata...: 123990it [00:01, 117840.66it/s][A
Reading metadata...: 135861it [00:01, 119329.51it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 6818it [00:00, 116108.72it/s]


Generating validated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 12959it [00:00, 129567.88it/s][A
Reading metadata...: 25916it [00:00, 128829.37it/s][A
Reading metadata...: 38830it [00:00, 128952.49it/s][A
Reading metadata...: 53858it [00:00, 123129.57it/s]


✓ Train samples: 5368
✓ Test samples: 4056

[2/4] Creating directories...
✓ Created: /content/f5_tts_dataset

[3/4] Extracting audio files and creating metadata...
This may take a few minutes...


Processing train: 100%|██████████| 5368/5368 [01:27<00:00, 61.36it/s]


✓ Processed 5368 training samples
✓ Errors: 0

[4/4] Filtering by duration...
Original samples: 5368
Duration range: 0.32s - 10.48s
After filtering (1-15s): 5367 samples
Total audio duration: 6.28 hours
✓ Metadata saved to: /content/f5_tts_dataset/metadata.csv

[5/5] Creating train/validation split...
✓ Train samples: 4830 (5.64 hours)
✓ Val samples: 537 (0.64 hours)
✓ Train metadata: /content/f5_tts_dataset/train_metadata.csv
✓ Val metadata: /content/f5_tts_dataset/val_metadata.csv

Sample Data:
                  audio_path  \
1344  audio/train_001344.wav   
3085  audio/train_003085.wav   
1244  audio/train_001244.wav   

                                                   text  duration  
1344               میں آپ کو حوالات میں بند کروا دوں گا     3.240  
3085  اٹھارہویں صدی میں قائم ہونے والے کوئٹہ کو آب و...     9.108  
1244  کامن ویلتھ گیمزپاکستانی ریسلر نے کانسی کا تمغہ...     5.436  

Verification:
✓ Audio files verified: /content/f5_tts_dataset/audio/train_001344.wav

Dataset Pr

In [10]:
# Mount Google Drive if not already mounted
from google.colab import drive
drive.mount('/content/drive')

# Copy the entire dataset to Drive
print("Copying dataset to Google Drive...")
!cp -r /content/f5_tts_dataset /content/drive/MyDrive/F5-TTS-Urdu-Dataset

print("✓ Dataset saved to Google Drive!")
print("Location: /content/drive/MyDrive/F5-TTS-Urdu-Dataset")

Mounted at /content/drive
Copying dataset to Google Drive...
✓ Dataset saved to Google Drive!
Location: /content/drive/MyDrive/F5-TTS-Urdu-Dataset


In [11]:
# Create a zip file
print("Creating zip archive...")
!cd /content && zip -r f5_tts_dataset.zip f5_tts_dataset/

# Download the zip
from google.colab import files
print("Starting download...")
files.download('/content/f5_tts_dataset.zip')

print("✓ Download started! Check your browser downloads.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: f5_tts_dataset/audio/train_000475.wav (deflated 49%)
  adding: f5_tts_dataset/audio/train_001521.wav (deflated 17%)
  adding: f5_tts_dataset/audio/train_001542.wav (deflated 16%)
  adding: f5_tts_dataset/audio/train_003038.wav (deflated 23%)
  adding: f5_tts_dataset/audio/train_004291.wav (deflated 22%)
  adding: f5_tts_dataset/audio/train_003394.wav (deflated 23%)
  adding: f5_tts_dataset/audio/train_003402.wav (deflated 25%)
  adding: f5_tts_dataset/audio/train_004699.wav (deflated 25%)
  adding: f5_tts_dataset/audio/train_005081.wav (deflated 33%)
  adding: f5_tts_dataset/audio/train_001693.wav (deflated 35%)
  adding: f5_tts_dataset/audio/train_003778.wav (deflated 42%)
  adding: f5_tts_dataset/audio/train_001497.wav (deflated 23%)
  adding: f5_tts_dataset/audio/train_002903.wav (deflated 31%)
  adding: f5_tts_dataset/audio/train_004441.wav (deflated 49%)
  adding: f5_tts_dataset/audio/train_001570.wav (defl

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Download started! Check your browser downloads.


In [11]:
# Navigate to F5-TTS directory
%cd /content/F5-TTS

# List available scripts
!ls -la

# Check for training scripts
!find . -name "*train*" -type f | head -10

# Check the src structure
!ls -la src/f5_tts/ 2>/dev/null || ls -la f5_tts/ 2>/dev/null

/content/F5-TTS
total 72
drwxr-xr-x 8 root root 4096 Oct  7 21:16 .
drwxr-xr-x 1 root root 4096 Oct  7 21:23 ..
drwxr-xr-x 2 root root 4096 Oct  7 21:11 ckpts
drwxr-xr-x 3 root root 4096 Oct  7 21:11 data
-rw-r--r-- 1 root root  865 Oct  7 21:11 Dockerfile
drwxr-xr-x 8 root root 4096 Oct  7 21:11 .git
drwxr-xr-x 4 root root 4096 Oct  7 21:11 .github
-rw-r--r-- 1 root root 3202 Oct  7 21:11 .gitignore
-rw-r--r-- 1 root root  115 Oct  7 21:11 .gitmodules
drwxr-xr-x 4 root root 4096 Oct  7 21:18 hf_cache
-rw-r--r-- 1 root root 1068 Oct  7 21:11 LICENSE
-rw-r--r-- 1 root root  413 Oct  7 21:11 .pre-commit-config.yaml
-rw-r--r-- 1 root root 1565 Oct  7 21:11 pyproject.toml
-rw-r--r-- 1 root root 9529 Oct  7 21:11 README.md
-rw-r--r-- 1 root root  198 Oct  7 21:11 ruff.toml
drwxr-xr-x 5 root root 4096 Oct  7 21:11 src
./src/f5_tts/train/train.py
./src/f5_tts/model/trainer.py
./hf_cache/mozilla-foundation___common_voice_17_0/ur/17.0.0/9d10386a731ff6e6ed4ec973a4dc204a9820e8c842fbe388bdba0dd205

In [12]:
# Make sure all dependencies are installed
!pip install -e . --quiet

# Install additional requirements if needed
!pip install accelerate --quiet
!pip install wandb --quiet  # Optional: for tracking training

  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
  Building editable for f5-tts (pyproject.toml) ... [?25l[?25hdone


In [14]:
# Verify your dataset is ready
import pandas as pd
import os

TRAIN_METADATA = "/content/f5_tts_dataset/train_metadata.csv"
VAL_METADATA = "/content/f5_tts_dataset/val_metadata.csv"

# Load and check metadata
train_df = pd.read_csv(TRAIN_METADATA)
val_df = pd.read_csv(VAL_METADATA)

print(f"✓ Training samples: {len(train_df)}")
print(f"✓ Validation samples: {len(val_df)}")
print(f"\nSample data:")
print(train_df.head(2))

# Verify first audio file exists
first_audio = os.path.join("/content/f5_tts_dataset", train_df.iloc[0]['audio_path'])
print(f"\n✓ First audio file exists: {os.path.exists(first_audio)}")
print(f"  Path: {first_audio}")

✓ Training samples: 4830
✓ Validation samples: 537

Sample data:
               audio_path                                               text
0  audio/train_001344.wav               میں آپ کو حوالات میں بند کروا دوں گا
1  audio/train_003085.wav  اٹھارہویں صدی میں قائم ہونے والے کوئٹہ کو آب و...

✓ First audio file exists: True
  Path: /content/f5_tts_dataset/audio/train_001344.wav


In [15]:
%cd /content/F5-TTS

# Check README for training instructions
!cat README.md | grep -A 20 -i "train" | head -40

# Look for example scripts
!ls examples/ 2>/dev/null || echo "No examples folder"
!ls scripts/ 2>/dev/null || echo "No scripts folder"

# Check for training files
!find . -name "*.py" | grep -E "(train|finetune)" | head -10

# Check the main source structure
!ls -la src/f5_tts/ 2>/dev/null || ls -la f5_tts/

/content/F5-TTS
**F5-TTS**: Diffusion Transformer with ConvNeXt V2, faster trained and inference.

**E2 TTS**: Flat-UNet Transformer, closest reproduction from [paper](https://arxiv.org/abs/2406.18009).

**Sway Sampling**: Inference-time flow step sampling strategy, greatly improves performance

### Thanks to all the contributors !

## News
- **2025/03/12**: 🔥 F5-TTS v1 base model with better training and inference performance. [Few demo](https://swivid.github.io/F5-TTS_updates).
- **2024/10/08**: F5-TTS & E2 TTS base models on [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS), [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), [🟣 Wisemodel](https://wisemodel.cn/models/SJTU_X-LANCE/F5-TTS_Emilia-ZH-EN).

## Installation

### Create a separate environment if needed

```bash
# Create a conda env with python_version>=3.10  (you could also use virtualenv)
conda create -n f5-tts python=3.11
conda activate f5-tts
```

### Install PyTorch with matched device

<

In [11]:
# Check GPU status
!nvidia-smi

# Check available memory
!free -h

# Test PyTorch CUDA
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

Tue Oct  7 16:12:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [13]:
# This prepares your dataset in F5-TTS format
%cd /content/F5-TTS

/content/F5-TTS


In [14]:
"""
F5-TTS Fine-tuning Pipeline for Urdu Dataset
Run this after enabling GPU in Colab
"""

import os
import json

print("=" * 70)
print("F5-TTS Fine-tuning Setup for Urdu Dataset")
print("=" * 70)

# Step 1: Verify GPU
import torch
print("\n[1] GPU Check:")
print(f"   CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print("   ⚠️  WARNING: GPU not detected!")
    print("   Go to Runtime → Change runtime type → Select GPU")

# Step 2: Dataset paths
DATASET_DIR = "/content/f5_tts_dataset"
TRAIN_METADATA = f"{DATASET_DIR}/train_metadata.csv"
VAL_METADATA = f"{DATASET_DIR}/val_metadata.csv"
CHECKPOINT_DIR = "/content/checkpoints"
EXPERIMENT_NAME = "urdu_tts_finetune"

# Create directories
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(f"{CHECKPOINT_DIR}/logs", exist_ok=True)

print(f"\n[2] Dataset Info:")
print(f"   Dataset dir: {DATASET_DIR}")
print(f"   Train metadata: {TRAIN_METADATA}")
print(f"   Validation metadata: {VAL_METADATA}")
print(f"   Checkpoint dir: {CHECKPOINT_DIR}")

# Step 3: Prepare dataset in F5-TTS format
print(f"\n[3] Preparing dataset for F5-TTS...")

# F5-TTS expects a specific format, let's create it
import pandas as pd
import shutil

# Read our metadata
train_df = pd.read_csv(TRAIN_METADATA)
val_df = pd.read_csv(VAL_METADATA)

# Create F5-TTS compatible structure
f5_data_dir = "/content/f5_tts_formatted"
os.makedirs(f"{f5_data_dir}/wavs", exist_ok=True)

# Copy audio files and create metadata.list
with open(f"{f5_data_dir}/metadata.list", "w", encoding="utf-8") as f:
    for idx, row in train_df.iterrows():
        audio_src = os.path.join(DATASET_DIR, row['audio_path'])
        audio_filename = os.path.basename(row['audio_path'])
        audio_dst = f"{f5_data_dir}/wavs/{audio_filename}"

        # Create symlink instead of copying (saves space)
        if not os.path.exists(audio_dst):
            os.symlink(audio_src, audio_dst)

        # Write metadata in F5-TTS format: audio_path|text
        f.write(f"wavs/{audio_filename}|{row['text']}\n")

print(f"   ✓ Created F5-TTS formatted dataset at: {f5_data_dir}")
print(f"   ✓ Total training samples: {len(train_df)}")

# Step 4: Training configuration
print(f"\n[4] Training Configuration:")
config = {
    "exp_name": EXPERIMENT_NAME,
    "learning_rate": 7.5e-5,  # F5-TTS recommended
    "batch_size_per_gpu": 4,  # Reduce to 2 if OOM
    "batch_size_type": "frame",
    "max_samples": 64,
    "grad_accumulation_steps": 1,
    "max_grad_norm": 1.0,
    "epochs": 20,
    "num_warmup_updates": 2000,
    "save_per_updates": 1000,
    "log_per_updates": 10,
    "mixed_precision": "bf16",  # or "fp16"
    "dataset_name": "urdu_cv",
    "tokenizer": "pinyin",  # We'll use char tokenizer for Urdu
}

for key, value in config.items():
    print(f"   {key:25s}: {value}")

# Save config
config_path = f"{CHECKPOINT_DIR}/training_config.json"
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"\n[5] ✓ Setup Complete!")
print(f"   Config saved: {config_path}")

# Step 5: Print training commands
print("\n" + "=" * 70)
print("TRAINING COMMANDS")
print("=" * 70)

print("""
# ============================================================
# METHOD 1: Using finetune_cli.py (RECOMMENDED)
# ============================================================
_


# ============================================================
# If you get OOM (Out of Memory) errors:
# ============================================================
# Reduce batch_size_per_gpu to 2 or 1
# Or reduce max_samples to 32
""")

print("\n" + "=" * 70)
print("MONITORING TRAINING")
print("=" * 70)
print("""
# View training logs in real-time
!tail -f /content/checkpoints/logs/train.log

# Or check checkpoint directory
!ls -lh /content/checkpoints/

# Monitor GPU usage
!watch -n 1 nvidia-smi
""")

print("\n" + "=" * 70)
print("NEXT STEPS:")
print("=" * 70)
print("1. ⚠️  ENABLE GPU: Runtime → Change runtime type → GPU")
print("2. Run METHOD 1 command above to start training")
print("3. Training will take several hours depending on GPU")
print("4. Checkpoints saved to: /content/checkpoints/")
print("5. Monitor progress with: !tail -f /content/checkpoints/logs/train.log")
print("=" * 70)

F5-TTS Fine-tuning Setup for Urdu Dataset

[1] GPU Check:
   CUDA available: True
   GPU: Tesla T4
   GPU Memory: 14.74 GB

[2] Dataset Info:
   Dataset dir: /content/f5_tts_dataset
   Train metadata: /content/f5_tts_dataset/train_metadata.csv
   Validation metadata: /content/f5_tts_dataset/val_metadata.csv
   Checkpoint dir: /content/checkpoints

[3] Preparing dataset for F5-TTS...
   ✓ Created F5-TTS formatted dataset at: /content/f5_tts_formatted
   ✓ Total training samples: 4830

[4] Training Configuration:
   exp_name                 : urdu_tts_finetune
   learning_rate            : 7.5e-05
   batch_size_per_gpu       : 4
   batch_size_type          : frame
   max_samples              : 64
   grad_accumulation_steps  : 1
   max_grad_norm            : 1.0
   epochs                   : 20
   num_warmup_updates       : 2000
   save_per_updates         : 1000
   log_per_updates          : 10
   mixed_precision          : bf16
   dataset_name             : urdu_cv
   tokenizer         

In [15]:
%cd /content/F5-TTS
!python src/f5_tts/train/finetune_cli.py --help

/content/F5-TTS
  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
usage: finetune_cli.py [-h] [--exp_name {F5TTS_v1_Base,F5TTS_Base,E2TTS_Base}]
                       [--dataset_name DATASET_NAME]
                       [--learning_rate LEARNING_RATE]
                       [--batch_size_per_gpu BATCH_SIZE_PER_GPU]
                       [--batch_size_type {frame,sample}]
                       [--max_samples MAX_SAMPLES]
                       [--grad_accumulation_steps GRAD_ACCUMULATION_STEPS]
                       [--max_grad_norm MAX_GRAD_NORM] [--epochs EPOCHS]
                       [--num_warmup_updates NUM_WARMUP_UPDATES]
                       [--save_per_updates SAVE_PER_UPDATES]
                       [--keep_last_n_checkpoints KEEP_LAST_N_CHECKPOINTS]
                       [--last_per_updates LAST_PER_UPDATES] [--finetune]
                     

In [16]:
# Check if F5-TTS expects the data in a specific location/format
!ls -la src/f5_tts/train/datasets/
!cat src/f5_tts/train/finetune_cli.py | grep -A 10 "data"

total 48
drwxr-xr-x 2 root root  4096 Oct  7 21:11 .
drwxr-xr-x 3 root root  4096 Oct  7 21:11 ..
-rw-r--r-- 1 root root 10526 Oct  7 21:11 prepare_csv_wavs.py
-rw-r--r-- 1 root root  7430 Oct  7 21:11 prepare_emilia.py
-rw-r--r-- 1 root root  3245 Oct  7 21:11 prepare_emilia_v2.py
-rw-r--r-- 1 root root  3181 Oct  7 21:11 prepare_libritts.py
-rw-r--r-- 1 root root  2276 Oct  7 21:11 prepare_ljspeech.py
-rw-r--r-- 1 root root  4580 Oct  7 21:11 prepare_wenetspeech4tts.py
from f5_tts.model.dataset import load_dataset
from f5_tts.model.utils import get_tokenizer


# -------------------------- Dataset Settings --------------------------- #
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
--
    parser.add_argument("--dataset_name", type=str, default="Emilia_ZH_EN", help="Name of the dataset to use")
    parser.add_argument("--learning_rate", type=float, default=1e-5, help="Learning rate for trai

In [17]:
# Check the training script to understand dataset loading
!grep -n "dataset" src/f5_tts/train/finetune_cli.py | head -20

9:from f5_tts.model.dataset import load_dataset
33:    parser.add_argument("--dataset_name", type=str, default="Emilia_ZH_EN", help="Name of the dataset to use")
84:    checkpoint_path = str(files("f5_tts").joinpath(f"../../ckpts/{args.dataset_name}"))
161:        tokenizer_path = args.dataset_name
197:        wandb_project=args.dataset_name,
205:    train_dataset = load_dataset(args.dataset_name, tokenizer, mel_spec_kwargs=mel_spec_kwargs)
208:        train_dataset,
209:        resumable_with_seed=666,  # seed for shuffling dataset


In [19]:
"""
Prepare Urdu dataset in F5-TTS expected format
"""

import os
import pandas as pd
import shutil
from pathlib import Path

print("=" * 70)
print("Preparing Urdu Dataset for F5-TTS Fine-tuning")
print("=" * 70)

# Source dataset
DATASET_DIR = "/content/f5_tts_dataset"
TRAIN_METADATA = f"{DATASET_DIR}/train_metadata.csv"
VAL_METADATA = f"{DATASET_DIR}/val_metadata.csv"

# F5-TTS expects data in a specific structure
# Let's create it under the F5-TTS data directory
F5_DATA_DIR = "/content/F5-TTS/data/urdu_cv"
os.makedirs(F5_DATA_DIR, exist_ok=True)
os.makedirs(f"{F5_DATA_DIR}/wavs", exist_ok=True)

print(f"\n[1] Loading metadata...")
train_df = pd.read_csv(TRAIN_METADATA)
val_df = pd.read_csv(VAL_METADATA)
print(f"   Train samples: {len(train_df)}")
print(f"   Val samples: {len(val_df)}")

# F5-TTS typically expects a metadata.list file with format:
# audio_path|transcription|speaker_id (optional)
print(f"\n[2] Creating metadata.list for training...")

train_count = 0
val_count = 0

# Create training metadata
with open(f"{F5_DATA_DIR}/metadata.list", "w", encoding="utf-8") as f:
    for idx, row in train_df.iterrows():
        # Source audio path
        audio_src = os.path.join(DATASET_DIR, row['audio_path'])

        if not os.path.exists(audio_src):
            continue

        # Target audio filename
        audio_filename = f"train_{idx:06d}.wav"
        audio_dst = f"{F5_DATA_DIR}/wavs/{audio_filename}"

        # Create symlink (saves disk space) or copy
        if not os.path.exists(audio_dst):
            try:
                os.symlink(audio_src, audio_dst)
            except:
                shutil.copy(audio_src, audio_dst)

        # Write metadata: audio_path|text
        text = row['text'].strip()
        f.write(f"wavs/{audio_filename}|{text}\n")
        train_count += 1

        if train_count % 500 == 0:
            print(f"   Processed {train_count}/{len(train_df)}...")

print(f"   ✓ Created training metadata: {train_count} samples")

# Create validation metadata (optional)
with open(f"{F5_DATA_DIR}/metadata_val.list", "w", encoding="utf-8") as f:
    for idx, row in val_df.iterrows():
        audio_src = os.path.join(DATASET_DIR, row['audio_path'])

        if not os.path.exists(audio_src):
            continue

        audio_filename = f"val_{idx:06d}.wav"
        audio_dst = f"{F5_DATA_DIR}/wavs/{audio_filename}"

        if not os.path.exists(audio_dst):
            try:
                os.symlink(audio_src, audio_dst)
            except:
                shutil.copy(audio_src, audio_dst)

        text = row['text'].strip()
        f.write(f"wavs/{audio_filename}|{text}\n")
        val_count += 1

print(f"   ✓ Created validation metadata: {val_count} samples")

# Verify the structure
print(f"\n[3] Verifying dataset structure...")
print(f"   Dataset directory: {F5_DATA_DIR}")
print(f"   Metadata file: {F5_DATA_DIR}/metadata.list")
print(f"   Audio directory: {F5_DATA_DIR}/wavs/")

# Check audio files
audio_files = list(Path(f"{F5_DATA_DIR}/wavs").glob("*.wav"))
print(f"   Total audio files: {len(audio_files)}")

# Display sample metadata
print(f"\n[4] Sample metadata (first 3 lines):")
with open(f"{F5_DATA_DIR}/metadata.list", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i < 3:
            print(f"   {line.strip()}")

print("\n" + "=" * 70)
print("✓ Dataset preparation complete!")
print("=" * 70)
print(f"\nYour dataset is ready at: {F5_DATA_DIR}")
print(f"You can now start fine-tuning with the dataset_name: urdu_cv")

# Summary
print("\n" + "=" * 70)
print("DATASET SUMMARY")
print("=" * 70)
print(f"Training samples: {train_count}")
print(f"Validation samples: {val_count}")
# Calculate duration if available
if 'duration' in train_df.columns:
    print(f"Total duration: ~{train_df['duration'].sum()/3600:.2f} hours (estimated)")
print(f"Dataset location: {F5_DATA_DIR}")
print("=" * 70)

Preparing Urdu Dataset for F5-TTS Fine-tuning

[1] Loading metadata...
   Train samples: 4830
   Val samples: 537

[2] Creating metadata.list for training...
   Processed 500/4830...
   Processed 1000/4830...
   Processed 1500/4830...
   Processed 2000/4830...
   Processed 2500/4830...
   Processed 3000/4830...
   Processed 3500/4830...
   Processed 4000/4830...
   Processed 4500/4830...
   ✓ Created training metadata: 4830 samples
   ✓ Created validation metadata: 537 samples

[3] Verifying dataset structure...
   Dataset directory: /content/F5-TTS/data/urdu_cv
   Metadata file: /content/F5-TTS/data/urdu_cv/metadata.list
   Audio directory: /content/F5-TTS/data/urdu_cv/wavs/
   Total audio files: 5367

[4] Sample metadata (first 3 lines):
   wavs/train_000000.wav|میں آپ کو حوالات میں بند کروا دوں گا
   wavs/train_000001.wav|اٹھارہویں صدی میں قائم ہونے والے کوئٹہ کو آب و ہوا ماحول اور خوبصورتی کی وجہ سے انگریز لٹل لندن کہتے تھے
   wavs/train_000002.wav|کامن ویلتھ گیمزپاکستانی ریسلر نے 

In [28]:
metadata_file = "/content/F5-TTS/data/urdu_cv/metadata.list"
vocab_output_path = "/content/F5-TTS/data/urdu_cv_char/vocab.txt"

unique_chars = set()
with open(metadata_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("|")
        if len(parts) == 2:
            text = parts[1]
            unique_chars.update(text)

# Remove space if accidentally in set, we will add it explicitly as first line
unique_chars.discard(' ')

# Sort remaining characters
unique_chars = sorted(unique_chars)

import os
os.makedirs(os.path.dirname(vocab_output_path), exist_ok=True)

with open(vocab_output_path, "w", encoding="utf-8") as f:
    # Write space as index 0
    f.write(" \n")
    # Write all other chars
    for ch in unique_chars:
        f.write(ch + "\n")

print(f"Vocabulary file created with space as the first token at: {vocab_output_path}")

Vocabulary file created with space as the first token at: /content/F5-TTS/data/urdu_cv_char/vocab.txt


In [34]:
from datasets.arrow_writer import ArrowWriter
from tqdm import tqdm
from pathlib import Path
import json
import os

metadata_path = "/content/F5-TTS/data/urdu_cv/metadata.list"
audio_base_dir = "/content/F5-TTS/data/urdu_cv/wavs"
output_dir = "/content/F5-TTS/data/urdu_cv_char"

def read_metadata(metadata_file):
    audio_text_pairs = []
    with open(metadata_file, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("|")
            if len(parts) == 2:
                audio_path = os.path.join(audio_base_dir, parts[0])
                text = parts[1]
                if os.path.isfile(audio_path):
                    audio_text_pairs.append({"audio_path": audio_path, "text": text})
    return audio_text_pairs

def save_prepared_dataset(dataset_entries, output_folder):
    out_dir = Path(output_folder)
    out_dir.mkdir(parents=True, exist_ok=True)
    raw_arrow_path = out_dir / "raw.arrow"
    duration_list = []

    # Writing raw.arrow with ArrowWriter
    with ArrowWriter(path=str(raw_arrow_path), writer_batch_size=1) as writer:
        for entry in tqdm(dataset_entries, desc="Writing raw.arrow"):
            writer.write(entry)
            # Duration is optional, skipping for now

    # Save vocab.txt, add space as index 0
    vocab_set = set()
    for entry in dataset_entries:
        vocab_set.update(set(entry["text"]))
    if ' ' in vocab_set:
        vocab_set.remove(' ')
    vocab = [' '] + sorted(vocab_set)

    vocab_path = out_dir / "vocab.txt"
    with open(vocab_path, "w", encoding="utf-8") as f:
        for token in vocab:
            f.write(token + "\n")

    print(f"Saved raw.arrow and vocab.txt to {output_folder}")

# Run preparation
metadata_entries = read_metadata(metadata_path)
save_prepared_dataset(metadata_entries, output_dir)


Writing raw.arrow: 0it [00:00, ?it/s]

Saved raw.arrow and vocab.txt to /content/F5-TTS/data/urdu_cv_char





In [20]:
# Run the dataset preparation
%cd /content

# Execute the preparation script (from the artifact above)
# Copy and paste the artifact code, or run it directly

/content


In [34]:
from google.colab import drive
drive.mount('/content/drive')

# Create backup directory in Drive
import os
os.makedirs('/content/drive/MyDrive/F5-TTS-Checkpoints', exist_ok=True)

# Make checkpoints save directly to Drive
!rm -rf /content/F5-TTS/ckpts
!ln -s /content/drive/MyDrive/F5-TTS-Checkpoints /content/F5-TTS/ckpts

print("✓ Checkpoints will now save directly to Google Drive!")
print("Location: /content/drive/MyDrive/F5-TTS-Checkpoints")

Mounted at /content/drive
✓ Checkpoints will now save directly to Google Drive!
Location: /content/drive/MyDrive/F5-TTS-Checkpoints


In [27]:
# @title
# Quick fix - create symlink with _char suffix
import os
os.makedirs("/content/F5-TTS/data/urdu_cv_char", exist_ok=True)
!ln -sf /content/F5-TTS/data/urdu_cv/raw.arrow /content/F5-TTS/data/urdu_cv_char/raw.arrow
!ln -sf /content/F5-TTS/data/urdu_cv/vocab.txt /content/F5-TTS/data/urdu_cv_char/vocab.txt
!ln -sf /content/F5-TTS/data/urdu_cv/duration.json /content/F5-TTS/data/urdu_cv_char/duration.json
print("✓ Fixed! Now start training.")

✓ Fixed! Now start training.


In [30]:
# @title
# Quick patch for the trainer bug
import os

trainer_file = "/content/F5-TTS/src/f5_tts/model/trainer.py"

# Read the file
with open(trainer_file, 'r') as f:
    content = f.read()

# Add a check to prevent division by zero
original_line = "skipped_epoch = int(start_step // orig_epoch_step)"
patched_line = "skipped_epoch = int(start_step // orig_epoch_step) if orig_epoch_step > 0 else 0"

if original_line in content:
    content = content.replace(original_line, patched_line)
    with open(trainer_file, 'w') as f:
        f.write(content)
    print("✓ Trainer patched! Now try training again.")
else:
    print("Line not found or already patched.")

✓ Trainer patched! Now try training again.


In [None]:
# First, clear GPU memory
import torch
torch.cuda.empty_cache()

# Then restart training with smaller batch size
%cd /content/F5-TTS

!python src/f5_tts/train/finetune_cli.py \
    --exp_name F5TTS_Base \
    --dataset_name urdu_cv \
    --learning_rate 7.5e-5 \
    --batch_size_per_gpu 4 \
    --batch_size_type sample \
    --grad_accumulation_steps 4 \
    --max_grad_norm 1.0 \
    --epochs 20 \
    --num_warmup_updates 1000 \
    --save_per_updates 500 \
    --last_per_updates 250 \
    --finetune \
    --tokenizer char \
    --log_samples \
    --logger tensorboard

/content/F5-TTS
copy checkpoint for finetune

vocab :  2545

vocoder :  vocos
2025-10-07 22:09:04.135205: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759874944.169886   25178 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759874944.182022   25178 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759874944.207874   25178 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1759874944.207910   25178 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:

In [29]:
# @title
%cd /content/F5-TTS

!python src/f5_tts/train/finetune_cli.py \
    --exp_name F5TTS_Base \
    --dataset_name urdu_cv \
    --learning_rate 7.5e-5 \
    --batch_size_per_gpu 2 \
    --batch_size_type frame \
    --max_samples 32 \
    --grad_accumulation_steps 2 \
    --max_grad_norm 1.0 \
    --epochs 20 \
    --num_warmup_updates 1000 \
    --save_per_updates 500 \
    --last_per_updates 250 \
    --finetune \
    --tokenizer char \
    --log_samples \
    --logger tensorboard

/content/F5-TTS

vocab :  2545

vocoder :  vocos
2025-10-07 21:59:05.169925: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759874345.189798   22334 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759874345.195946   22334 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759874345.212065   22334 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1759874345.212094   22334 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1759874345.212099   22334 

In [20]:
# @title
"""
Complete F5-TTS Dataset Preparation
Creates all required files: vocab.txt, preprocessed data, etc.
"""

import os
import json
import pandas as pd
from pathlib import Path
from collections import Counter
import librosa
import soundfile as sf
from tqdm import tqdm

print("=" * 70)
print("Complete F5-TTS Dataset Preparation for Urdu")
print("=" * 70)

# ============================================
# Configuration
# ============================================
DATASET_DIR = "/content/f5_tts_dataset"
TRAIN_METADATA = f"{DATASET_DIR}/train_metadata.csv"
VAL_METADATA = f"{DATASET_DIR}/val_metadata.csv"

F5_DATA_DIR = "/content/F5-TTS/data/urdu_cv"
os.makedirs(F5_DATA_DIR, exist_ok=True)
os.makedirs(f"{F5_DATA_DIR}/wavs", exist_ok=True)

print(f"\n[1/5] Loading metadata...")
train_df = pd.read_csv(TRAIN_METADATA)
val_df = pd.read_csv(VAL_METADATA)
print(f"   Train samples: {len(train_df)}")
print(f"   Val samples: {len(val_df)}")

# ============================================
# Step 1: Create vocab.txt (Character vocabulary)
# ============================================
print(f"\n[2/5] Creating vocab.txt...")

# Collect all unique characters from transcriptions
all_chars = set()
for text in train_df['text']:
    all_chars.update(text)
for text in val_df['text']:
    all_chars.update(text)

# Sort characters
vocab = sorted(list(all_chars))

# Add special tokens
special_tokens = ['<pad>', '<unk>', '<bos>', '<eos>']
vocab = special_tokens + vocab

# Save vocab.txt
vocab_path = f"{F5_DATA_DIR}/vocab.txt"
with open(vocab_path, 'w', encoding='utf-8') as f:
    for char in vocab:
        f.write(f"{char}\n")

print(f"   ✓ Created vocab.txt with {len(vocab)} characters")
print(f"   Path: {vocab_path}")
print(f"   Sample vocab: {vocab[:20]}")

# ============================================
# Step 2: Create metadata.list with audio info
# ============================================
print(f"\n[3/5] Creating metadata.list with audio information...")

def get_audio_info(audio_path):
    """Get audio duration and sample rate"""
    try:
        info = sf.info(audio_path)
        return info.duration, info.samplerate
    except:
        return 0, 0

metadata_list = []
durations = []

print("   Processing training data...")
for idx, row in tqdm(train_df.iterrows(), total=len(train_df), desc="   Train"):
    audio_src = os.path.join(DATASET_DIR, row['audio_path'])

    if not os.path.exists(audio_src):
        continue

    # Get audio info
    duration, sr = get_audio_info(audio_src)
    if duration == 0:
        continue

    durations.append(duration)

    # Create filename
    audio_filename = f"train_{idx:06d}.wav"
    audio_dst = f"{F5_DATA_DIR}/wavs/{audio_filename}"

    # Create symlink or copy
    if not os.path.exists(audio_dst):
        try:
            os.symlink(audio_src, audio_dst)
        except:
            import shutil
            shutil.copy(audio_src, audio_dst)

    # Format: audio_path|text|speaker|duration|sample_rate
    text = row['text'].strip()
    metadata_list.append(f"wavs/{audio_filename}|{text}|urdu_speaker|{duration:.2f}|{sr}")

# Save metadata
metadata_path = f"{F5_DATA_DIR}/metadata.list"
with open(metadata_path, 'w', encoding='utf-8') as f:
    for line in metadata_list:
        f.write(line + '\n')

print(f"   ✓ Created metadata.list with {len(metadata_list)} samples")
print(f"   Total duration: {sum(durations)/3600:.2f} hours")

# ============================================
# Step 3: Create dataset info JSON
# ============================================
print(f"\n[4/5] Creating dataset_info.json...")

dataset_info = {
    "name": "urdu_cv",
    "description": "Urdu Common Voice dataset for F5-TTS",
    "version": "1.0",
    "num_samples": len(metadata_list),
    "total_duration_hours": sum(durations) / 3600,
    "sample_rate": 24000,  # Target sample rate
    "vocab_size": len(vocab),
    "language": "urdu",
    "speakers": ["urdu_speaker"],
    "train_samples": len(train_df),
    "val_samples": len(val_df)
}

info_path = f"{F5_DATA_DIR}/dataset_info.json"
with open(info_path, 'w', encoding='utf-8') as f:
    json.dump(dataset_info, f, indent=2, ensure_ascii=False)

print(f"   ✓ Created dataset_info.json")
print(f"   Dataset info: {dataset_info}")

# ============================================
# Step 4: Create character-to-index mapping
# ============================================
print(f"\n[5/5] Creating char2idx mapping...")

char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for char, idx in char2idx.items()}

# Save mappings
char2idx_path = f"{F5_DATA_DIR}/char2idx.json"
idx2char_path = f"{F5_DATA_DIR}/idx2char.json"

with open(char2idx_path, 'w', encoding='utf-8') as f:
    json.dump(char2idx, f, indent=2, ensure_ascii=False)

with open(idx2char_path, 'w', encoding='utf-8') as f:
    json.dump(idx2char, f, indent=2, ensure_ascii=False)

print(f"   ✓ Created character mappings")

# ============================================
# Verification
# ============================================
print("\n" + "=" * 70)
print("VERIFICATION")
print("=" * 70)

required_files = [
    'vocab.txt',
    'metadata.list',
    'dataset_info.json',
    'char2idx.json',
    'idx2char.json'
]

print(f"\nDataset directory: {F5_DATA_DIR}")
print(f"\nRequired files:")
for filename in required_files:
    filepath = os.path.join(F5_DATA_DIR, filename)
    exists = "✓" if os.path.exists(filepath) else "✗"
    size = os.path.getsize(filepath) if os.path.exists(filepath) else 0
    print(f"  {exists} {filename:20s} ({size:>10,} bytes)")

# Check audio files
audio_files = list(Path(f"{F5_DATA_DIR}/wavs").glob("*.wav"))
print(f"\n  ✓ Audio files: {len(audio_files)}")

# Sample metadata
print(f"\n[Sample metadata.list (first 3 lines)]:")
with open(metadata_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < 3:
            print(f"  {line.strip()}")

# Sample vocab
print(f"\n[Sample vocab.txt (first 20 chars)]:")
with open(vocab_path, 'r', encoding='utf-8') as f:
    chars = [line.strip() for line in f]
    print(f"  {chars[:20]}")

print("\n" + "=" * 70)
print("✓ DATASET PREPARATION COMPLETE!")
print("=" * 70)
print(f"\nYour dataset is ready at: {F5_DATA_DIR}")
print(f"Total samples: {len(metadata_list)}")
print(f"Vocabulary size: {len(vocab)} characters")
print(f"Total duration: {sum(durations)/3600:.2f} hours")
print("\n✓ You can now start training!")
print("=" * 70)

# Print training command
print("\n" + "=" * 70)
print("START TRAINING WITH THIS COMMAND:")

Complete F5-TTS Dataset Preparation for Urdu

[1/5] Loading metadata...
   Train samples: 4830
   Val samples: 537

[2/5] Creating vocab.txt...
   ✓ Created vocab.txt with 96 characters
   Path: /content/F5-TTS/data/urdu_cv/vocab.txt
   Sample vocab: ['<pad>', '<unk>', '<bos>', '<eos>', ' ', '!', '"', "'", ',', '-', '.', ':', '`', '،', 'ؑ', 'ؓ', 'ؔ', '؛', '؟', 'ء']

[3/5] Creating metadata.list with audio information...
   Processing training data...


   Train: 100%|██████████| 4830/4830 [00:01<00:00, 3398.30it/s]

   ✓ Created metadata.list with 4830 samples
   Total duration: 5.64 hours

[4/5] Creating dataset_info.json...
   ✓ Created dataset_info.json
   Dataset info: {'name': 'urdu_cv', 'description': 'Urdu Common Voice dataset for F5-TTS', 'version': '1.0', 'num_samples': 4830, 'total_duration_hours': 5.64275, 'sample_rate': 24000, 'vocab_size': 96, 'language': 'urdu', 'speakers': ['urdu_speaker'], 'train_samples': 4830, 'val_samples': 537}

[5/5] Creating char2idx mapping...
   ✓ Created character mappings

VERIFICATION

Dataset directory: /content/F5-TTS/data/urdu_cv

Required files:
  ✓ vocab.txt            (       308 bytes)
  ✓ metadata.list        (   551,563 bytes)
  ✓ dataset_info.json    (       311 bytes)
  ✓ char2idx.json        (     1,165 bytes)
  ✓ idx2char.json        (     1,357 bytes)

  ✓ Audio files: 5367

[Sample metadata.list (first 3 lines)]:
  wavs/train_000000.wav|میں آپ کو حوالات میں بند کروا دوں گا|urdu_speaker|3.24|48000
  wavs/train_000001.wav|اٹھارہویں صدی میں ق




In [25]:
# @title
"""
Prepare Urdu dataset using F5-TTS's official prepare_csv_wavs.py script
This creates all required files: raw.arrow, vocab.txt, duration.json
"""

import os
import pandas as pd
from pathlib import Path
import shutil

print("=" * 70)
print("Preparing Dataset for F5-TTS (Official Method)")
print("=" * 70)

# ============================================
# Step 1: Create CSV in F5-TTS Expected Format
# ============================================
print("\n[1/4] Creating metadata.csv in F5-TTS format...")

# Source dataset
DATASET_DIR = "/content/f5_tts_dataset"
TRAIN_METADATA = f"{DATASET_DIR}/train_metadata.csv"

# Read our metadata
train_df = pd.read_csv(TRAIN_METADATA)
print(f"   Loaded {len(train_df)} training samples")

# Create F5-TTS compatible directory structure
F5_INPUT_DIR = "/content/f5_input_dataset"
os.makedirs(F5_INPUT_DIR, exist_ok=True)
os.makedirs(f"{F5_INPUT_DIR}/wavs", exist_ok=True)

# F5-TTS expects metadata.csv with format: audio_path|text
# First line should be a header (but it gets skipped)
metadata_lines = ["audio_path|text\n"]  # Header

print("   Creating symlinks for audio files...")
for idx, row in train_df.iterrows():
    audio_src = os.path.join(DATASET_DIR, row['audio_path'])

    if not os.path.exists(audio_src):
        continue

    # Create audio filename
    audio_filename = f"{idx:06d}.wav"
    audio_dst = f"{F5_INPUT_DIR}/wavs/{audio_filename}"

    # Create symlink (saves space)
    if not os.path.exists(audio_dst):
        try:
            os.symlink(audio_src, audio_dst)
        except:
            shutil.copy(audio_src, audio_dst)

    # Add to metadata: relative_path|text
    text = row['text'].strip()
    metadata_lines.append(f"wavs/{audio_filename}|{text}\n")

    if (idx + 1) % 500 == 0:
        print(f"   Processed {idx + 1}/{len(train_df)}...")

# Save metadata.csv
metadata_csv_path = f"{F5_INPUT_DIR}/metadata.csv"
with open(metadata_csv_path, 'w', encoding='utf-8') as f:
    f.writelines(metadata_lines)

print(f"   ✓ Created metadata.csv with {len(metadata_lines)-1} samples")
print(f"   ✓ Input directory: {F5_INPUT_DIR}")

# ============================================
# Step 2: Verify Input Format
# ============================================
print("\n[2/4] Verifying input format...")

# Check structure
has_metadata = os.path.exists(f"{F5_INPUT_DIR}/metadata.csv")
has_wavs_dir = os.path.exists(f"{F5_INPUT_DIR}/wavs") and os.path.isdir(f"{F5_INPUT_DIR}/wavs")
audio_count = len(list(Path(f"{F5_INPUT_DIR}/wavs").glob("*.wav")))

print(f"   metadata.csv exists: {'✓' if has_metadata else '✗'}")
print(f"   wavs/ directory exists: {'✓' if has_wavs_dir else '✗'}")
print(f"   Audio files found: {audio_count}")

if has_metadata and has_wavs_dir and audio_count > 0:
    print("   ✓ Input format is correct!")
else:
    print("   ✗ Input format verification failed!")
    exit(1)

# Sample metadata
print(f"\n   Sample metadata.csv (first 3 lines):")
with open(metadata_csv_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < 3:
            print(f"     {line.strip()}")

# ============================================
# Step 3: Run Official prepare_csv_wavs.py
# ============================================
print("\n[3/4] Running F5-TTS's prepare_csv_wavs.py script...")
print("   This will create: raw.arrow, vocab.txt, duration.json")
print("   This may take several minutes...")

# Output directory for prepared dataset
F5_OUTPUT_DIR = "/content/F5-TTS/data/urdu_cv"

# Install required package if not present
print("\n   Installing dependencies...")
os.system("pip install pyarrow -q")

# Run the official preparation script
print("\n   Processing dataset...")
%cd /content/F5-TTS

# Use --workers to speed up processing (adjust based on CPU)
!python src/f5_tts/train/datasets/prepare_csv_wavs.py \
    {F5_INPUT_DIR} \
    {F5_OUTPUT_DIR} \
    --workers 4

# ============================================
# Step 4: Verify Output
# ============================================
print("\n[4/4] Verifying prepared dataset...")

required_files = {
    'raw.arrow': 'Preprocessed dataset',
    'vocab.txt': 'Character vocabulary',
    'duration.json': 'Audio durations'
}

print(f"\nOutput directory: {F5_OUTPUT_DIR}")
print(f"\nRequired files:")

all_present = True
for filename, description in required_files.items():
    filepath = os.path.join(F5_OUTPUT_DIR, filename)
    exists = os.path.exists(filepath)
    size = os.path.getsize(filepath) if exists else 0
    status = "✓" if exists else "✗"
    print(f"  {status} {filename:20s} - {description:30s} ({size:>10,} bytes)")
    if not exists:
        all_present = False

if all_present:
    print("\n" + "=" * 70)
    print("✓ DATASET PREPARATION COMPLETE!")
    print("=" * 70)

    # Show dataset info
    import json
    with open(f"{F5_OUTPUT_DIR}/duration.json", 'r') as f:
        duration_data = json.load(f)
        total_hours = sum(duration_data['duration']) / 3600

    with open(f"{F5_OUTPUT_DIR}/vocab.txt", 'r', encoding='utf-8') as f:
        vocab_size = len(f.readlines())

    print(f"\nDataset Statistics:")
    print(f"  Total samples: {len(duration_data['duration'])}")
    print(f"  Total duration: {total_hours:.2f} hours")
    print(f"  Vocabulary size: {vocab_size} characters")
    print(f"  Dataset location: {F5_OUTPUT_DIR}")

    print("\n" + "=" * 70)
    print("START TRAINING WITH THIS COMMAND:")
    print("=" * 70)
    print("""
%cd /content/F5-TTS

!python src/f5_tts/train/finetune_cli.py \\
    --exp_name F5TTS_Base \\
    --dataset_name urdu_cv \\
    --learning_rate 7.5e-5 \\
    --batch_size_per_gpu 4 \\
    --batch_size_type frame \\
    --max_samples 64 \\
    --grad_accumulation_steps 1 \\
    --max_grad_norm 1.0 \\
    --epochs 50 \\
    --num_warmup_updates 2000 \\
    --save_per_updates 1000 \\
    --last_per_updates 500 \\
    --finetune \\
    --tokenizer char \\
    --log_samples \\
    --logger tensorboard
    """)
    print("=" * 70)

else:
    print("\n✗ Some required files are missing!")
    print("Check the error messages above.")

Preparing Dataset for F5-TTS (Official Method)

[1/4] Creating metadata.csv in F5-TTS format...
   Loaded 4830 training samples
   Creating symlinks for audio files...
   Processed 500/4830...
   Processed 1000/4830...
   Processed 1500/4830...
   Processed 2000/4830...
   Processed 2500/4830...
   Processed 3000/4830...
   Processed 3500/4830...
   Processed 4000/4830...
   Processed 4500/4830...
   ✓ Created metadata.csv with 4830 samples
   ✓ Input directory: /content/f5_input_dataset

[2/4] Verifying input format...
   metadata.csv exists: ✓
   wavs/ directory exists: ✓
   Audio files found: 4830
   ✓ Input format is correct!

   Sample metadata.csv (first 3 lines):
     audio_path|text
     wavs/000000.wav|میں آپ کو حوالات میں بند کروا دوں گا
     wavs/000001.wav|اٹھارہویں صدی میں قائم ہونے والے کوئٹہ کو آب و ہوا ماحول اور خوبصورتی کی وجہ سے انگریز لٹل لندن کہتے تھے

[3/4] Running F5-TTS's prepare_csv_wavs.py script...
   This will create: raw.arrow, vocab.txt, duration.json
   Th

In [21]:
# @title
# Let's check what the training script actually loads
%cd /content/F5-TTS
!grep -r "arrow" src/f5_tts/train/ --include="*.py"
!grep -r "load_dataset" src/f5_tts/train/ --include="*.py"
!grep -r "Dataset" src/f5_tts/train/finetune_cli.py | head -20

/content/F5-TTS
src/f5_tts/train/finetune_gradio.py:from datasets.arrow_writer import ArrowWriter
src/f5_tts/train/finetune_gradio.py:    file_raw = os.path.join(path_project, "raw.arrow")
src/f5_tts/train/finetune_gradio.py:    file_raw = os.path.join(path_project, "raw.arrow")
src/f5_tts/train/finetune_gradio.py:        f"prepare complete \nsamples : {len(text_list)}\ntime data : {format_seconds_to_hms(lenght)}\nmin sec : {min_second}\nmax sec : {max_second}\nfile_arrow : {file_raw}\nvocab : {vocab_size}\n{error_text}",
src/f5_tts/train/finetune_gradio.py:    file_arrow = os.path.join(path_project, "raw.arrow")
src/f5_tts/train/finetune_gradio.py:    if not os.path.isfile(file_arrow):
src/f5_tts/train/finetune_gradio.py:    dataset = Dataset_.from_file(file_arrow)
src/f5_tts/train/finetune_gradio.py:Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
src/f5_tts/train/datasets/prepare_wenetspeech4tts.py:    dataset.save_to_disk(f"{save_dir}/raw", max_shard

In [24]:
# @title
# Look at the dataset loading code
!cat src/f5_tts/train/finetune_cli.py | grep -A 30 "def load"

In [23]:
# @title
# Check the prepare scripts
!ls -la src/f5_tts/train/datasets/
!cat src/f5_tts/train/datasets/prepare_csv_wavs.py

total 48
drwxr-xr-x 2 root root  4096 Oct  7 21:11 .
drwxr-xr-x 3 root root  4096 Oct  7 21:11 ..
-rw-r--r-- 1 root root 10526 Oct  7 21:11 prepare_csv_wavs.py
-rw-r--r-- 1 root root  7430 Oct  7 21:11 prepare_emilia.py
-rw-r--r-- 1 root root  3245 Oct  7 21:11 prepare_emilia_v2.py
-rw-r--r-- 1 root root  3181 Oct  7 21:11 prepare_libritts.py
-rw-r--r-- 1 root root  2276 Oct  7 21:11 prepare_ljspeech.py
-rw-r--r-- 1 root root  4580 Oct  7 21:11 prepare_wenetspeech4tts.py
import concurrent.futures
import multiprocessing
import os
import shutil
import signal
import subprocess  # For invoking ffprobe
import sys
from contextlib import contextmanager


sys.path.append(os.getcwd())

import argparse
import csv
import json
from importlib.resources import files
from pathlib import Path

import torchaudio
from datasets.arrow_writer import ArrowWriter
from tqdm import tqdm

from f5_tts.model.utils import convert_char_to_pinyin


PRETRAINED_VOCAB_PATH = files("f5_tts").joinpath("../../data/Emilia_