In [None]:
!pip install datasets
!pip install torchaudio



In [None]:
from datasets import load_dataset
import os
import torchaudio
import torchaudio.transforms as T
import torch

# Build a small fine-tuning set from the Common Accent corpus: one wav file per
# kept sample under xtts_data/wavs plus a pipe-delimited xtts_data/metadata.csv
# with one "<clip id>|<sentence>" record per line.

# Load dataset
dataset = load_dataset("DTU54DL/common-accent", split="train")

# Create output directories
wav_dir = "xtts_data/wavs"
os.makedirs(wav_dir, exist_ok=True)

# Define accents to keep
target_accents = {
    "India and South Asia (India, Pakistan, Sri Lanka)",
}

# Sample rate of the files written to disk. Clips whose decoded rate differs
# are resampled instead of being saved with a wrong header.
target_sample_rate = 16000
resamplers = {}  # source rate -> T.Resample, built lazily and reused

# Prepare metadata list
metadata = []

# Loop and filter
max_samples = 500  # Adjust as needed
count = 0

for sample in dataset:
    accent = sample.get("accent", "")
    if accent not in target_accents:
        continue

    # The metadata file is pipe-delimited with one record per line, so the
    # transcript must not contain "|" or line breaks; whitespace is collapsed.
    sentence = " ".join((sample["sentence"] or "").replace("|", " ").split())
    if not sentence:
        continue  # a clip without a transcript is useless for fine-tuning

    audio = sample["audio"]
    source_rate = audio["sampling_rate"]
    filename = f"clip_{count:05d}.wav"
    filepath = f"{wav_dir}/{filename}"

    # Save audio.
    # NOTE(review): unsqueeze(0) assumes a mono 1-D array — confirm for this dataset.
    # float32 keeps the tensor dtype compatible with the resampling kernel.
    waveform = torch.tensor(audio["array"], dtype=torch.float32).unsqueeze(0)
    if source_rate != target_sample_rate:
        if source_rate not in resamplers:
            resamplers[source_rate] = T.Resample(
                orig_freq=source_rate, new_freq=target_sample_rate
            )
        waveform = resamplers[source_rate](waveform)
    torchaudio.save(filepath, waveform, target_sample_rate)

    metadata.append(f"{filename.replace('.wav', '')}|{sentence}")
    count += 1

    if count >= max_samples:
        break

# Save metadata; every record ends with a newline so that line-based readers
# (and later shuffling of lines) never see two records glued together.
with open("xtts_data/metadata.csv", "w", encoding="utf-8") as f:
    f.writelines(f"{record}\n" for record in metadata)

print(f"✅ Done. Saved {count} samples for fine-tuning.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Done. Saved 500 samples for fine-tuning.


In [None]:
import csv

import pandas as pd

# Rewrite xtts_data/metadata.csv from "<id>|<text>" to
# "<id>|<text>|<normalized_text>", where the third column duplicates the text.
metadata_path = "xtts_data/metadata.csv"

# Shared parsing options:
# - QUOTE_NONE: transcripts may contain double quotes; they are plain text here,
#   not CSV quoting, and must not be interpreted or re-escaped.
# - dtype=str / keep_default_na=False: keep transcripts such as "NA" or "null"
#   as text instead of turning them into missing values.
read_options = dict(
    sep="|",
    header=None,
    quoting=csv.QUOTE_NONE,
    dtype=str,
    keep_default_na=False,
)

df = pd.read_csv(metadata_path, **read_options)

if df.shape[1] == 2:
    # Add third column by duplicating 'text'
    df.columns = ["filename", "text"]
    df["normalized_text"] = df["text"]
elif df.shape[1] == 3:
    # Already converted (the cell was re-run); leave the data untouched.
    df.columns = ["filename", "text", "normalized_text"]
else:
    raise ValueError(
        f"Expected 2 or 3 pipe-separated columns in {metadata_path}, found {df.shape[1]}"
    )

# Save the updated metadata with 3 columns
df.to_csv(metadata_path, sep="|", header=False, index=False, quoting=csv.QUOTE_NONE)

# Check
df_check = pd.read_csv(metadata_path, **read_options)
print(df_check.head())
print(f"Number of columns: {df_check.shape[1]}")


            0                                                  1  \
0  clip_00000  Men in orange vests are at work on a construct...   
1  clip_00001                  This child is getting a pedicure.   
2  clip_00002                             Did you get the Mayor?   
3  clip_00003  Finally I didn't have to feel guilty for using...   
4  clip_00004            I'd still like to have a word with her.   

                                                   2  
0  Men in orange vests are at work on a construct...  
1                  This child is getting a pedicure.  
2                             Did you get the Mayor?  
3  Finally I didn't have to feel guilty for using...  
4            I'd still like to have a word with her.  
Number of columns: 3


In [None]:
import os
import random

# Split metadata.csv into a shuffled train file and validation file.

# Set your data directory and filenames
data_dir = "/content/xtts_data"
input_metadata = os.path.join(data_dir, "metadata.csv")
train_output = os.path.join(data_dir, "metadata_train.csv")
val_output = os.path.join(data_dir, "metadata_val.csv")

# Ratio to split (e.g., 90% train, 10% val)
train_ratio = 0.9

# Fixed seed so that re-running the cell reproduces the same split
split_seed = 42

# Read the original metadata. Blank lines are dropped and every record is given
# exactly one trailing newline; otherwise a final line without a newline would
# be glued to the following record once the lines are shuffled.
with open(input_metadata, "r", encoding="utf-8") as f:
    lines = [line.rstrip("\r\n") + "\n" for line in f if line.strip()]

# Shuffle the lines with a private generator (global random state is untouched)
random.Random(split_seed).shuffle(lines)

# Split the lines
split_index = int(len(lines) * train_ratio)
train_lines = lines[:split_index]
val_lines = lines[split_index:]

# Write to train and val files
with open(train_output, "w", encoding="utf-8") as f:
    f.writelines(train_lines)

with open(val_output, "w", encoding="utf-8") as f:
    f.writelines(val_lines)

print("✅ Split complete!")
print(f"  > Train samples: {len(train_lines)} → {train_output}")
print(f"  > Val samples:   {len(val_lines)} → {val_output}")


✅ Split complete!
  > Train samples: 450 → /content/xtts_data/metadata_train.csv
  > Val samples:   50 → /content/xtts_data/metadata_val.csv


In [None]:
# Mount Google Drive at /content/drive; later cells read from and link to
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the `indicdata` folder from Drive next to the prepared dataset.
# NOTE(review): the destination is relative to the current working directory,
# and when ./xtts_data already exists `cp -r` places the copy at
# ./xtts_data/indicdata rather than merging its contents — confirm this is the
# intended layout.
!cp -r /content/drive/MyDrive/indicdata ./xtts_data


In [None]:
%cd /content
!git clone -b dev https://github.com/coqui-ai/TTS.git
%cd TTS
!pip install -r requirements.txt
!pip install -e .



/content
fatal: destination path 'TTS' already exists and is not an empty directory.
/content/TTS
Ignoring numpy: markers 'python_version <= "3.10"' don't match your environment
Ignoring numba: markers 'python_version < "3.9"' don't match your environment
Obtaining file:///content/TTS
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: TTS
  Building editable for TTS (pyproject.toml) ... [?25l[?25hdone
  Created wheel for TTS: filename=tts-0.22.0-0.editable-cp311-cp311-linux_x86_64.whl size=15144 sha256=8d1f5637fb3908ec035aa83bbf26ffaab8e887adc280050ba8f8168f2dbf4976
  Stored in directory: /tmp/pip-ephem-wheel-cache-9am12td5/wheels/a1/31/b6/e4589a2b0e214f7f2ecd464f78aeb7998645a43ff4721bf68b
Successfully built TTS
Installing collected packages: 

In [None]:
# Return to /content; the training cell invokes the recipe through a path that
# is relative to this directory (TTS/recipes/...).
%cd /content/


/content


In [None]:
!pip install transformers==4.30.2


Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall:

In [None]:
!grep -r "LJSpeech" . | grep "/raid"


./recipes/ljspeech/xtts_v2/train_gpt_xtts.py:    path="/raid/datasets/LJSpeech-1.1_24khz/",
./recipes/ljspeech/xtts_v2/train_gpt_xtts.py:    meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
./recipes/ljspeech/xtts_v1/train_gpt_xtts.py:    path="/raid/datasets/LJSpeech-1.1_24khz/",
./recipes/ljspeech/xtts_v1/train_gpt_xtts.py:    meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
grep: ./.git/objects/pack/pack-83ae766851c85d035ae6b6a163c1063c572e1187.pack: binary file matches
grep: ./tests/inputs/scale_stats.npy: binary file matches


In [None]:
import csv
import os

import pandas as pd

# Inspect the validation metadata and verify that every referenced clip exists.
data_root = "/content/xtts_data"

df = pd.read_csv(
    os.path.join(data_root, "metadata_val.csv"),
    sep="|",
    header=None,
    quoting=csv.QUOTE_NONE,  # double quotes in transcripts are plain text
    dtype=str,
    keep_default_na=False,  # keep transcripts like "NA" as text
)
print(df.head())
print(f"🔍 Found {len(df)} entries in metadata_val.csv")

# Optional sanity check.
# Column 0 holds the clip id without directory or extension; the audio itself
# was written to <data_root>/wavs/<clip id>.wav, so that is the path to test.
wav_dir = os.path.join(data_root, "wavs")
missing = [
    clip_id
    for clip_id in df[0]
    if not os.path.exists(os.path.join(wav_dir, f"{clip_id}.wav"))
]
print("❌ Missing files:", missing)



            0                                                  1  \
0  clip_00095  He also was the past owner of the Rainfair Com...   
1  clip_00136       She had no children by any of her marriages.   
2  clip_00313  The company's mascot is Atomic Betty, who appe...   
3  clip_00085  The founder of the hospital was William Bromfe...   
4  clip_00494                His replacement was Billy Campbell.   

                                                   2  
0  He also was the past owner of the Rainfair Com...  
1       She had no children by any of her marriages.  
2  The company's mascot is Atomic Betty, who appe...  
3  The founder of the hospital was William Bromfe...  
4                His replacement was Billy Campbell.  
🔍 Found 50 entries in metadata_val.csv
❌ Missing files: ['clip_00095', 'clip_00136', 'clip_00313', 'clip_00085', 'clip_00494', 'clip_00404', 'clip_00428', 'clip_00377', 'clip_00191', 'clip_00407', 'clip_00479', 'clip_00374', 'clip_00197', 'clip_00049', 'clip_00

In [None]:
# Quick visual checks: first metadata records, first wav files on disk, and the
# checkpoint directory on Drive that training output is linked to.
!head /content/xtts_data/metadata.csv
!ls /content/xtts_data/wavs | head
!ls -lh /content/drive/MyDrive/xtts_finetune/checkpoints

clip_00000|Men in orange vests are at work on a construction site.|Men in orange vests are at work on a construction site.
clip_00001|This child is getting a pedicure.|This child is getting a pedicure.
clip_00002|Did you get the Mayor?|Did you get the Mayor?
clip_00003|Finally I didn't have to feel guilty for using my computer on weekends.|Finally I didn't have to feel guilty for using my computer on weekends.
clip_00004|I'd still like to have a word with her.|I'd still like to have a word with her.
clip_00005|This view is wrong though.|This view is wrong though.
clip_00006|Fedric was unable to find his lucky marble.|Fedric was unable to find his lucky marble.
clip_00007|Whistle-blower Edward Snowden still lives in exile in Russia.|Whistle-blower Edward Snowden still lives in exile in Russia.
clip_00008|I work at the hospital as a paediatrician.|I work at the hospital as a paediatrician.
clip_00009|The squirrel ran up the pine tree in a spiral.|The squirrel ran up the pine tree in a sp

In [None]:
!rm -rf /content/TTS/recipes/ljspeech/xtts_v2/run/training
!ln -s /content/drive/MyDrive/xtts_finetune/checkpoints /content/TTS/recipes/ljspeech/xtts_v2/run/training

In [None]:
# Launch XTTS v2 GPT fine-tuning with the LJSpeech recipe script.
# The script path is relative, so this cell must run with /content as the
# working directory.
# NOTE(review): /content/config.json is not created by any cell visible here —
# confirm it exists before running.
# Earlier invocation variants, kept for reference:
# !CUDA_VISIBLE_DEVICES=0 python TTS/recipes/ljspeech/xtts_v2/train_gpt_xtts.py --config_path /content/config.json
# !python TTS/recipes/ljspeech/xtts_v2/train_gpt_xtts.py --config_path /content/config.json --debug > debug_log.txt 2>&1
# !tail -n 50 debug_log.txt
!python TTS/recipes/ljspeech/xtts_v2/train_gpt_xtts.py --config_path /content/config.json --debug -v


 > Downloading DVAE files!
  0% 0.00/1.07k [00:00<?, ?iB/s]
100% 1.07k/1.07k [00:00<00:00, 5.37kiB/s]

  1% 1.97M/211M [00:00<00:10, 19.7MiB/s][A
  5% 9.50M/211M [00:00<00:03, 52.4MiB/s][A
  8% 17.8M/211M [00:00<00:02, 66.2MiB/s][A
 12% 25.0M/211M [00:00<00:02, 68.5MiB/s][A
 15% 31.8M/211M [00:00<00:02, 65.0MiB/s][A
 18% 38.4M/211M [00:00<00:02, 63.0MiB/s][A
 21% 44.7M/211M [00:00<00:02, 62.3MiB/s][A
 24% 50.9M/211M [00:00<00:02, 58.8MiB/s][A
 27% 57.0M/211M [00:00<00:02, 59.5MiB/s][A
 30% 63.5M/211M [00:01<00:02, 61.1MiB/s][A
 33% 69.9M/211M [00:01<00:02, 61.8MiB/s][A
 36% 76.5M/211M [00:01<00:02, 63.2MiB/s][A
 39% 83.0M/211M [00:01<00:02, 63.5MiB/s][A
 42% 89.3M/211M [00:01<00:01, 62.9MiB/s][A
 45% 95.7M/211M [00:01<00:01, 63.3MiB/s][A
 49% 102M/211M [00:01<00:01, 63.7MiB/s] [A
 52% 109M/211M [00:01<00:01, 56.9MiB/s][A
 54% 114M/211M [00:01<00:01, 50.6MiB/s][A
 57% 120M/211M [00:02<00:01, 52.3MiB/s][A
 60% 126M/211M [00:02<00:01, 53.5MiB/s][A
 62% 131M/211M [00:02