In [None]:
!pip install numpy==1.24.4
!pip install nemo_toolkit --no-cache-dir
!pip install torch torchaudio
!pip install pyannote.metrics datasets webdataset tqdm braceexpand hydra-core omegaconf lightning lhotse jiwer pyannote.core
!pip install einops sentencepiece
!pip install editdistance

In [2]:
# Mount Google Drive (remove if running locally)
from google.colab import drive
drive.mount('/content/drive')

# Set paths
BASE_PATH = "/content/drive/MyDrive/creole_asr_project"

# Data paths
AUDIO_DIR = f"{BASE_PATH}/data/audio"
FINETUNE_DIR = f"{BASE_PATH}/data/finetune_eligible"
TRANSCRIPTS_DIR = f"{BASE_PATH}/data/transcripts"
MANIFESTS_DIR = f"{BASE_PATH}/data/manifests"

# Model paths
PRETRAINED_MODEL_DIR = f"{BASE_PATH}/models/pretrained"
CHECKPOINT_DIR = f"{BASE_PATH}/models/checkpoints"
FINAL_MODEL_DIR = f"{BASE_PATH}/models/final"

# Create directories
!mkdir -p "{AUDIO_DIR}" "{TRANSCRIPTS_DIR}" "{MANIFESTS_DIR}"
!mkdir -p "{PRETRAINED_MODEL_DIR}" "{CHECKPOINT_DIR}" "{FINAL_MODEL_DIR}"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set paths (local version)
# Uncomment this stuff if running on actual hardware or cloud system other than Google Colab
# BASE_PATH = "creolese-audio-dataset"  # Base folder containing the dataset. Change to whatever you want

# Data paths (local version)
# AUDIO_DIR = f"{BASE_PATH}/Audio Files"
# FINETUNE_DIR = f"{AUDIO_DIR}/finetune_eligible"
# TRANSCRIPTS_DIR = BASE_PATH  # Transcripts are at base level
# MANIFESTS_DIR = f"{BASE_PATH}/manifests"

# Model paths (local version)
# PRETRAINED_MODEL_DIR = f"{BASE_PATH}/models/pretrained"
# CHECKPOINT_DIR = f"{BASE_PATH}/models/checkpoints"
# FINAL_MODEL_DIR = f"{BASE_PATH}/models/final"

# Create directories
# import os
# os.makedirs(MANIFESTS_DIR, exist_ok=True)
# os.makedirs(PRETRAINED_MODEL_DIR, exist_ok=True)
# os.makedirs(CHECKPOINT_DIR, exist_ok=True)
# os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

In [3]:
# Manifest creation
import json
import librosa
import os
import random

def create_manifests_from_finetune(audio_dir, finetune_dir, output_train_path, output_val_path, val_split=0.2):
    """
    Creates train/val manifests using finetune_eligible data for training and remaining audio for validation
    Args:
        audio_dir: Path to all audio files
        finetune_dir: Path to finetune_eligible folder containing training data and transcripts.json
        output_train_path: Where to save train manifest
        output_val_path: Where to save val manifest
        val_split: Fraction of training data to use for validation
    """
    random.seed(42)

    # Load training data from finetune_eligible folder
    finetune_transcripts_path = f"{finetune_dir}/transcripts.json"
    with open(finetune_transcripts_path, 'r') as f:
        try:
            train_entries = json.load(f)  # This expects a JSON array
            if not isinstance(train_entries, list):
                raise ValueError("JSON file should contain an array of entries")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON format: {str(e)}")

    # Load all audio files in audio_dir
    all_audio_files = set([f for f in os.listdir(audio_dir) if f.endswith('.wav')])

    # Get audio files in finetune_eligible folder
    finetune_audio_files = set([entry['audio'] for entry in train_entries])

    # Get test audio files (those not in finetune_eligible)
    test_audio_files = all_audio_files - finetune_audio_files

    # Create test entries (we'll use these for validation)
    test_transcripts_path = f"{audio_dir}/transcripts.json"
    with open(test_transcripts_path, 'r') as f:
        try:
            all_entries = json.load(f)
            if not isinstance(all_entries, list):
                raise ValueError("JSON file should contain an array of entries")
            test_entries = [entry for entry in all_entries if entry['audio'] in test_audio_files]
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON format: {str(e)}")

    # Split training data into train and validation
    random.shuffle(train_entries)
    split_idx = int(len(train_entries) * (1 - val_split))
    actual_train_entries = train_entries[:split_idx]
    actual_val_entries = train_entries[split_idx:]

    def write_manifest(entries, output_path, audio_base_dir):
        with open(output_path, 'w') as f:
            for entry in entries:
                audio_path = os.path.join(audio_base_dir, entry['audio'])

                # Skip if audio file doesn't exist
                if not os.path.exists(audio_path):
                    continue

                try:
                    duration = librosa.get_duration(filename=audio_path)
                    json.dump({
                        "audio_filepath": audio_path,
                        "text": entry['text'],
                        "duration": duration,
                        "language": entry.get('language', 'crs')  # Optional language field
                    }, f)
                    f.write('\n')
                except Exception as e:
                    print(f"Error processing {audio_path}: {str(e)}")
                    continue

    # Write training manifest (from finetune_eligible)
    write_manifest(actual_train_entries, output_train_path, audio_dir)

    # Write validation manifest (from finetune_eligible split)
    write_manifest(actual_val_entries, output_val_path, audio_dir)

    print(f"Created manifests with {len(actual_train_entries)} train and {len(actual_val_entries)} val samples (from finetune_eligible)")
    print(f"Note: There are {len(test_entries)} test samples available (not in finetune_eligible)")

# Create manifests using finetune_eligible data for training
create_manifests_from_finetune(
    audio_dir=AUDIO_DIR,
    finetune_dir=FINETUNE_DIR,
    output_train_path=f"{MANIFESTS_DIR}/train_manifest.json",
    output_val_path=f"{MANIFESTS_DIR}/val_manifest.json",
    val_split=0.2
)

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)


Created manifests with 17 train and 5 val samples (from finetune_eligible)
Note: There are 0 test samples available (not in finetune_eligible)


In [5]:
from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
from omegaconf import OmegaConf, open_dict
import copy

# Load model
model = EncDecHybridRNNTCTCBPEModel.from_pretrained(
    "stt_multilingual_fastconformer_hybrid_large_pc"
)

# 1. First ensure the config structures exist
with open_dict(model.cfg):  # Allows modifying the config
    if not hasattr(model.cfg, 'train_ds'):
        model.cfg.train_ds = {}
    if not hasattr(model.cfg, 'validation_ds'):
        model.cfg.validation_ds = {}

# 2. Define only the essential parameters we need to modify
config_updates = {
    'train_ds': {
        'manifest_filepath': f"{MANIFESTS_DIR}/train_manifest.json",
        'batch_size': 2,
        'shuffle': True
    },
    'validation_ds': {
        'manifest_filepath': f"{MANIFESTS_DIR}/val_manifest.json",
        'batch_size': 2,
        'shuffle': False
    },
    'optim': {
        'lr': 0.0001,
        'sched': {
            'warmup_steps': 1000
        }
    }
}

# 3. Apply updates safely
with open_dict(model.cfg):
    for key, value in config_updates.items():
        if key in model.cfg:
            model.cfg[key] = OmegaConf.merge(model.cfg[key], value)
        else:
            model.cfg[key] = value

# 4. Verify
print("Final Training Config:")
print(OmegaConf.to_yaml(model.cfg.train_ds))
print("\nFinal Validation Config:")
print(OmegaConf.to_yaml(model.cfg.validation_ds))

# Save final model
model.save_to(f"{FINAL_MODEL_DIR}/creole_english_finetuned.nemo")
!ls -lh "{FINAL_MODEL_DIR}"

[NeMo I 2025-05-09 08:30:24 nemo_logging:393] Found existing object /root/.cache/torch/NeMo/NeMo_2.3.0/stt_multilingual_fastconformer_hybrid_large_pc/f8eb2579fe9c3c0c5bdde864d5661d65/stt_multilingual_fastconformer_hybrid_large_pc.nemo.
[NeMo I 2025-05-09 08:30:24 nemo_logging:393] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.3.0/stt_multilingual_fastconformer_hybrid_large_pc/f8eb2579fe9c3c0c5bdde864d5661d65/stt_multilingual_fastconformer_hybrid_large_pc.nemo
[NeMo I 2025-05-09 08:30:24 nemo_logging:393] Instantiating model from pre-trained checkpoint
[NeMo I 2025-05-09 08:30:29 nemo_logging:393] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2025-05-09 08:30:29 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 256 tokens
[NeMo I 2025-05-09 08:30:29 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 256 tokens
[NeMo I 2025-05-09 08:30:29 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 256 tokens
[NeMo I 2025-05-09 0

[NeMo W 2025-05-09 08:30:36 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    is_concat: true
    concat_sampling_technique: random
    concat_sampling_temperature: 8
    concat_shuffle: true
    concat_sampling_scale: 1.0
    concat_samples: false
    concat_samples_count_as_one: true
    concat_samles_max_length: 22
    concat_samples_min_langth: 16
    concat_samples_joining_pause: 0.1
    manifest_filepath:
    - - /data/mml/by/tarred_train/pcstrip_sharded_manifests/manifest__OP_0..511_CL_.json
    - - /data/mml/es/nemo_sp_asr_set_3pt0/tarred_train/pcstrip_sharded_manifests/manifest__OP_0..511_CL_.json
    - - /data/mml/ru/v2/tarred_train/pcstrip_sharded_manifests/manifest__OP_0..511_CL_.json
    - - /data/mml/ua/tarred_train/pcstrip_sharded_manifests/manifest__OP_0..127_CL_.json
    - - /data/mml/en/tarred_train/pcstrip_sh

[NeMo I 2025-05-09 08:30:36 nemo_logging:393] PADDING: 0
[NeMo I 2025-05-09 08:30:37 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-05-09 08:30:37 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-05-09 08:30:37 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-05-09 08:30:38 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-05-09 08:30:38 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-05-09 08:30:39 nemo_logging:393] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.3.0/stt_multilingual_fastconformer_hybrid_large_pc/f8eb2579fe9c3c0c5bdde864d5661d65/stt_multilingual_fastconformer_hybrid_large_pc.nemo.
Final Training Config:
is_concat: true
concat_sampling_technique: random
concat_sampling_temperature: 8
concat_shuffle: true
concat_sampling_scale: 1.0
concat_samples: false
concat_samples_count_as_one: true
concat_samles_max_length: 22
concat_samples_min_langth: 16
concat_samples_joining_pause: 0.1
manifest_filepath: /content/drive/MyDrive/creole_asr_project/data/manifests/train_manifest.json
tarred_audio_filepaths:
- - /data/mml/by/tarred_train/audio__OP_0..511_CL_.tar
- - /data/mml/es/nemo_sp_asr_set_3pt0/tarred_train/audio__OP_0..511_CL_.tar
- - /data/mml/ru/v2/tarred_train/audio__OP_0..511_CL_.tar
- - /data/mml/ua/tarred_train/audio__OP_0..127_CL_.tar
- - /data/mml/en/tarred_train/audio__OP_0..511_CL_.tar
-

In [None]:
# Optional: Resume from best checkpoint
checkpoints = !ls "{CHECKPOINT_DIR}" | grep .ckpt
if checkpoints:
    best_ckpt = f"{CHECKPOINT_DIR}/{checkpoints[0]}"
    trainer.fit(model, ckpt_path=best_ckpt)