In [None]:
from pytorch_lightning import Trainer
import os

# Set NEMO_CACHE_DIR for this process before `nemo` is imported in the next cell.
# NOTE(review): presumably this is where NeMo caches downloaded checkpoints — confirm.
os.environ["NEMO_CACHE_DIR"] = "/A_track/"


In [None]:
import nemo.collections.asr as nemo_asr

# Checkpoint to load. It has to be a CTC-BPE checkpoint because it is loaded
# through EncDecCTCModelBPE below; "nvidia/canary-1b" is a multitask
# encoder-decoder model and does not fit that class, so the CTC checkpoint
# used by the rest of this notebook is selected instead.
pretrained_name = "nvidia/parakeet-ctc-1.1b"

# refresh_cache=True is kept from the original cell.
# NOTE(review): presumably this forces a fresh download on every run — confirm it is wanted.
model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(pretrained_name, refresh_cache=True)


In [None]:
# Single audio file passed to model.transcribe() in the next cell.
datasample = "/track_a_audio_files/1736843853-lWPStQITn7XCnMqYWAPbr3y3blg1.wav"

In [None]:
# Transcribe the sample clip and display the text of the first (only) result.
sample_hypotheses = model.transcribe([datasample])
sample_hypotheses[0].text

In [None]:
from omegaconf import OmegaConf

# Render the loaded model's configuration as YAML and print it for inspection.
model_cfg_yaml = OmegaConf.to_yaml(model.cfg)
print(model_cfg_yaml)

In [None]:
import re



def standardize_quotation(train_df):
    """Normalise punctuation and stray symbols in the ``transcription`` column.

    Every occurrence of a key of the substitution table below is replaced by
    the corresponding value. The cleaned column is written back onto
    ``train_df`` itself, i.e. the caller's frame is modified in place.

    Args:
        train_df: DataFrame with a string ``transcription`` column.

    Returns:
        A one-column DataFrame containing the cleaned ``transcription`` column.
    """
    char_map = {
        # Curly quotes -> straight quotes.
        "’": "'",
        "‘": "'",
        "“": '"',
        "”": '"',
        # Line breaks are dropped; non-breaking spaces become plain spaces.
        "\n": '',
        '\xa0': ' ',
        # Full-width comma -> ASCII comma.
        '，': ',',
        # Symbols that are spelled out, removed, or mapped to a close ASCII form.
        '&': "uye",
        '<': "",
        '*': "",
        '>': "",
        '#': "",
        '…': ".",
        '．': ".",
        '+': "",
        '=': "",
        '≠': '',
        '[': "(",
        ']': ")",
        '_': '-',
        # Accented letters -> unaccented letters.
        'é': 'e',
        'ü': 'u',
        'ì': 'i',
        'ķ': 'k',
        'è': 'e',
    }

    # A single alternation over the escaped keys, compiled once.
    alternation = "|".join(re.escape(symbol) for symbol in char_map)
    matcher = re.compile(alternation)

    def _substitute(match):
        # Look up the replacement for the exact text that matched.
        return char_map[match.group()]

    cleaned = train_df["transcription"].str.replace(matcher, _substitute, regex=True)
    train_df["transcription"] = cleaned
    return train_df[["transcription"]]



 

In [None]:
# Clean the training transcriptions.
# NOTE: `train_df` is created by the pd.read_json cell further down, so that
# cell has to be executed before this one.
cleaned_frame = standardize_quotation(train_df)
train_df["transcription"] = cleaned_frame["transcription"]

In [None]:
if not os.path.exists("./scripts/tokenizers/process_asr_text_tokenizer.py"):
  !mkdir scripts
  !wget -P scripts/ "https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py"

In [None]:
import pandas as pd

# Locations of the Track A metadata files.
raw_path = "shared/A_track/"
train_json_path = "shared/A_track/train.json"
dev_json_path = "shared/A_track/dev_test.json"

# Each JSON file is read and transposed.
# NOTE(review): presumably the files are keyed by utterance id, so .T yields one row per utterance — confirm.
train_df = pd.read_json(train_json_path).T
# dev_df must be loaded as well: it is used two statements below, and leaving
# this read commented out made the cell fail with a NameError.
dev_df = pd.read_json(dev_json_path).T

# File paths of the form processed/<audio_path>.mel.pt for every row.
train_df["file_path"] = "processed/" + train_df["audio_path"] + ".mel.pt"
dev_df["file_path"] = "processed/" + dev_df["audio_path"] + ".mel.pt"



In [None]:
# Do not truncate long cell contents when rendering frames.
pd.set_option('display.max_colwidth', None)

# Rows whose transcription contains at least one digit.
has_digit = train_df["transcription"].str.contains(r'\d')
train_df.loc[has_digit, ["transcription", 'voice_creator_id']]

In [None]:
# Rows whose transcription contains a typographic apostrophe.
has_curly_apostrophe = train_df["transcription"].str.contains(r"’")
train_df.loc[has_curly_apostrophe, ["transcription", 'voice_creator_id']]

In [None]:
# Rows whose transcription contains a straight double quote.
has_double_quote = train_df["transcription"].str.contains(r'"')
train_df.loc[has_double_quote, ["transcription", 'voice_creator_id']]


In [None]:
# Rows whose transcription contains a forward slash.
has_slash = train_df["transcription"].str.contains(r'/')
train_df.loc[has_slash, ["transcription", 'voice_creator_id']]


In [None]:
# Set pandas' display.max_rows option to 100 for the frames shown below.
pd.set_option('display.max_rows', 100)


In [None]:
import re

# Negated character class: matches any single character that is NOT an ASCII
# letter or digit, a parenthesis, a straight or typographic apostrophe, a
# double quote, a space, or one of . , - : ;
# (The filter expression that used to follow this line was not the cell's
# last expression, so its result was computed and thrown away; the same rows
# are shown by the frame displayed at the end of this cell.)
pattern_with_space_dot = r"[^a-zA-Z0-9()'\" .,’\-:;]"
def find_non_matching(text, pattern):
    """Return the distinct matches of ``pattern`` in ``text`` as one sorted string.

    Args:
        text: String to scan.
        pattern: Regular expression handed to ``re.findall``.

    Returns:
        The unique matches, sorted and concatenated without a separator
        (an empty string when nothing matches).
    """
    distinct_hits = set(re.findall(pattern, text))
    return ''.join(sorted(distinct_hits))

# Same character class as `pattern_with_space_dot`, kept under its own name.
non_matching_pattern = r"[^a-zA-Z0-9()'\" .,’\-:;]"

# Copy of the rows that contain at least one character outside the class.
has_non_matching = train_df["transcription"].str.contains(pattern_with_space_dot)
train_df_with_non_matching = train_df[has_non_matching].copy()

# For each of those rows, record which offending characters occur in it.
train_df_with_non_matching["non_matching_chars"] = (
    train_df_with_non_matching["transcription"].apply(find_non_matching, pattern=non_matching_pattern)
)
train_df_with_non_matching[["transcription", "voice_creator_id", "non_matching_chars"]]

In [None]:
from collections import Counter

# Pool the per-row character strings into one flat list of single characters.
all_non_matching_chars = [
    char
    for chars in train_df_with_non_matching["non_matching_chars"]
    for char in chars
]

# Tally how many rows each character appears in (each row lists a character once).
char_counts = Counter(all_non_matching_chars)

# (character, count) pairs ordered from most to least frequent.
char_count_list = char_counts.most_common()
char_count_list

In [None]:
# Transcriptions containing the token "eyitiyemu".
# (A commented-out filter on voice_creator_id 'jV2p8qc1jLc1RFoTR8InbTJka782' was left inactive here.)
has_eyitiyemu = train_df['transcription'].str.contains(r'eyitiyemu')
train_df.loc[has_eyitiyemu, 'transcription']

In [None]:

# Transcriptions containing a non-breaking space (regex escape \xa0).
# (A commented-out filter on voice_creator_id 'jV2p8qc1jLc1RFoTR8InbTJka782' was left inactive here.)
has_nbsp = train_df['transcription'].str.contains(r'\xa0')
train_df.loc[has_nbsp, 'transcription']

In [None]:
!# Ensure you have cloned the NeMo repository: git clone https://github.com/NVIDIA/NeMo.git
# NEMO_GIT_FOLDER should be the path to the cloned repository.

# Run process_asr_text_tokenizer.py on the training manifest with
# tokenizer=spe, spe_type=bpe, vocab_size=1024 and
# data_root=./kinyarwanda_tokenizers (extra-whitespace removal and logging enabled).
# NOTE(review): the script is invoked from an absolute cluster path, not from
# the ./scripts copy downloaded earlier in this notebook.
!python /ocean/projects/cis250085p/shared/KASR/nemo/scripts/process_asr_text_tokenizer.py \
       --manifest="/A_track/train_processed.json" \
       --data_root="./kinyarwanda_tokenizers" \
       --vocab_size=1024 \
       --tokenizer="spe" \
       --spe_type="bpe" \
        --spe_remove_extra_whitespaces \
       --log
    # Option left disabled:
    #    --no_lower_case=False \

In [None]:
import torch
import os
# Select the compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cpu":
    # On CPU-only hosts, limit every numeric backend to a single thread.
    for thread_var in (
        "OMP_NUM_THREADS",
        "OPENBLAS_NUM_THREADS",
        "MKL_NUM_THREADS",
        "VECLIB_MAXIMUM_THREADS",
        "NUMEXPR_NUM_THREADS",
        "NUMBA_NUM_THREADS",
    ):
        os.environ[thread_var] = "1"
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)


# Settings for the fine-tuning run, grouped by consumer:
#   "model"       - pretrained checkpoint, tokenizer, datasets and optimiser
#   "trainer"     - keyword arguments for the Lightning Trainer
#   "exp_manager" - options for NeMo's experiment manager
config = {
    # Path inside the NeMo repository.
    "model_config_path": "examples/asr/conf/conformer/conformer_ctc_bpe.yaml",
    "model": {
        "init_from_pretrained_model": "nvidia/parakeet-ctc-1.1b",
        "tokenizer_dir": "./kinyarwanda_tokenizers/tokenizer_spe_bpe_v1024",
        "train_ds": {
            "manifest_filepath": "/shared/A_track/train_processed.json",
            "batch_size": 8,
            "max_duration": 30.0,  # raised maximum duration
        },
        "validation_ds": {
            "manifest_filepath": "/shared/A_track/val_processed.json",
            "batch_size": 16,
            "max_duration": 30.0,  # raised maximum duration
        },
        "optim": {
            "name": "adamw",
            "lr": 0.0001,
            "betas": [0.9, 0.98],
            "weight_decay": 0.001,
            "sched": {
                "name": "CosineAnnealing",
                "warmup_steps": 2000,
            },
        },
    },
    "trainer": {
        "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
        "devices": 1,  # a single device
        "max_epochs": 50,
        "precision": "bf16",
    },
    "exp_manager": {
        "exp_dir": "./nemo_experiments",
        "create_wandb_logger": True,
        "wandb_logger_kwargs": {
            "name": "parakeet-kinyarwanda-two-phase-finetune",
            "project": "nemo-asr",
        },
        # Checkpointing: keep the five checkpoints with the lowest val_wer.
        "create_checkpoint_callback": True,
        "checkpoint_callback_params": {
            "monitor": "val_wer",  # metric to monitor
            "mode": "min",         # lower is better for an error rate
            "save_top_k": 5,
            "filename": "{epoch}-{step}-{val_wer:.2f}",  # checkpoint names carry the WER
            "verbose": True,
        },
    },
}


In [None]:
# programmatic_finetuning.py
import os
# import pytorch_lightning as pl
import lightning.pytorch as pl 
from omegaconf import OmegaConf
import nemo.collections.asr as nemo_asr
from nemo.utils.exp_manager import exp_manager


# Launch a NeMo ASR fine-tuning job programmatically.
# Everything below is driven by the `config` dictionary defined in the
# previous cell.
print("--- Starting Programmatic Fine-Tuning ---")

# --- 1. PyTorch Lightning trainer ---
# Built with its own logger and checkpointing disabled.
# NOTE(review): presumably so that exp_manager can attach its own — confirm.
trainer_config = config['trainer']
trainer = pl.Trainer(logger=False, enable_checkpointing=False, **trainer_config)

# --- 2. Experiment manager ---
# exp_manager receives the trainer plus its options; its return value is kept
# as the experiment directory.
exp_manager_config = config.get('exp_manager', {})
exp_dir = exp_manager(trainer, exp_manager_config)

# --- 3. Pretrained model ---
pretrained_model_name = config['model']['init_from_pretrained_model']
print(f"Loading pretrained model: {pretrained_model_name}")
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
    model_name=pretrained_model_name,
    trainer=trainer,
)


In [None]:
# asr_model.cfg.resume_if_exists

# asr_model.cfg 

In [None]:
from omegaconf import OmegaConf

# Render the fine-tuning model's configuration as YAML and print it.
asr_model_cfg_yaml = OmegaConf.to_yaml(asr_model.cfg)
print(asr_model_cfg_yaml)


In [None]:

# --- 4. Update Model Configuration ---
# The configuration of the loaded model is edited directly (no YAML file is read).
model_cfg = asr_model.cfg

# Point the tokenizer and both datasets at the paths given in `config`.
model_cfg.tokenizer.dir = config['model']['tokenizer_dir']
model_cfg.train_ds.manifest_filepath = config['model']['train_ds']['manifest_filepath']
model_cfg.validation_ds.manifest_filepath = config['model']['validation_ds']['manifest_filepath']


# Switch the model to the new BPE tokenizer found in that directory.
asr_model.change_vocabulary(new_tokenizer_dir=model_cfg.tokenizer.dir, new_tokenizer_type='bpe')

# NOTE(review): batch sizes are hard-coded to 6 here; the batch_size values
# stored in `config` are not read by this cell.
model_cfg.train_ds.batch_size = 6
model_cfg.validation_ds.batch_size = 6

# max_duration is only set for the training set in this cell.
model_cfg.train_ds.max_duration = 30
# Set up the data loaders with the new configuration



asr_model.setup_training_data(model_cfg.train_ds)
asr_model.setup_validation_data(model_cfg.validation_ds)
# The model's optimiser section is replaced wholesale by the one from `config`
# (not merged into the existing section).
model_cfg.optim = OmegaConf.create(config['model']['optim'])
asr_model.setup_optimization(optim_config=model_cfg.optim)


# --- 5. Start Fine-Tuning ---
print("Configuration complete. Starting training...")
# Put the model in training mode, then hand it to the trainer.
asr_model.train()
trainer.fit(asr_model)
print("Fine-tuning complete.")

# --- 6. Save the Final Model ---
# The .nemo archive is written inside the directory returned by exp_manager.
final_model_path = os.path.join(exp_dir, "finetuned_kinyarwanda_model.nemo")
asr_model.save_to(final_model_path)
print(f"Final fine-tuned model saved to: {final_model_path}")




In [None]:
import gc
import torch

# Collect unreachable Python objects first.
gc.collect()

# Then release cached CUDA memory, when a GPU is present.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Two-phase training

In [None]:
# Settings for a two-phase fine-tuning run: "phase1" and "phase2" each carry
# their own epoch count and learning rate.
finetune_config = {
    "model_config_path": "examples/asr/conf/conformer/conformer_ctc_bpe.yaml",
    "model": {
        "name": "nvidia/parakeet-ctc-1.1b",
        # NOTE: the leading <path_to_your_output> is a placeholder and has to be filled in.
        "tokenizer_dir": "<path_to_your_output>/kinyarwanda_tokenizers/tokenizer_spe_bpe_v1024",
        "train_ds": {
            "manifest_filepath": "/A_track/train_final.json",
            "batch_size": 16,
        },
        "validation_ds": {
            "manifest_filepath": "/A_track/val_final.json",
            "batch_size": 16,
        },
    },
    "phase1": {
        "epochs": 10,  # epochs spent training the decoder
        "lr": 1e-3,    # higher learning rate for the new decoder
    },
    "phase2": {
        "epochs": 50,  # epochs spent fine-tuning the whole model
        "lr": 1e-5,    # lower learning rate for full-model fine-tuning
    },
    "trainer": {
        "accelerator": "gpu",
        "devices": -1,
        "precision": "bf16",
        "strategy": "ddp",
    },
    "exp_manager": {
        "exp_dir": "./nemo_experiments",
        "create_wandb_logger": True,
        "wandb_logger_kwargs": {
            "name": "parakeet-kinyarwanda-two-phase-finetune",
            "project": "nemo-asr",
        },
    },
}


In [None]:
import pandas as pd
# Read the dev/test metadata and transpose it.
# NOTE(review): this reads from B_track while the other cells use A_track — confirm the path.
dev_test_raw = pd.read_json("/shared/B_track/dev_test.json")
dev_test_df = dev_test_raw.transpose()

In [None]:
# Build file paths of the form ./processed/<audio_path>.mel.pt for every row.
dev_test_df["file_path"] = "./processed/" + dev_test_df["audio_path"] + ".mel.pt"


In [None]:
# Display the file paths built in the previous cell.
dev_test_df["file_path"]

In [None]:
import os

# Flag, per row, whether the expected file is present on disk, then show the
# paths next to the flags.
file_present = dev_test_df["file_path"].map(os.path.exists)
dev_test_df["file_exists"] = file_present
dev_test_df[["file_path", "file_exists"]]

In [None]:
# Change the notebook's working directory to the shared A_track folder;
# relative paths in later cells are resolved from there.
%cd /ocean/projects/cis250085p/shared/A_track

# run inference


In [None]:

import torch
from nemo.collections.asr.models import EncDecCTCModelBPE

# Lightning checkpoint (.ckpt) written during fine-tuning.
checkpoint_path = "/nemo/nemo_experiments/default/checkpoints/epoch=3-step=53777-val_wer=0.15.ckpt"

# A .ckpt file is loaded with load_from_checkpoint (restore_from reads packaged
# .nemo archives), and map_location must be a valid torch device string —
# "GPU" is not one.
map_location = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = EncDecCTCModelBPE.load_from_checkpoint(checkpoint_path, map_location=map_location)

# Placeholder inputs: replace with real audio files.
audio_files = ["/path/to/audio1.wav", "/path/to/audio2.wav"]
results = asr_model.transcribe(audio_files)
for result in results:
    # Results are read through `.text`, as in the other cells of this notebook.
    print(result.text)


In [None]:
from nemo.collections.asr.models import EncDecCTCModelBPE
import torch

# Lightning checkpoint to convert.
ckpt_path = "/nemo/nemo_experiments/default/2025-06-22_02-42-58/checkpoints/epoch=46-step=157967-val_wer=0.09-last.ckpt"

# Destination of the exported .nemo archive.
nemo_path = "/nemo/nemo_experiments/default/finetuned_epoch=3_model.nemo"

# Load the checkpoint onto the GPU when one is available, otherwise onto the CPU.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = EncDecCTCModelBPE.load_from_checkpoint(ckpt_path, map_location=target_device)

asr_model.decoding.strategy = 'greedy_batch'

# Package the model as a .nemo file and report where it went.
asr_model.save_to(nemo_path)
print(f"Model exported to {nemo_path}")


In [None]:
# NOTE(review): bare shell escape with no command — looks like a leftover cell.
!

In [None]:
import os

import pandas as pd

# Directory holding the raw .wav files, and the test-set metadata file.
# (The metadata path variable keeps the name `train_json_path` although it
# points at test.json.)
raw_path = "shared/track_a_audio_files"
train_json_path = "shared/A_track/test.json"

test_df = pd.read_json(train_json_path).T

# File paths of the form processed/<audio_path>.mel.pt.
test_df["file_path"] = "processed/" + test_df["audio_path"] + ".mel.pt"


def _wav_path(audio_path):
    """Map an ``audio_path`` entry to its .wav file under ``raw_path``."""
    bare_name = audio_path.replace("audio/", "")
    return os.path.join(raw_path, bare_name + '.wav')


test_df['audio'] = test_df['audio_path'].apply(_wav_path)
test_df

In [None]:
# Inspect the first resolved .wav path as a sanity check.
test_df['audio'].iloc[0]

In [None]:
from nemo.collections.asr.models import EncDecCTCModelBPE


# Lightning checkpoint (.ckpt) of the fine-tuned model. The variable keeps its
# original name even though the file is not a .nemo archive.
nemo_path = "KASR_2/nemo/nemo_experiments/default/2025-06-22_02-42-58/checkpoints/epoch=50-step=168050-val_wer=0.09-last.ckpt"
# A .ckpt file is loaded with load_from_checkpoint; restore_from only reads
# packaged .nemo archives.
asr_model = EncDecCTCModelBPE.load_from_checkpoint(nemo_path, map_location="cuda")

# List of audio files to transcribe
# audio_files = ["/path/to/audio1.wav", "/path/to/audio2.wav"]


In [None]:
# Optional: request the batched greedy decoding strategy.
asr_model.decoding.strategy = 'greedy_batch'

# Transcribe every test clip, in the row order of test_df.
test_audio_paths = test_df['audio'].tolist()
results = asr_model.transcribe(test_audio_paths)


In [None]:
# Print each transcript and collect the texts in input order.
results_list = []
for hypothesis in results:
    transcript = hypothesis.text
    print(transcript)
    results_list.append(transcript)


In [None]:
# Attach the transcripts to the test frame. results_list was filled from
# `results`, which was produced from test_df['audio'], so the order matches the rows.
test_df['transcription'] = results_list

In [None]:

# Copy the frame's index into an explicit `id` column for the submission file.
test_df['id'] = test_df.index

In [None]:
# Write the submission CSV with exactly two columns (id, transcription) and no index column.
test_df[["id", "transcription"]].to_csv("/shared/A_track/submission.csv", index=False)

In [None]:
# Number of rows in the test frame.
len(test_df)

In [None]:
# Number of collected transcripts; should equal len(test_df) shown above.
len(results_list)

# test resume


In [None]:

# Path of the last Lightning checkpoint from the 2025-06-22 run.
# NOTE: despite the variable name this is a .ckpt file, not a .nemo archive.
nemo_path = "shared/KASR_2/nemo/nemo_experiments/default/2025-06-22_02-42-58/checkpoints/epoch=50-step=168050-val_wer=0.09-last.ckpt"

In [None]:
# programmatic_finetuning.py
import os
import torch
# import pytorch_lightning as pl
import lightning.pytorch as pl 
from omegaconf import OmegaConf
import nemo.collections.asr as nemo_asr
from nemo.utils.exp_manager import exp_manager

import utils

# import KASR.nemo.utils as utils
# Device selection is delegated to the project-local `utils` module (not visible in this notebook).
device = utils.get_device_safe_threading()



# Settings for the resumed run: tokenizer, datasets and optimiser under
# "model", Lightning Trainer keyword arguments under "trainer".
config = {
    "model": {
        "tokenizer_dir": "/KASR/nemo/kinyarwanda_tokenizers/tokenizer_spe_bpe_v1024",
        "train_ds": {
            "manifest_filepath": "/A_track/train_processed.json",
            "batch_size": 3,
            "max_duration": 30.0,  # raised maximum duration
        },
        "validation_ds": {
            "manifest_filepath": "/A_track/val_processed.json",
            "batch_size": 3,
        },
        "optim": {
            "name": "adamw",
            "lr": 0.0001,
            "betas": [0.9, 0.98],
            "weight_decay": 0.001,
            "sched": {
                "name": "CosineAnnealing",
                "warmup_steps": 2000,
            },
        },
    },
    "trainer": {
        "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
        "devices": 1,
        "max_epochs": 60,
        "precision": "bf16",
    },
}


# --- 1. PyTorch Lightning trainer ---
# Built from config['trainer'] with its own logger and checkpointing disabled.
# NOTE(review): presumably so that exp_manager can attach its own — confirm.
trainer_config = config['trainer']
trainer = pl.Trainer(logger=False, enable_checkpointing=False, **trainer_config)

# --- 2. Experiment manager options ---
# Logging to Weights & Biases, checkpointing on val_wer, and resume flags.
exp_manager_config = {
    "exp_dir": "/shared/KASR/nemo/nemo_experiments",
    "create_wandb_logger": True,
    "wandb_logger_kwargs": {
        "name": "parakeet-kinyarwanda-two-phase-finetune",
        "project": "nemo-asr",
        # Left disabled: 'resume': 'allow', 'id': "2025-06-22_02-42-58"
    },
    "create_checkpoint_callback": True,
    "checkpoint_callback_params": {
        "monitor": "val_wer",  # metric to monitor
        "mode": "min",         # lower is better for an error rate
        "save_top_k": 5,       # keep the five best checkpoints
        "filename": "{epoch}-{step}-{val_wer:.2f}",  # checkpoint names carry the WER
        "verbose": True,
    },
    "resume_if_exists": True,
    "resume_ignore_no_checkpoint": True,
}

# Lightning checkpoint to resume from. Set to None to start without one; the
# next cell checks this variable before setting up the optimiser.
ckpt_path = "/shared/KASR_2/nemo/nemo_experiments/default/2025-06-22_02-42-58/checkpoints/epoch=50-step=168050-val_wer=0.09-last.ckpt"

# Packaged .nemo archive of the same run, used to rebuild the model object.
restore_path = "/shared/KASR_2/nemo/nemo_experiments/default/2025-06-22_02-42-58/checkpoints/default.nemo"
exp_dir = exp_manager(trainer, exp_manager_config)

# --- 3. Load the model ---
# restore_from reads .nemo archives, so it is given `restore_path` rather than
# the .ckpt file. The earlier from_pretrained('nvidia/parakeet-ctc-1.1b') call
# was dropped: its result was overwritten immediately by this restore.
print(f"Restoring model from: {restore_path}")
asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(
    restore_path=restore_path,
    trainer=trainer,
)




In [None]:


# --- 4. Update Model Configuration ---
def _apply_overrides(cfg_node, overrides):
    """Copy every key/value pair of ``overrides`` onto ``cfg_node`` with OmegaConf.update."""
    for key, value in overrides.items():
        OmegaConf.update(cfg_node, key, value)


model_cfg = asr_model.cfg
model_settings = config['model']

# Tokenizer: point the config at the tokenizer directory, then switch the model to it.
model_cfg.tokenizer.dir = model_settings['tokenizer_dir']
asr_model.change_vocabulary(new_tokenizer_dir=model_cfg.tokenizer.dir, new_tokenizer_type='bpe')

# Training data: apply the overrides, then build the loader.
_apply_overrides(model_cfg.train_ds, model_settings['train_ds'])
asr_model.setup_training_data(model_cfg.train_ds)

# Validation data: same two steps.
_apply_overrides(model_cfg.validation_ds, model_settings['validation_ds'])
asr_model.setup_validation_data(model_cfg.validation_ds)

# Optimiser and scheduler parameters.
_apply_overrides(model_cfg.optim, model_settings['optim'])

# The optimiser is only set up here when no checkpoint is being resumed.
if not ckpt_path:
    asr_model.setup_optimization(optim_config=model_cfg.optim)

# Set the model to use greedy decoding strategy during inference
# asr_model.decoding.strategy = 'greedy_batch'


# --- 5. Start Fine-Tuning ---
print("Configuration complete. Starting training...")
asr_model.train()
# Resume from the checkpoint held in `ckpt_path` (defined in the previous
# cell). The call used to hard-code a different file path, which disagreed
# with the `if not ckpt_path` check that decides whether the optimiser is set
# up; with ckpt_path=None the trainer starts without a checkpoint.
trainer.fit(asr_model, ckpt_path=ckpt_path)
print("Fine-tuning complete.")

# --- 6. Save the Final Model ---
# The .nemo archive is written inside the directory returned by exp_manager.
final_model_path = os.path.join(exp_dir, "finetuned_kinyarwanda_model.nemo")
asr_model.save_to(final_model_path)
print(f"Final fine-tuned model saved to: {final_model_path}")



