## Pip Install Requirment

In [None]:
# Requirments:
# pip install requests
!pip install torch
!pip install sacrebleu
!pip install accelerate
!pip install datasets
!pip install transformers
!pip install scipy torchvision
!pip install peft
!pip install backoff

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1


## Load Librarys

In [1]:
"""
finetune Phi-4-multimodal-instruct on an speech task

scipy==1.15.1
peft==0.13.2
backoff==2.2.1
transformers==4.46.1
accelerate==1.3.0
"""

import argparse
import json
import os
from pathlib import Path

import torch
import sacrebleu
from accelerate import Accelerator
from accelerate.utils import gather_object
from datasets import load_dataset
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    BatchFeature,
    Trainer,
    TrainingArguments,
    StoppingCriteria,
    StoppingCriteriaList,
)


  from .autonotebook import tqdm as notebook_tqdm


## Download Common voice and extract

In [None]:
## Download tar file to the current directory
import requests
import tarfile
import os
import shutil
def download_and_extract_tar(url, extract_to='.'):
    # Download the tar file
    response = requests.get(url, stream=True)
    tar_file_path = 'dataset.tar.gz'
    
    with open(tar_file_path, 'wb') as f:
        f.write(response.content)

    # Extract the tar file
    with tarfile.open(tar_file_path, 'r:gz') as tar:
        tar.extractall(path=extract_to)

    # Remove the tar file after extraction
    os.remove(tar_file_path)
# Example usage
url = 'https://storage.googleapis.com/common-voice-prod-prod-datasets/cv-corpus-4-2019-12-10/en.tar.gz?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gke-prod%40moz-fx-common-voice-prod.iam.gserviceaccount.com%2F20250424%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250424T000524Z&X-Goog-Expires=43200&X-Goog-SignedHeaders=host&X-Goog-Signature=49076e5a9f1be99b24d7cf2ef7deabd641cca40829c838875c5c6dcf1bea8818a62199915f5ca1e23a82e6b36875430cce84545db101b59c17a9f25a4dc62795d70b6ac17314ba9537156ad43aec47ed365026e966d2c57e0eae8b5c8a64cbcbb1ab2cd3e096da6d03be5ab62abceec469591ae11f2747a619440e87e2029b3f1f18b42d4ec4ee85ba57339bdda9714f7101be14a50fe533785c4ca4ab8923273b889cc1ac577a7b767009fefeed24f4e63e70e2f18397959d137ca36756560874dc14d3ea296efe7a22d7ff45c5b577fa553fd69ff351e14398c018af9873ec585a025078925dac809a4d7198e3086d3c8c1a6c18abe61374b128d5923ab56f'

download_and_extract_tar(url, extract_to='CommonVoice/EN')

## Data Zip/Unzip

In [None]:
## Extract the zip file
import zipfile
import os
def extract_zip(zip_file_path, extract_to='.'):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
# Example usage
zip_file_path = '/home/kelechi/bio_ramp_project/afrivox-20250424T174821Z-002.zip'
extract_zip(zip_file_path, extract_to='CommonVoice/EN')


## Tran/Test Split

In [3]:
# Load the csv datasets and split into train, test, and validation sets
import pandas as pd
full_dataset = pd.read_csv('/home/kelechi/bio_ramp_project/CommonVoice/EN/afrivox/emnlp_nv_dataset.csv')
#rename speaker_id to client_id
full_dataset.rename(columns={'speaker_id': 'client_id'}, inplace=True)
full_dataset.rename(columns={'audio_path': 'path'}, inplace=True)
full_dataset.rename(columns={'transcription': 'sentence'}, inplace=True)


train_dataset = full_dataset.sample(frac=0.8, random_state=42)
test_dataset = full_dataset.drop(train_dataset.index)
val_dataset = test_dataset.sample(frac=0.5, random_state=42)
test_dataset = test_dataset.drop(val_dataset.index)



train_dataset.to_csv('/home/kelechi/bio_ramp_project/CommonVoice/EN/afrivox/train.csv', index=False)
val_dataset.to_csv('/home/kelechi/bio_ramp_project/CommonVoice/EN/afrivox/validated.csv', index=False)
test_dataset.to_csv('/home/kelechi/bio_ramp_project/CommonVoice/EN/afrivox/test.csv', index=False)




## Training Pipeline code

In [4]:
# import os
# import pandas as pd
# import torch
# from torch.utils.data import Dataset
# from transformers import AutoProcessor, AutoModelForCausalLM, Trainer, TrainingArguments
# from datasets import load_dataset
# from tqdm import tqdm
# from transformers import StoppingCriteria, StoppingCriteriaList
# import torchaudio
# from pathlib import Path
# from accelerate import Accelerator

# Define constants
INSTSRUCTION = {
    "en_zh-CN": "Translate the audio to Mandarin.",
    "en_id": "Translate the audio to Indonesian.",
    "en_sl": "Translate the audio to Slovenian.",
}

ANSWER_SUFFIX = "<|end|><|endoftext|>"
_IGNORE_INDEX = -100
_TRAIN_SIZE = 50000
_EVAL_SIZE = 200

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, processor, data_file, split, lang="en_zh-CN", rank=0, world_size=1):
        """
        Custom Dataset class for loading your dataset instead of CoVoST2.
        Args:
            processor: The processor to handle tokenization and audio feature extraction.
            data_file: Path to the dataset file (CSV, etc.).
            split: Split (train, test, etc.).
            lang: Language pair for translation (e.g., "en_zh-CN").
            rank: Rank for distributed processing.
            world_size: World size for distributed processing.
        """
        # Load your custom dataset (e.g., CSV file)
        self.data = pd.read_csv(data_file)
        
        self.processor = processor
        self.instruction = INSTSRUCTION.get(lang, "Translate the audio.")  # Default instruction

        # For distributed training, shard the dataset if needed
        if world_size > 1:
            # Shard the data (for distributed training)
            start_idx = rank * len(self.data) // world_size
            end_idx = (rank + 1) * len(self.data) // world_size
            self.data = self.data.iloc[start_idx:end_idx]

        self.training = "train" in split

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Load the row corresponding to the idx
        data = self.data.iloc[idx]

        # Construct user message (for instruction)
        user_message = {
            'role': 'user',
            'content': f'<|audio_1|>\n{self.instruction}',
        }
        
        # Construct the prompt by tokenizing the user message
        prompt = self.processor.tokenizer.apply_chat_template(
            [user_message], tokenize=False, add_generation_prompt=True
        )
        
        # Assuming the audio is a file path in 'audio_path', load the audio data
        audio_path = data['audio_path']
        audio_data, sampling_rate = self.load_audio(audio_path)

        # Prepare the input tokens
        inputs = self.processor(text=prompt, audios=[(audio_data, sampling_rate)], return_tensors='pt')

        # Get the translation (target) and prepare the labels
        answer = f"{data['translation']}{ANSWER_SUFFIX}"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
        
        if self.training:
            input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
            labels = torch.full_like(input_ids, _IGNORE_INDEX)
            labels[:, -answer_ids.shape[1]:] = answer_ids
        else:
            input_ids = inputs.input_ids
            labels = answer_ids

        return {
            'input_ids': input_ids,
            'labels': labels,
            'input_audio_embeds': inputs.input_audio_embeds,
            'audio_embed_sizes': inputs.audio_embed_sizes,
        }

    def load_audio(self, audio_path):
        """
        Load the audio file from the given path.
        Replace this with your audio loading logic, using libraries like librosa, torchaudio, etc.
        """
        waveform, sample_rate = torchaudio.load(audio_path)
        return waveform, sample_rate

# Collate function for batching
def covost_collate_fn(batch):
    input_ids_list = []
    labels_list = []
    input_audio_embeds_list = []
    audio_embed_sizes_list = []
    audio_attention_mask_list = []
    for inputs in batch:
        input_ids_list.append(inputs['input_ids'][0])
        labels_list.append(inputs['labels'][0])
        input_audio_embeds_list.append(inputs['input_audio_embeds'])
        audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
        audio_attention_mask_list.append(
            inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
        )

    try:
        input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
        audio_attention_mask = (
            pad_sequence(audio_attention_mask_list, padding_side='right', padding_value=False)
            if len(audio_attention_mask_list) > 1
            else None
        )
    except Exception as e:
        print(e)
        print(input_ids_list)
        print(labels_list)
        raise
    attention_mask = (input_ids != 0).long()
    input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
    audio_embed_sizes = torch.cat(audio_embed_sizes_list)

    return BatchFeature(
        {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'input_audio_embeds': input_audio_embeds,
            'audio_embed_sizes': audio_embed_sizes,
            'audio_attention_mask': audio_attention_mask,
            'input_mode': 2,  # speech mode
        }
    )

# Model creation
def create_model(model_name_or_path, use_flash_attention=False):
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16 if use_flash_attention else torch.float32,
        _attn_implementation='flash_attention_2' if use_flash_attention else 'sdpa',
        trust_remote_code=True,
    ).to('cuda')

    return model

# Evaluation function
@torch.no_grad()
def evaluate(model, processor, eval_dataset, save_path=None, disable_tqdm=False, eval_batch_size=1):
    rank = int(os.environ.get('RANK', 0))
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    model.eval()
    all_generated_texts = []
    all_labels = []

    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_batch_size,
        collate_fn=covost_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=8,
        prefetch_factor=2,
        pin_memory=True,
    )
    stop_tokens = ["<|end|>", processor.tokenizer.eos_token]
    stop_tokens_ids = processor.tokenizer(stop_tokens, add_special_tokens=False, padding="longest", return_tensors="pt")["input_ids"]
    stop_tokens_ids = stop_tokens_ids.to(f'cuda:{local_rank}')

    for inputs in tqdm(
        eval_dataloader, disable=(rank != 0) or disable_tqdm, desc='running eval'
    ):
        stopping_criteria=StoppingCriteriaList([MultipleTokenBatchStoppingCriteria(stop_tokens_ids, batch_size=inputs.input_ids.size(0))])
        inputs = inputs.to(f'cuda:{local_rank}')
        generated_ids = model.generate(
            **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=64,
            stopping_criteria=stopping_criteria,
            num_logits_to_keep=64
        )

        stop_tokens_idx = stopping_criteria[0].stop_tokens_idx.reshape(inputs.input_ids.size(0), -1)[:, 0]

        stop_tokens_idx = torch.where(
            stop_tokens_idx > 0,
            stop_tokens_idx - stop_tokens_ids.shape[-1],
            generated_ids.shape[-1],
        )
        generated_text = [
            processor.decode(_pred_ids[inputs["input_ids"].shape[1] : _stop_tokens_idx], skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for _pred_ids, _stop_tokens_idx in zip(generated_ids, stop_tokens_idx)
        ]
        all_generated_texts.extend(generated_text)
        labels = [processor.decode(_label_ids[_label_ids != 0]).removesuffix(ANSWER_SUFFIX) for _label_ids in inputs["labels"]]
        all_labels.extend(labels)

    all_generated_texts = gather_object(all_generated_texts)
    all_labels = gather_object(all_labels)
    
    if rank == 0:
        assert len(all_generated_texts) == len(all_labels)
        bleu = sacrebleu.corpus_bleu(all_generated_texts, [all_labels])
        print(bleu)
        if save_path:
            with open(save_path, 'w') as f:
                save_dict = {
                    'all_generated_texts': all_generated_texts,
                    'all_labels': all_labels,
                    'score': bleu.score,
                }
                json.dump(save_dict, f)

        return bleu.score
    return None

# Main training script
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='microsoft/Phi-4-multimodal-instruct', help='Model name or path to load from')
    parser.add_argument("--data_file", type=str, default="/home/kelechi/bio_ramp_project/CommonVoice/EN/afrivox", help="Path to the CSV dataset file")
    parser.add_argument("--lang", type=str, default="en_sl", help="Language pair for translation.")
    parser.add_argument('--use_flash_attention', action='store_true', help='Use Flash Attention')
    parser.add_argument('--output_dir', type=str, default='./output/', help='Output directory')
    parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
    parser.add_argument('--batch_size_per_gpu', type=int, default=1, help='Batch size per GPU (adjust this to fit in GPU memory)')
    parser.add_argument('--num_train_epochs', type=int, default=1, help='Number of training epochs')
    parser.add_argument('--learning_rate', type=float, default=4.0e-5, help='Learning rate')
    parser.add_argument('--wd', type=float, default=0.01, help='Weight decay')
    parser.add_argument('--no-tqdm', dest='tqdm', action='store_false', help='Disable tqdm')
    args = parser.parse_args(args=[])

    accelerator = Accelerator()

    with accelerator.local_main_process_first():
        processor = AutoProcessor.from_pretrained(args.model_name_or_path, trust_remote_code=True)
        model = create_model(args.model_name_or_path, use_flash_attention=args.use_flash_attention)

    model.set_lora_adapter('speech')

    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))

def get_data_file(data_dir, split):
    file_map = {
        'train': 'train.csv',
        'test': 'test.csv',
        'validation': 'validation.csv'
    }
    return os.path.join(data_dir, file_map[split])

# Update the dataset initialization
parser = argparse.ArgumentParser()
parser.add_argument('--data_file', type=str, required=True, help="Path to the data directory")
args = parser.parse_args()

train_file = get_data_file(args.data_file, 'train')

test_file = get_data_file(args.data_file, 'test')

train_dataset = CustomDataset(
    processor,
    data_file=train_file,
    split='train',
    lang=args.lang,
)

eval_dataset = CustomDataset(
    processor,
    data_file=test_file,
    split='test',
    lang=args.lang,
)

num_gpus = accelerator.num_processes
print(f'training on {num_gpus} GPUs')
assert (
    args.batch_size % (num_gpus * args.batch_size_per_gpu) == 0
), 'Batch size must be divisible by the number of GPUs'

training_args = TrainingArguments(
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.batch_size_per_gpu,
    gradient_checkpointing=True,
    gradient_accumulation_steps=args.batch_size // (num_gpus * args.batch_size_per_gpu),
    optim='adamw_torch',
    learning_rate=args.learning_rate,
    weight_decay=args.wd,
    max_grad_norm=1.0,
    logging_steps=10,
    output_dir=args.output_dir,
    save_strategy='no',
    save_total_limit=10,
    save_only_model=True,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=covost_collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
trainer.train()
trainer.save_model()
processor.save_pretrained(training_args.output_dir)

if __name__ == '__main__':
    main()


usage: ipykernel_launcher.py [-h] --data_file DATA_FILE
ipykernel_launcher.py: error: the following arguments are required: --data_file


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
import torch
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1e9} GB")
print(f"Reserved memory: {torch.cuda.memory_reserved() / 1e9} GB")

Allocated memory: 0.0 GB
Reserved memory: 0.0 GB


## Other code variant (Do not run)

In [5]:
INSTSRUCTION = {
    "en_zh-CN": "Translate the audio to Mandarin.",
    "en_id": "Translate the audio to Indonesian.",
    "en_sl": "Translate the audio to Slovenian.",
}
TOKENIZER = {
    "en_zh-CN": "zh",
    "en_ja": "ja-mecab",
}
ANSWER_SUFFIX = "<|end|><|endoftext|>"
_IGNORE_INDEX = -100
_TRAIN_SIZE = 50000
_EVAL_SIZE = 200

class MultipleTokenBatchStoppingCriteria(StoppingCriteria):
    """Stopping criteria capable of receiving multiple stop-tokens and handling batched inputs."""

    def __init__(self, stop_tokens: torch.LongTensor, batch_size: int = 1) -> None:
        """Initialize the multiple token batch stopping criteria.

        Args:
            stop_tokens: Stop-tokens.
            batch_size: Batch size.

        """

        self.stop_tokens = stop_tokens
        self.max_stop_tokens = stop_tokens.shape[-1]
        self.stop_tokens_idx = torch.zeros(batch_size, dtype=torch.long, device=stop_tokens.device)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only gather the maximum number of inputs compatible with stop tokens
        # and checks whether generated inputs are equal to `stop_tokens`
        generated_inputs = torch.eq(input_ids[:, -self.max_stop_tokens :].unsqueeze(1), self.stop_tokens)
        equal_generated_inputs = torch.all(generated_inputs, dim=2)

        # Mark the position where a stop token has been produced for each input in the batch,
        # but only if the corresponding entry is not already set
        sequence_idx = torch.any(equal_generated_inputs, dim=1)
        sequence_set_mask = self.stop_tokens_idx == 0
        self.stop_tokens_idx[sequence_idx & sequence_set_mask] = input_ids.shape[-1]

        return torch.all(self.stop_tokens_idx)

class CoVoSTDataset(Dataset):
    def __init__(self, processor, data_dir, split="validated", lang="en_zh-CN", rank=0, world_size=1):
        try:
            print(f"Loading dataset with data_dir={data_dir}, split={split}")
            self.data = load_dataset(
                "csv",
                lang,
                data_dir=data_dir,
                split=split,
                trust_remote_code=True
            )
            print(f"Loaded {split} split with {len(self.data)} examples.")
        except Exception as e:
            print(f"Error loading {split} split: {e}")
            self.data = []

        self.training = "validated" in split
        self.processor = processor
        self.instruction = INSTSRUCTION[lang]

        if world_size > 1:
            self.data = self.data.shard(world_size, rank)


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        {'client_id': '0013037a1d45cc33460806cc3f8ecee9d536c45639ba4cbbf1564f1c051f53ff3c9f89ef2f1bf04badf55b3a2e7654c086f903681a7b6299616cff6f67598eff',
        'file': '{data_dir}/clips/common_voice_en_699711.mp3',
        'audio': {'path': '{data_dir}/clips/common_voice_en_699711.mp3',
        'array': array([-1.28056854e-09, -1.74622983e-09, -1.16415322e-10, ...,
                3.92560651e-10,  6.62794264e-10, -3.89536581e-09]),
        'sampling_rate': 16000},
        'sentence': '"She\'ll be all right."',
        'translation': '她会没事的。',
        'id': 'common_voice_en_699711'}
        """
        data = self.data[idx]
        user_message = {
            'role': 'user',
            'content': '<|audio_1|>\n' + self.instruction,
        }
        prompt = self.processor.tokenizer.apply_chat_template(
            [user_message], tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(text=prompt, audios=[(data["audio"]["array"], data["audio"]["sampling_rate"])], return_tensors='pt')
        
        answer = f"{data['translation']}{ANSWER_SUFFIX}"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
        if  self.training:
            input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
            labels = torch.full_like(input_ids, _IGNORE_INDEX)
            labels[:, -answer_ids.shape[1] :] = answer_ids
        else:
            input_ids = inputs.input_ids
            labels = answer_ids

        return {
            'input_ids': input_ids,
            'labels': labels,
            'input_audio_embeds': inputs.input_audio_embeds,
            'audio_embed_sizes': inputs.audio_embed_sizes,
        }

def pad_sequence(sequences, padding_side='right', padding_value=0):
    """
    Pad a list of sequences to the same length.
    sequences: list of tensors in [seq_len, *] shape
    """
    assert padding_side in ['right', 'left']
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max(len(seq) for seq in sequences)
    batch_size = len(sequences)
    output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
    for i, seq in enumerate(sequences):
        length = seq.size(0)
        if padding_side == 'right':
            output.data[i, :length] = seq
        else:
            output.data[i, -length:] = seq
    return output


def cat_with_pad(tensors, dim, padding_value=0):
    """
    cat along dim, while pad to max for all other dims
    """
    ndim = tensors[0].dim()
    assert all(
        t.dim() == ndim for t in tensors[1:]
    ), 'All tensors must have the same number of dimensions'

    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)

    index = 0
    for t in tensors:
        # Create a slice list where every dimension except dim is full slice
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        # Update only the concat dimension slice
        slices[dim] = slice(index, index + t.shape[dim])

        output[slices] = t
        index += t.shape[dim]

    return output


def covost_collate_fn(batch):
    input_ids_list = []
    labels_list = []
    input_audio_embeds_list = []
    audio_embed_sizes_list = []
    audio_attention_mask_list = []
    for inputs in batch:
        input_ids_list.append(inputs['input_ids'][0])
        labels_list.append(inputs['labels'][0])
        input_audio_embeds_list.append(inputs['input_audio_embeds'])
        audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
        audio_attention_mask_list.append(
            inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
        )

    try:
        input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
        audio_attention_mask = (
            pad_sequence(audio_attention_mask_list, padding_side='right', padding_value=False)
            if len(audio_attention_mask_list) > 1
            else None
        )
    except Exception as e:
        print(e)
        print(input_ids_list)
        print(labels_list)
        raise
    attention_mask = (input_ids != 0).long()
    input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
    audio_embed_sizes = torch.cat(audio_embed_sizes_list)

    return BatchFeature(
        {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'input_audio_embeds': input_audio_embeds,
            'audio_embed_sizes': audio_embed_sizes,
            'audio_attention_mask': audio_attention_mask,
            'input_mode': 2,  # speech mode
        }
    )



def create_model(model_name_or_path, use_flash_attention=False):
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16 if use_flash_attention else torch.float32,
        _attn_implementation='flash_attention_2' if use_flash_attention else 'sdpa',
        trust_remote_code=True,
    ).to('cuda')

    return model


@torch.no_grad()
def evaluate(
    model, processor, eval_dataset, save_path=None, disable_tqdm=False, eval_batch_size=1
):
    rank = int(os.environ.get('RANK', 0))
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    model.eval()
    all_generated_texts = []
    all_labels = []

    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_batch_size,
        collate_fn=covost_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=8,
        prefetch_factor=2,
        pin_memory=True,
    )
    stop_tokens = ["<|end|>", processor.tokenizer.eos_token]
    stop_tokens_ids = processor.tokenizer(stop_tokens, add_special_tokens=False, padding="longest", return_tensors="pt")["input_ids"]
    stop_tokens_ids = stop_tokens_ids.to(f'cuda:{local_rank}')

    for inputs in tqdm(
        eval_dataloader, disable=(rank != 0) or disable_tqdm, desc='running eval'
    ):
        stopping_criteria=StoppingCriteriaList([MultipleTokenBatchStoppingCriteria(stop_tokens_ids, batch_size=inputs.input_ids.size(0))])
        inputs = inputs.to(f'cuda:{local_rank}')
        generated_ids = model.generate(
            **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=16,
            stopping_criteria=stopping_criteria,
        )

        stop_tokens_idx = stopping_criteria[0].stop_tokens_idx.reshape(inputs.input_ids.size(0), -1)[:, 0]

        stop_tokens_idx = torch.where(
            stop_tokens_idx > 0,
            stop_tokens_idx - stop_tokens_ids.shape[-1],
            generated_ids.shape[-1],
        )
        generated_text = [
            processor.decode(_pred_ids[inputs["input_ids"].shape[1] : _stop_tokens_idx], skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for _pred_ids, _stop_tokens_idx in zip(generated_ids, stop_tokens_idx)
        ]
        all_generated_texts.extend(generated_text)
        labels = [processor.decode(_label_ids[_label_ids != 0]).removesuffix(ANSWER_SUFFIX) for _label_ids in inputs["labels"]]
        all_labels.extend(labels)

    all_generated_texts = gather_object(all_generated_texts)
    all_labels = gather_object(all_labels)
    
    if rank == 0:
        assert len(all_generated_texts) == len(all_labels)
        bleu = sacrebleu.corpus_bleu(all_generated_texts, [all_labels])
        print(bleu)
        if save_path:
            with open(save_path, 'w') as f:
                save_dict = {
                    'all_generated_texts': all_generated_texts,
                    'all_labels': all_labels,
                    'score': bleu.score,
                }
                json.dump(save_dict, f)

        return bleu.score
    return None


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='microsoft/Phi-4-multimodal-instruct',
        help='Model name or path to load from',
    )
    parser.add_argument(
        "--common_voice_dir",
        type=str,
        default="/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en",
        help="Unzipped Common Voice Audio dataset directory, refer to https://commonvoice.mozilla.org/en/datasets, version 4.0",
    )
    parser.add_argument(
        "--lang",
        type=str,
        default="en_sl",
        help="Language pair for translation.",
    )
    parser.add_argument('--use_flash_attention', action='store_true', help='Use Flash Attention')
    parser.add_argument('--output_dir', type=str, default='./output/', help='Output directory')
    parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
    parser.add_argument(
        '--batch_size_per_gpu',
        type=int,
        default=4,
        help='Batch size per GPU (adjust this to fit in GPU memory)',
    )
    parser.add_argument(
        '--num_train_epochs', type=int, default=1, help='Number of training epochs'
    )
    parser.add_argument('--learning_rate', type=float, default=4.0e-5, help='Learning rate')
    parser.add_argument('--wd', type=float, default=0.01, help='Weight decay')
    parser.add_argument('--no-tqdm', dest='tqdm', action='store_false', help='Disable tqdm')
    args = parser.parse_args(args=[])

    accelerator = Accelerator()

    with accelerator.local_main_process_first():
        processor = AutoProcessor.from_pretrained(
            args.model_name_or_path,
            trust_remote_code=True,
        )
        model = create_model(
            args.model_name_or_path,
            use_flash_attention=args.use_flash_attention,
        )

    model.set_lora_adapter('speech')


    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    eval_dataset = CoVoSTDataset(processor,
                                 data_dir="/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en",
                                 split="train[:100]",
                                 lang=args.lang,
                                 rank=rank,
                                 world_size=world_size)
    
    train_dataset = CoVoSTDataset(processor,
                                  data_dir="/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en",
                                  split=f'validated[:{_TRAIN_SIZE}]',
                                  lang=args.lang)

    num_gpus = accelerator.num_processes
    print(f'training on {num_gpus} GPUs')
    assert (
        args.batch_size % (num_gpus * args.batch_size_per_gpu) == 0
    ), 'Batch size must be divisible by the number of GPUs'
    gradient_accumulation_steps = args.batch_size // (num_gpus * args.batch_size_per_gpu)

    if args.use_flash_attention:
        fp16 = False
        bf16 = True
    else:
        fp16 = True
        bf16 = False

    # hard coded training args
    training_args = TrainingArguments(
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.batch_size_per_gpu,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim='adamw_torch',
        adam_beta1=0.9,
        adam_beta2=0.95,
        adam_epsilon=1e-7,
        learning_rate=args.learning_rate,
        weight_decay=args.wd,
        max_grad_norm=1.0,
        lr_scheduler_type='linear',
        warmup_steps=50,
        logging_steps=10,
        output_dir=args.output_dir,
        save_strategy='no',
        save_total_limit=10,
        save_only_model=True,
        bf16=bf16,
        fp16=fp16,
        remove_unused_columns=False,
        report_to='none',
        deepspeed=None,
        disable_tqdm=not args.tqdm,
        dataloader_num_workers=4,
        ddp_find_unused_parameters=True,  # for unused SigLIP layers
    )

    # eval before fine-tuning
    out_path = Path(training_args.output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    score = evaluate(
        model,
        processor,
        eval_dataset,
        save_path=out_path / 'eval_before.json',
        disable_tqdm=not args.tqdm,
        eval_batch_size=args.batch_size_per_gpu,
    )
    if accelerator.is_main_process:
        print(f'BLEU Score before finetuning: {score}')

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=covost_collate_fn,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()
    if accelerator.is_main_process:
        processor.save_pretrained(training_args.output_dir)
    accelerator.wait_for_everyone()

    # eval after fine-tuning (load saved checkpoint)
    # first try to clear GPU memory
    del model
    del trainer
    __import__('gc').collect()
    torch.cuda.empty_cache()

    # reload the model for inference
    model = AutoModelForCausalLM.from_pretrained(
        training_args.output_dir,
        torch_dtype=torch.bfloat16 if args.use_flash_attention else torch.float32,
        trust_remote_code=True,
        _attn_implementation='flash_attention_2' if args.use_flash_attention else 'sdpa',
    ).to('cuda')

    score = evaluate(
        model,
        processor,
        eval_dataset,
        save_path=out_path / 'eval_after.json',
        disable_tqdm=not args.tqdm,
        eval_batch_size=args.batch_size_per_gpu,
    )
    if accelerator.is_main_process:
        print(f'BLEU Score after finetuning: {score}')


if __name__ == '__main__':
    main()

  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.44it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.29 GiB. GPU 0 has a total capacity of 23.65 GiB of which 2.10 GiB is free. Including non-PyTorch memory, this process has 21.54 GiB memory in use. Of the allocated memory 20.80 GiB is allocated by PyTorch, and 379.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='microsoft/Phi-4-multimodal-instruct',
        help='Model name or path to load from',
    )
    parser.add_argument(
        "--lang",
        type=str,
        default="en_sl",
        help="Language pair for translation.",
    )
    parser.add_argument('--output_dir', type=str, default='./output/', help='Output directory')
    parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
    parser.add_argument('--batch_size_per_gpu', type=int, default=32, help='Batch size per GPU')
    parser.add_argument('--num_train_epochs', type=int, default=1, help='Number of training epochs')
    parser.add_argument('--learning_rate', type=float, default=4.0e-5, help='Learning rate')
    parser.add_argument('--wd', type=float, default=0.01, help='Weight decay')
    args = parser.parse_args(args=[])

    accelerator = Accelerator()

    with accelerator.local_main_process_first():
        processor = AutoProcessor.from_pretrained(
            args.model_name_or_path,
            trust_remote_code=True,
        )
        model = create_model(
            args.model_name_or_path,
        )

    model.set_lora_adapter('speech')

    # Load the dataset with Pandas and convert to Hugging Face Dataset
    validated_path = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/validated.tsv"
    df = pd.read_csv(validated_path, sep="\t")
    hf_dataset = Dataset.from_pandas(df)

    # Split the dataset into training and evaluation sets
    train_size = int(0.8 * len(hf_dataset))
    eval_size = len(hf_dataset) - train_size
    train_dataset_raw, eval_dataset_raw = hf_dataset.train_test_split(
        test_size=eval_size / len(hf_dataset)
    ).values()

    # Create CoVoSTDataset instances
    train_dataset = CoVoSTDataset(processor, train_dataset_raw, training=True, lang=args.lang)
    eval_dataset = CoVoSTDataset(processor, eval_dataset_raw, training=False, lang=args.lang)

    num_gpus = accelerator.num_processes
    print(f'training on {num_gpus} GPUs')
    gradient_accumulation_steps = args.batch_size // (num_gpus * args.batch_size_per_gpu)

    # Training arguments
    training_args = TrainingArguments(
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.batch_size_per_gpu,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        weight_decay=args.wd,
        output_dir=args.output_dir,
        save_strategy='no',
        remove_unused_columns=False,
        disable_tqdm=True,
    )

    # Evaluate before fine-tuning
    score = evaluate(
        model,
        processor,
        eval_dataset,
        eval_batch_size=args.batch_size_per_gpu,
    )
    print(f'BLEU Score before finetuning: {score}')

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=covost_collate_fn,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

    # Evaluate after fine-tuning
    score = evaluate(
        model,
        processor,
        eval_dataset,
        eval_batch_size=args.batch_size_per_gpu,
    )
    print(f'BLEU Score after finetuning: {score}')


if __name__ == '__main__':
    main()

## Load smaller version into pipeline (do not run)

In [3]:
#%pip install torchaudio

from datasets import Dataset
import torchaudio
import pandas as pd


INSTSRUCTION = {
    "en_zh-CN": "Translate the audio to Mandarin.",
    "en_id": "Translate the audio to Indonesian.",
    "en_sl": "Translate the audio to Slovenian.",
}
TOKENIZER = {
    "en_zh-CN": "zh",
    "en_ja": "ja-mecab",
}
ANSWER_SUFFIX = "<|end|><|endoftext|>"
_IGNORE_INDEX = -100
_TRAIN_SIZE = 50000
_EVAL_SIZE = 200

class MultipleTokenBatchStoppingCriteria(StoppingCriteria):
    """Stopping criteria capable of receiving multiple stop-tokens and handling batched inputs."""

    def __init__(self, stop_tokens: torch.LongTensor, batch_size: int = 1) -> None:
        """Initialize the multiple token batch stopping criteria.

        Args:
            stop_tokens: Stop-tokens.
            batch_size: Batch size.

        """

        self.stop_tokens = stop_tokens
        self.max_stop_tokens = stop_tokens.shape[-1]
        self.stop_tokens_idx = torch.zeros(batch_size, dtype=torch.long, device=stop_tokens.device)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only gather the maximum number of inputs compatible with stop tokens
        # and checks whether generated inputs are equal to `stop_tokens`
        generated_inputs = torch.eq(input_ids[:, -self.max_stop_tokens :].unsqueeze(1), self.stop_tokens)
        equal_generated_inputs = torch.all(generated_inputs, dim=2)

        # Mark the position where a stop token has been produced for each input in the batch,
        # but only if the corresponding entry is not already set
        sequence_idx = torch.any(equal_generated_inputs, dim=1)
        sequence_set_mask = self.stop_tokens_idx == 0
        self.stop_tokens_idx[sequence_idx & sequence_set_mask] = input_ids.shape[-1]

        return torch.all(self.stop_tokens_idx)

def preprocess_dataset(validated_path, base_audio_dir):
    # Load the dataset using Pandas
    df = pd.read_csv(validated_path, sep="\t")

    # Print column names to verify structure
    print("Columns in the dataset:", df.columns)

    # Add audio data to the dataset
    def load_audio(row):
        try:
            # Construct the full path to the audio file
            audio_file_path = os.path.join(base_audio_dir, row["path"])
            waveform, sampling_rate = torchaudio.load(audio_file_path)
            return {"array": waveform.squeeze().numpy(), "sampling_rate": sampling_rate}
        except Exception as e:
            print(f"Error loading audio for file {row['path']}: {e}")
            return {"array": None, "sampling_rate": None}

    # Ensure the "path" column exists
    if "path" not in df.columns:
        raise KeyError("The 'path' column is missing in the dataset. Please check the dataset structure.")

    df["audio"] = df.apply(load_audio, axis=1)

    # Filter out rows where audio could not be loaded
    df = df[df["audio"].apply(lambda x: x["array"] is not None)]

    # Convert to Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df)
    return hf_dataset


class CoVoSTDataset(Dataset):
    def __init__(self, processor, dataset, training=True, lang="en_zh-CN"):
        self.dataset = dataset
        self.training = training
        self.processor = processor
        self.instruction = INSTSRUCTION[lang]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]

        # Debugging: Check data structure
        print(f"Data at index {idx}: {data}")

        # Ensure translation field exists
        assert "translation" in data, f"Missing 'translation' field in data: {data}"
        assert data["translation"], f"Empty 'translation' field in data: {data}"

        user_message = {
            'role': 'user',
            'content': '<|audio_1|>\n' + self.instruction,
        }
        prompt = self.processor.tokenizer.apply_chat_template(
            [user_message], tokenize=False, add_generation_prompt=True
        )

        # Debugging: Check generated prompt
        print(f"Generated prompt: {prompt}")

        try:
            inputs = self.processor(
                text=prompt,
                audios=[(data["audio"]["array"], data["audio"]["sampling_rate"])],
                return_tensors='pt'
            )
        except Exception as e:
            print(f"Error in processor: {e}")
            raise

        answer = f"{data['translation']}{ANSWER_SUFFIX}"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids

        if self.training:
            input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
            labels = torch.full_like(input_ids, _IGNORE_INDEX)
            labels[:, -answer_ids.shape[1]:] = answer_ids
        else:
            input_ids = inputs.input_ids
            labels = answer_ids

        return {
            'input_ids': input_ids,
            'labels': labels,
            'input_audio_embeds': inputs.input_audio_embeds,
            'audio_embed_sizes': inputs.audio_embed_sizes,
        }

def pad_sequence(sequences, padding_side='right', padding_value=0):
    """
    Pad a list of sequences to the same length.
    sequences: list of tensors in [seq_len, *] shape
    """
    assert padding_side in ['right', 'left']
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max(len(seq) for seq in sequences)
    batch_size = len(sequences)
    output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
    for i, seq in enumerate(sequences):
        length = seq.size(0)
        if padding_side == 'right':
            output.data[i, :length] = seq
        else:
            output.data[i, -length:] = seq
    return output


def cat_with_pad(tensors, dim, padding_value=0):
    """
    cat along dim, while pad to max for all other dims
    """
    ndim = tensors[0].dim()
    assert all(
        t.dim() == ndim for t in tensors[1:]
    ), 'All tensors must have the same number of dimensions'

    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)

    index = 0
    for t in tensors:
        # Create a slice list where every dimension except dim is full slice
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        # Update only the concat dimension slice
        slices[dim] = slice(index, index + t.shape[dim])

        output[slices] = t
        index += t.shape[dim]

    return output


def covost_collate_fn(batch):
    input_ids_list = []
    labels_list = []
    input_audio_embeds_list = []
    audio_embed_sizes_list = []
    audio_attention_mask_list = []
    for inputs in batch:
        input_ids_list.append(inputs['input_ids'][0])
        labels_list.append(inputs['labels'][0])
        input_audio_embeds_list.append(inputs['input_audio_embeds'])
        audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
        audio_attention_mask_list.append(
            inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
        )

    try:
        input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
        audio_attention_mask = (
            pad_sequence(audio_attention_mask_list, padding_side='right', padding_value=False)
            if len(audio_attention_mask_list) > 1
            else None
        )
    except Exception as e:
        print(e)
        print(input_ids_list)
        print(labels_list)
        raise
    attention_mask = (input_ids != 0).long()
    input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
    audio_embed_sizes = torch.cat(audio_embed_sizes_list)

    return BatchFeature(
        {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'input_audio_embeds': input_audio_embeds,
            'audio_embed_sizes': audio_embed_sizes,
            'audio_attention_mask': audio_attention_mask,
            'input_mode': 2,  # speech mode
        }
    )



## Model Training Pipeline using smaller version of dataset (do not run)

In [3]:
def create_model(model_name_or_path, use_flash_attention=False):
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16 if use_flash_attention else torch.float32,
        _attn_implementation='flash_attention_2' if use_flash_attention else 'sdpa',
        trust_remote_code=True,
    ).to('cuda')

    return model


@torch.no_grad()
def evaluate(
    model, processor, eval_dataset, save_path=None, disable_tqdm=False, eval_batch_size=1
):
    rank = int(os.environ.get('RANK', 0))
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    model.eval()
    all_generated_texts = []
    all_labels = []

    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_batch_size,
        collate_fn=covost_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=8,
        prefetch_factor=2,
        pin_memory=True,
    )
    stop_tokens = ["<|end|>", processor.tokenizer.eos_token]
    stop_tokens_ids = processor.tokenizer(stop_tokens, add_special_tokens=False, padding="longest", return_tensors="pt")["input_ids"]
    stop_tokens_ids = stop_tokens_ids.to(f'cuda:{local_rank}')

    for inputs in tqdm(
        eval_dataloader, disable=(rank != 0) or disable_tqdm, desc='running eval'
    ):
        stopping_criteria=StoppingCriteriaList([MultipleTokenBatchStoppingCriteria(stop_tokens_ids, batch_size=inputs.input_ids.size(0))])
        inputs = inputs.to(f'cuda:{local_rank}')
        generated_ids = model.generate(
            **inputs, eos_token_id=processor.tokenizer.eos_token_id, max_new_tokens=64,
            stopping_criteria=stopping_criteria,
        )

        stop_tokens_idx = stopping_criteria[0].stop_tokens_idx.reshape(inputs.input_ids.size(0), -1)[:, 0]

        stop_tokens_idx = torch.where(
            stop_tokens_idx > 0,
            stop_tokens_idx - stop_tokens_ids.shape[-1],
            generated_ids.shape[-1],
        )
        generated_text = [
            processor.decode(_pred_ids[inputs["input_ids"].shape[1] : _stop_tokens_idx], skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for _pred_ids, _stop_tokens_idx in zip(generated_ids, stop_tokens_idx)
        ]
        all_generated_texts.extend(generated_text)
        labels = [processor.decode(_label_ids[_label_ids != 0]).removesuffix(ANSWER_SUFFIX) for _label_ids in inputs["labels"]]
        all_labels.extend(labels)

    all_generated_texts = gather_object(all_generated_texts)
    all_labels = gather_object(all_labels)
    
    if rank == 0:
        assert len(all_generated_texts) == len(all_labels)
        bleu = sacrebleu.corpus_bleu(all_generated_texts, [all_labels])
        print(bleu)
        if save_path:
            with open(save_path, 'w') as f:
                save_dict = {
                    'all_generated_texts': all_generated_texts,
                    'all_labels': all_labels,
                    'score': bleu.score,
                }
                json.dump(save_dict, f)

        return bleu.score
    return None

def main():
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Suppress tokenizer parallelism warning

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='microsoft/Phi-4-multimodal-instruct',
        help='Model name or path to load from',
    )
    parser.add_argument(
        "--lang",
        type=str,
        default="en_sl",
        help="Language pair for translation.",
    )
    parser.add_argument('--output_dir', type=str, default='./output/', help='Output directory')
    parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
    parser.add_argument('--batch_size_per_gpu', type=int, default=32, help='Batch size per GPU')
    parser.add_argument('--num_train_epochs', type=int, default=1, help='Number of training epochs')
    parser.add_argument('--learning_rate', type=float, default=4.0e-5, help='Learning rate')
    parser.add_argument('--wd', type=float, default=0.01, help='Weight decay')
    parser.add_argument('--no-tqdm', dest='tqdm', action='store_false', help='Disable tqdm')
    args = parser.parse_args(args=[])

    accelerator = Accelerator()

    with accelerator.local_main_process_first():
        processor = AutoProcessor.from_pretrained(
            args.model_name_or_path,
            trust_remote_code=True,
        )
        model = create_model(
            args.model_name_or_path,
        )

    model.set_lora_adapter('speech')

    # Preprocess the dataset
    validated_path = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/validated.tsv"
    base_audio_dir = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/clips"
    hf_dataset = preprocess_dataset(validated_path, base_audio_dir)

    # Split the dataset into training and evaluation sets
    train_size = int(0.8 * len(hf_dataset))
    eval_size = len(hf_dataset) - train_size
    train_dataset_raw, eval_dataset_raw = hf_dataset.train_test_split(
        test_size=eval_size / len(hf_dataset)
    ).values()

    # Create CoVoSTDataset instances
    train_dataset = CoVoSTDataset(processor, train_dataset_raw, training=True, lang=args.lang)
    eval_dataset = CoVoSTDataset(processor, eval_dataset_raw, training=False, lang=args.lang)

    num_gpus = accelerator.num_processes
    print(f'training on {num_gpus} GPUs')
    gradient_accumulation_steps = args.batch_size // (num_gpus * args.batch_size_per_gpu)

    # Training arguments
    training_args = TrainingArguments(
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.batch_size_per_gpu,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        weight_decay=args.wd,
        output_dir=args.output_dir,
        save_strategy='no',
        remove_unused_columns=False,
        disable_tqdm=True,
    )

    # Evaluate before fine-tuning
    score = evaluate(
        model,
        processor,
        eval_dataset,
        eval_batch_size=args.batch_size_per_gpu,
    )
    print(f'BLEU Score before finetuning: {score}')

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=covost_collate_fn,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

    # Evaluate after fine-tuning
    score = evaluate(
        model,
        processor,
        eval_dataset,
        eval_batch_size=args.batch_size_per_gpu,
    )
    print(f'BLEU Score after finetuning: {score}')


if __name__ == '__main__':
    main()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.29 GiB. GPU 0 has a total capacity of 23.65 GiB of which 1.70 GiB is free. Process 190848 has 21.56 GiB memory in use. Including non-PyTorch memory, this process has 384.00 MiB memory in use. Of the allocated memory 0 bytes is allocated by PyTorch, and 0 bytes is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Verify issues with datasets (do not run)

In [12]:
import os
import pandas as pd

def verify_audio_paths(validated_path, base_audio_dir):
    """
    Verify the existence of audio files specified in the dataset.

    Args:
        validated_path (str): Path to the validated.tsv file.
        base_audio_dir (str): Base directory containing the audio files.

    Returns:
        None
    """
    # Load the dataset using Pandas
    if not os.path.exists(validated_path):
        print(f"Error: The file {validated_path} does not exist.")
        return

    try:
        df = pd.read_csv(validated_path, sep="\t")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Check if the 'path' column exists
    if "path" not in df.columns:
        print("Error: The 'path' column is missing in the dataset.")
        return

    # Verify each audio file
    missing_files = []
    for idx, row in df.iterrows():
        audio_file_path = os.path.join(base_audio_dir, row["path"])
        if not os.path.exists(audio_file_path):
            missing_files.append(audio_file_path)

    # Report results
    if missing_files:
        print(f"Found {len(missing_files)} missing audio files:")
        for file in missing_files[:10]:  # Show only the first 10 missing files
            print(f"  - {file}")
        print("...")
    else:
        print("All audio files are present and valid.")

# Example usage
validated_path = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/validated.tsv"
base_audio_dir = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/clips"

verify_audio_paths(validated_path, base_audio_dir)

All audio files are present and valid.


In [2]:
import os
import pandas as pd
import torchaudio
from datasets import Dataset

def verify_audio_loading(validated_path, base_audio_dir):
    """
    Verify the audio loading logic and structure of the audio field.

    Args:
        validated_path (str): Path to the validated.tsv file.
        base_audio_dir (str): Base directory containing the audio files.

    Returns:
        None
    """
    # Step 1: Load the dataset using Pandas
    if not os.path.exists(validated_path):
        print(f"Error: The file {validated_path} does not exist.")
        return

    try:
        df = pd.read_csv(validated_path, sep="\t")
        print(f"Dataset loaded successfully with {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Step 2: Check if the 'path' column exists
    if "path" not in df.columns:
        print("Error: The 'path' column is missing in the dataset.")
        return

    # Step 3: Verify audio loading for each row
    def load_audio(row):
        try:
            # Construct the full path to the audio file
            audio_file_path = os.path.join(base_audio_dir, row["path"])
            waveform, sampling_rate = torchaudio.load(audio_file_path)
            return {"array": waveform.squeeze().numpy(), "sampling_rate": sampling_rate}
        except Exception as e:
            print(f"Error loading audio for file {row['path']}: {e}")
            return {"array": None, "sampling_rate": None}

    # Step 4: Apply the audio loading logic
    print("Loading audio files...")
    df["audio"] = df.apply(load_audio, axis=1)

    # Step 5: Check for invalid audio fields
    invalid_audio_rows = df[df["audio"].apply(lambda x: x["array"] is None or x["sampling_rate"] is None)]
    if not invalid_audio_rows.empty:
        print(f"Found {len(invalid_audio_rows)} rows with invalid audio fields:")
        print(invalid_audio_rows[["path", "audio"]].head(10))  # Show the first 10 invalid rows
    else:
        print("All audio files loaded successfully.")

    # Step 6: Convert to Hugging Face Dataset and verify structure
    print("Converting to Hugging Face Dataset...")
    hf_dataset = Dataset.from_pandas(df)
    

    for idx, row in enumerate(hf_dataset):
        if not isinstance(row["audio"], dict) or "array" not in row["audio"] or "sampling_rate" not in row["audio"]:
            print(f"Invalid audio field at index {idx}: {row['audio']}")

    print("Audio loading verification complete.")

# print the first few rows of the dataset
    print("First few rows of the dataset:")
    print(df.head())

    # Print the column names
    print("\nColumns in the dataset:")
    print(df.columns.tolist())






# Example usage
validated_path = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/validated.tsv"
base_audio_dir = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/clips"

verify_audio_loading(validated_path, base_audio_dir)

Dataset loaded successfully with 249 rows.
Loading audio files...
All audio files loaded successfully.
Converting to Hugging Face Dataset...
Audio loading verification complete.
First few rows of the dataset:
                                           client_id  \
0  116398939d6be70fc5fb532924a130c0adf286ac283499...   
1  24a4da2e8f053a45a0715849c222a40a4b0da9872efb2e...   
2  30849595699bc853c3810a78448acede46888b4e2d0809...   
3  42d53f34c1bc50f7a7c4ed1765a8d1ffeaf5cd441513cc...   
4  436b9e1f9da710d74eb01209f8f269bee70e93cadf2053...   

                           path  \
0  common_voice_en_41923025.mp3   
1  common_voice_en_42356358.mp3   
2  common_voice_en_42165090.mp3   
3  common_voice_en_41921729.mp3   
4  common_voice_en_42528393.mp3   

                                         sentence_id  \
0  f5a2a431746c5229ab696ba0e1a518fe7b26e208ff3b84...   
1  f6f009587d8812c147af1cc05079e1fcd8120c8a98cdf8...   
2  f69afa5e77812e8be0085c874d2a9767323c78ffb43ba6...   
3  f5739acbefdbd3aa

In [None]:

#%pip install torchaudio

from datasets import Dataset
import torchaudio
import pandas as pd


INSTSRUCTION = {
    "en_zh-CN": "Translate the audio to Mandarin.",
    "en_id": "Translate the audio to Indonesian.",
    "en_sl": "Translate the audio to Slovenian.",
}
TOKENIZER = {
    "en_zh-CN": "zh",
    "en_ja": "ja-mecab",
}
ANSWER_SUFFIX = "<|end|><|endoftext|>"
_IGNORE_INDEX = -100
_TRAIN_SIZE = 50000
_EVAL_SIZE = 200

class MultipleTokenBatchStoppingCriteria(StoppingCriteria):
    """Stopping criteria capable of receiving multiple stop-tokens and handling batched inputs."""

    def __init__(self, stop_tokens: torch.LongTensor, batch_size: int = 1) -> None:
        """Initialize the multiple token batch stopping criteria.

        Args:
            stop_tokens: Stop-tokens.
            batch_size: Batch size.

        """

        self.stop_tokens = stop_tokens
        self.max_stop_tokens = stop_tokens.shape[-1]
        self.stop_tokens_idx = torch.zeros(batch_size, dtype=torch.long, device=stop_tokens.device)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only gather the maximum number of inputs compatible with stop tokens
        # and checks whether generated inputs are equal to `stop_tokens`
        generated_inputs = torch.eq(input_ids[:, -self.max_stop_tokens :].unsqueeze(1), self.stop_tokens)
        equal_generated_inputs = torch.all(generated_inputs, dim=2)

        # Mark the position where a stop token has been produced for each input in the batch,
        # but only if the corresponding entry is not already set
        sequence_idx = torch.any(equal_generated_inputs, dim=1)
        sequence_set_mask = self.stop_tokens_idx == 0
        self.stop_tokens_idx[sequence_idx & sequence_set_mask] = input_ids.shape[-1]

        return torch.all(self.stop_tokens_idx)

def preprocess_dataset(validated_path, base_audio_dir):
    # Load the dataset using Pandas
    df = pd.read_csv(validated_path, sep="\t")

    # Print column names to verify structure
    print("Columns in the dataset:", df.columns)

    # Add audio data to the dataset
    def load_audio(row):
        try:
            # Construct the full path to the audio file
            audio_file_path = os.path.join(base_audio_dir, row["path"])
            waveform, sampling_rate = torchaudio.load(audio_file_path)
            return {"array": waveform.squeeze().numpy(), "sampling_rate": sampling_rate}
        except Exception as e:
            print(f"Error loading audio for file {row['path']}: {e}")
            return {"array": None, "sampling_rate": None}

    # Ensure the "path" column exists
    if "path" not in df.columns:
        raise KeyError("The 'path' column is missing in the dataset. Please check the dataset structure.")

    df["audio"] = df.apply(load_audio, axis=1)

    # Filter out rows where audio could not be loaded
    df = df[df["audio"].apply(lambda x: x["array"] is not None)]

    # Convert to Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df)
    return hf_dataset


class CoVoSTDataset(Dataset):
    def __init__(self, processor, dataset, training=True, lang="en_zh-CN"):
        self.dataset = dataset  # Renamed from self.data to self.dataset
        self.training = training
        self.processor = processor
        self.instruction = INSTSRUCTION[lang]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        user_message = {
            'role': 'user',
            'content': '<|audio_1|>\n' + self.instruction,
        }
        prompt = self.processor.tokenizer.apply_chat_template(
            [user_message], tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(
            text=prompt,
            audios=[(data["path"]["array"], data["path"]["sampling_rate"])],
            return_tensors='pt'
        )

        answer = f"{data['translation']}{ANSWER_SUFFIX}"
        answer_ids = self.processor.tokenizer(answer, return_tensors='pt').input_ids
        if self.training:
            input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
            labels = torch.full_like(input_ids, _IGNORE_INDEX)
            labels[:, -answer_ids.shape[1]:] = answer_ids
        else:
            input_ids = inputs.input_ids
            labels = answer_ids

        return {
            'input_ids': input_ids,
            'labels': labels,
            'input_audio_embeds': inputs.input_audio_embeds,
            'audio_embed_sizes': inputs.audio_embed_sizes,
        }

def pad_sequence(sequences, padding_side='right', padding_value=0):
    """
    Pad a list of sequences to the same length.
    sequences: list of tensors in [seq_len, *] shape
    """
    assert padding_side in ['right', 'left']
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max(len(seq) for seq in sequences)
    batch_size = len(sequences)
    output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
    for i, seq in enumerate(sequences):
        length = seq.size(0)
        if padding_side == 'right':
            output.data[i, :length] = seq
        else:
            output.data[i, -length:] = seq
    return output


def cat_with_pad(tensors, dim, padding_value=0):
    """
    cat along dim, while pad to max for all other dims
    """
    ndim = tensors[0].dim()
    assert all(
        t.dim() == ndim for t in tensors[1:]
    ), 'All tensors must have the same number of dimensions'

    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)

    index = 0
    for t in tensors:
        # Create a slice list where every dimension except dim is full slice
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        # Update only the concat dimension slice
        slices[dim] = slice(index, index + t.shape[dim])

        output[slices] = t
        index += t.shape[dim]

    return output


def covost_collate_fn(batch):
    input_ids_list = []
    labels_list = []
    input_audio_embeds_list = []
    audio_embed_sizes_list = []
    audio_attention_mask_list = []
    for inputs in batch:
        input_ids_list.append(inputs['input_ids'][0])
        labels_list.append(inputs['labels'][0])
        input_audio_embeds_list.append(inputs['input_audio_embeds'])
        audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
        audio_attention_mask_list.append(
            inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
        )

    try:
        input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
        audio_attention_mask = (
            pad_sequence(audio_attention_mask_list, padding_side='right', padding_value=False)
            if len(audio_attention_mask_list) > 1
            else None
        )
    except Exception as e:
        print(e)
        print(input_ids_list)
        print(labels_list)
        raise
    attention_mask = (input_ids != 0).long()
    input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
    audio_embed_sizes = torch.cat(audio_embed_sizes_list)

    return BatchFeature(
        {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'input_audio_embeds': input_audio_embeds,
            'audio_embed_sizes': audio_embed_sizes,
            'audio_attention_mask': audio_attention_mask,
            'input_mode': 2,  # speech mode
        }
    )

How d

In [4]:
import torchaudio
import os

audio_file_path = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/clips/common_voice_en_41923025.mp3"

try:
    waveform, sampling_rate = torchaudio.load(audio_file_path)
    print(f"Audio loaded successfully! Sampling rate: {sampling_rate}")
except Exception as e:
    print(f"Error loading audio: {e}")

Audio loaded successfully! Sampling rate: 32000


In [17]:
validated_path = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/validated.tsv"
base_audio_dir = "/home/kelechi/bio_ramp_project/cv-corpus-21.0-delta-2025-03-14/en/clips"


hf_dataset = preprocess_dataset(validated_path, base_audio_dir)
for idx, row in enumerate(hf_dataset):
    if not isinstance(row["audio"], dict) or "array" not in row["audio"] or "sampling_rate" not in row["audio"]:
        print(f"Invalid audio field at index {idx}: {row}")

Columns in the dataset: Index(['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain',
       'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant',
       'locale', 'segment'],
      dtype='object')


In [14]:
inputs = self.processor(
    text=prompt,
    audios=[(data["audio"]["array"], data["audio"]["sampling_rate"])],
    return_tensors='pt'
)

NameError: name 'self' is not defined