<a href="https://colab.research.google.com/github/khalilurehman-masood/fine_tuning/blob/main/fine_tuning_voice_to_text_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install into the *kernel's* environment (%pip) rather than whatever `pip` the shell resolves.
# datasets/jiwer are pinned to the versions the recorded run resolved; Common Voice 11 is
# loaded through a dataset script, which newer major versions of `datasets` may not support.
%pip install -q torch torchaudio transformers datasets==3.3.2 jiwer==3.1.0 librosa soundfile


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft

In [None]:
# %pip (not !pip) so the packages land in the environment the running kernel imports from.
%pip install -q sentencepiece huggingface_hub




In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor

# Load the pretrained Wav2Vec2 CTC checkpoint that is fine-tuned below.
# NOTE(review): the "960h" checkpoint name suggests an English (LibriSpeech) model rather than an
# Urdu-compatible one — confirm that its tokenizer vocabulary can actually represent Urdu text
# before trusting the training/eval losses.
# The matching processor is created from `model_name` in the following cell.
model_name = "facebook/wav2vec2-large-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_name)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the processor published with the same checkpoint as the model (`model_name`).
# It is used below both for audio feature extraction and, via `processor.tokenizer`, for labels.
processor = Wav2Vec2Processor.from_pretrained(model_name)

In [None]:
from datasets import load_dataset

# Load the Urdu ("ur") configuration of Mozilla Common Voice 11.0 (train and test splits).
# The dataset repository ships its own loading script, so `trust_remote_code=True` is passed
# explicitly; without it the loader stops to ask for confirmation, which breaks an
# unattended "Restart & Run All".
dataset_train = load_dataset(
    "mozilla-foundation/common_voice_11_0", "ur", split="train", trust_remote_code=True
)
dataset_test = load_dataset(
    "mozilla-foundation/common_voice_11_0", "ur", split="test", trust_remote_code=True
)


In [None]:
# Peek at the first raw training example to see the available columns and the audio sampling rate.
dataset_train[0]

{'client_id': 'e53f84d151d6cc6d45a57decde08a99efe47d7751a4ca60e58fb87ea68a35d53dcae445c65d5e73e0449a0b1cf2b4d09f32874877e8786664aa50f1f2ec2b932',
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/5350814842baec1cce17a4cb70aed2f5d8243e8fe4e810ff027157f331f95972/ur_train_0/common_voice_ur_31771683.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/5350814842baec1cce17a4cb70aed2f5d8243e8fe4e810ff027157f331f95972/ur_train_0/common_voice_ur_31771683.mp3',
  'array': array([7.10542736e-14, 7.38964445e-13, 1.08002496e-12, ...,
         1.29391765e-06, 2.22157587e-06, 1.43777788e-06]),
  'sampling_rate': 48000},
 'sentence': 'کبھی کبھار ہی خیالی پلاو بناتا ہوں',
 'up_votes': 2,
 'down_votes': 0,
 'age': 'twenties',
 'gender': 'male',
 'accent': '',
 'locale': 'ur',
 'segment': ''}

In [None]:
import torch, torchaudio

In [None]:
# Function to process audio
def preprocess_data(batch):
    """Convert one dataset example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["sentence"]``, and adds two keys to the same dict:

    * ``"input_values"`` - first item of the processor's ``input_values`` for the
      waveform, resampled to 16 kHz when the source rate differs.
    * ``"labels"`` - ``input_ids`` produced by ``processor.tokenizer`` for the sentence.

    Relies on the notebook-level ``processor`` object. Returns the updated ``batch``.
    """
    target_rate = 16000

    clip = batch["audio"]
    source_rate = clip["sampling_rate"]
    samples = torch.tensor(clip["array"], dtype=torch.float32)

    # Only resample when the clip is not already at the target rate.
    if source_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=target_rate)
        samples = resampler(samples)

    # Audio -> model input features (the processor returns a batch; keep its single item).
    extracted = processor(samples.numpy(), sampling_rate=target_rate)
    batch["input_values"] = extracted.input_values[0]

    # Transcript -> token ids used as CTC targets.
    encoded_text = processor.tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids

    return batch

# Apply preprocessing to both splits and drop the raw columns that are no longer needed,
# leaving `sentence`, `input_values` and `labels`.
unused_columns = ["client_id", "path", "audio", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment"]
dataset_train = dataset_train.map(preprocess_data, remove_columns=unused_columns)
dataset_test = dataset_test.map(preprocess_data, remove_columns=unused_columns)

In [None]:
# Sanity check: (number of rows, number of columns) of the preprocessed training split.
dataset_train.shape

(4129, 3)

In [None]:
# Sanity check: column names and dtypes of the preprocessed training split.
dataset_train.features

{'sentence': Value(dtype='string', id=None),
 'input_values': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads a batch of CTC training examples.

    Audio inputs and label ids have different lengths and need different padding
    methods, so they are padded separately: ``input_values`` through the processor and
    ``labels`` through the processor's tokenizer. Padded label positions are then
    replaced with ``-100``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of`` but applied to the ``labels``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of PyTorch tensors.

        Args:
            features: examples, each a mapping with ``"input_values"`` and ``"labels"`` entries.

        Returns:
            The padded input batch produced by the processor, with an added ``"labels"``
            tensor in which every padded position is set to ``-100``.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly instead of going through the
        # deprecated `processor.as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# Collator handed to the Trainer below; padding=True pads each batch to its longest sequence
# (see the class docstring).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
from transformers import TrainingArguments

# Training configuration.
# - Evaluation runs once per epoch; checkpoints are saved every `save_steps` optimizer steps.
#   (`eval_steps` was dropped: it has no effect while `eval_strategy` is "epoch".)
# - Effective train batch size per device = per_device_train_batch_size * gradient_accumulation_steps = 16.
training_args = TrainingArguments(
    output_dir="./wav2vec2-urdu",
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="steps",  # Save checkpoints periodically (every `save_steps`)
    per_device_train_batch_size=8,  # Adjust based on GPU memory
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Helps if GPU memory is limited
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present; fp16=True fails on CPU
    save_steps=400,
    logging_steps=100,
    learning_rate=3e-4,
    weight_decay=0.005,
    num_train_epochs=3,
    warmup_steps=500,
    save_total_limit=2,  # Keep only the last 2 checkpoints
    report_to="none",  # Avoid logging to WandB
)


In [None]:
from transformers import Trainer

# Wire model, data, collator and training configuration together.
# No `compute_metrics` is passed, so evaluation reports the loss only (no WER/CER).
# NOTE(review): only the feature extractor is given as `processing_class`; presumably the
# tokenizer has to be saved separately if checkpoints should be self-contained — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)


In [None]:
# Run fine-tuning with the configuration above; returns a TrainOutput with step count and metrics.
trainer.train()




Epoch,Training Loss,Validation Loss
1,458.206,188.864517
2,253.8971,218.006439




TrainOutput(global_step=774, training_loss=628.8787548449612, metrics={'train_runtime': 1907.7627, 'train_samples_per_second': 6.493, 'train_steps_per_second': 0.406, 'total_flos': 2.6159546651950817e+18, 'train_loss': 628.8787548449612, 'epoch': 2.990328820116054})

In [None]:
# Final evaluation on the test split (the Trainer's `eval_dataset`); reports loss and throughput only.
# NOTE(review): in the recorded run the validation loss rose between epoch 1 and epoch 2
# (188.86 -> 218.01), so the fine-tuning setup should be re-checked before using this model.
trainer.evaluate()




{'eval_loss': 218.00643920898438,
 'eval_runtime': 178.5228,
 'eval_samples_per_second': 18.496,
 'eval_steps_per_second': 2.313,
 'epoch': 2.990328820116054}