In [1]:
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
!pip install jinja2

Collecting transformers
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading

In [2]:
from huggingface_hub import login
login(token="") # secret token

In [None]:
from datasets import load_dataset

# Load the dataset
dataset_nep = load_dataset("openslr/openslr", "SLR54", split="train", trust_remote_code=True)

README.md:   0%|          | 0.00/42.9k [00:00<?, ?B/s]

openslr.py:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/157905 [00:00<?, ? examples/s]

In [4]:
dataset_nep

Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 157905
})

In [5]:
split_data = dataset_nep.train_test_split(test_size=0.2, seed=42)  # 20% test set
dataset = split_data['train']
test = split_data['test']

In [6]:
split_data

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'sentence'],
        num_rows: 126324
    })
    test: Dataset({
        features: ['path', 'audio', 'sentence'],
        num_rows: 31581
    })
})

In [7]:
import pandas as pd
import numpy as np

In [8]:
data = []
num_examples_to_load = 2200

for i, example in enumerate(dataset):
    # Calculate the duration
    duration = example['audio']['array'].shape[0] / example['audio']['sampling_rate']
    length = len(example.get('sentence', ''))

    if (1 <= duration <= 5) and length >= 3:
        data.append({
            'path': example['path'],
            'audio_array': example['audio']['array'],
            'sampling_rate': example['audio']['sampling_rate'],
            'duration': duration,
            'text': example.get('sentence', ''),
        })

    if len(data) >= num_examples_to_load:
        break

# Convert the list to a DataFrame
df_10 = pd.DataFrame(data)

In [9]:
data_test = []
num_examples_to_load = 200

for i, example in enumerate(test):
    # Calculate the duration
    duration = example['audio']['array'].shape[0] / example['audio']['sampling_rate']
    length = len(example.get('sentence', ''))

    if (1 <= duration <= 5) and length >= 3:
        data_test.append({
            'path': example['path'],
            'audio_array': example['audio']['array'],
            'sampling_rate': example['audio']['sampling_rate'],
            'duration': duration,
            'text': example.get('sentence', ''),
        })

    if len(data_test) >= num_examples_to_load:
        break

# Convert the list to a dataframe
test = pd.DataFrame(data_test)

In [10]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_10)
test = Dataset.from_pandas(test)

In [11]:
print(dataset[0]["sampling_rate"])
print(test)

48000
Dataset({
    features: ['path', 'audio_array', 'sampling_rate', 'duration', 'text'],
    num_rows: 200
})


In [9]:
# # Save datasets
# dataset.save_to_disk('dataset_direc')
# test.save_to_disk('test_direc')

# # Compress the directories for download
# !zip -r dataset_direc.zip dataset_direc
# !zip -r test_direc.zip test_direc


In [12]:
import torch
import torchaudio

def resample_audio(batch):
    resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)
    batch['audio_array'] = [resampler(torch.tensor(audio, dtype=torch.float32)).numpy() if len(audio) > 0 else np.array([], dtype=np.float32) for audio in batch['audio_array']]  # Directly create tensor from audio
    batch['sampling_rate'] = [16000] * len(batch['audio_array'])
    return batch

# Apply the resampling function
dataset = dataset.map(resample_audio, batched=True)
test = test.map(resample_audio, batched=True)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [13]:
print(test[2]["sampling_rate"])

16000


In [14]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

In [15]:
dataset = dataset.map(remove_special_characters)
test = test.map(remove_special_characters)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [16]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [17]:
dataset

Dataset({
    features: ['path', 'audio_array', 'sampling_rate', 'duration', 'text'],
    num_rows: 2200
})

In [18]:
import datasets
vocabs_dataset = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=["path", "audio_array", "sampling_rate","duration", "text"])
vocabs_test = test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=["path", "audio_array", "sampling_rate","duration", "text"])
vocabs = datasets.concatenate_datasets([vocabs_dataset, vocabs_test])

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [19]:
len(vocabs[0]["vocab"])

72

In [20]:
vocabs

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 2
})

In [21]:
vocab_list = list(set(vocabs["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'आ': 0,
 'ड': 1,
 'ै': 2,
 'ध': 3,
 'ि': 4,
 'ढ': 5,
 'ो': 6,
 'ू': 7,
 'ल': 8,
 'च': 9,
 'न': 10,
 'श': 11,
 'ङ': 12,
 'ट': 13,
 'प': 14,
 'ऐ': 15,
 '\u200c': 16,
 'य': 17,
 'औ': 18,
 'त': 19,
 'र': 20,
 'ब': 21,
 'व': 22,
 'ज': 23,
 'ँ': 24,
 '०': 25,
 '।': 26,
 '४': 27,
 'छ': 28,
 'ा': 29,
 'इ': 30,
 'उ': 31,
 'ऊ': 32,
 'ु': 33,
 'स': 34,
 '३': 35,
 'े': 36,
 'अ': 37,
 '१': 38,
 'द': 39,
 '८': 40,
 '्': 41,
 '६': 42,
 '७': 43,
 '\u200d': 44,
 'ञ': 45,
 'झ': 46,
 'ः': 47,
 'ं': 48,
 '५': 49,
 'म': 50,
 'ी': 51,
 'थ': 52,
 'ष': 53,
 'फ': 54,
 'घ': 55,
 'ृ': 56,
 'भ': 57,
 'ख': 58,
 'ग': 59,
 '२': 60,
 'ऋ': 61,
 'ह': 62,
 'ण': 63,
 'ओ': 64,
 'ठ': 65,
 'ई': 66,
 ' ': 67,
 '९': 68,
 'क': 69,
 'ौ': 70,
 'ए': 71}

In [22]:
del vocab_dict['\u200c']
del vocab_dict['\u200d']

In [23]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

72


In [24]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [25]:
from transformers import (
    WhisperTokenizer,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [53]:
model_id = 'openai/whisper-small'
out_dir = 'whisper_np_librispeech'
epochs = 30
batch_size = 8

In [27]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
tokenizer = WhisperTokenizer.from_pretrained(model_id, language='Nepali', task='transcribe')
processor = WhisperProcessor.from_pretrained(model_id, language='Nepali', task='transcribe')

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [28]:
import random

rand_int = random.randint(0, len(dataset)-1)

print("Target text:", dataset[rand_int]["text"])
print("Input array shape:", np.asarray(dataset[rand_int]["audio_array"]).shape)
print("Sampling rate:", dataset[rand_int]["sampling_rate"])

Target text: हरेक दुई महिनाको 
Input array shape: (44800,)
Sampling rate: 16000


In [36]:
import librosa
def prepare_dataset(batch):
  audio = batch
  batch['input_features'] = feature_extractor(audio['audio_array'], sampling_rate=audio['sampling_rate']).input_features[0]
  batch['labels'] =  tokenizer(batch['text']).input_ids
  return batch



In [35]:
dataset

Dataset({
    features: ['path', 'audio_array', 'sampling_rate', 'duration', 'text'],
    num_rows: 2200
})

In [41]:
dataset_preprocessed = dataset.map(prepare_dataset, remove_columns=["path", "audio_array", "duration", "sampling_rate"], num_proc=1)
test_preprocessed = test.map(prepare_dataset, remove_columns=["path", "audio_array", "duration", "sampling_rate"], num_proc=1)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [42]:
dataset_preprocessed[0]['labels']

[50258,
 50313,
 50359,
 50363,
 3941,
 114,
 17937,
 3941,
 244,
 17937,
 31970,
 44500,
 46758,
 17937,
 3941,
 231,
 3941,
 223,
 3941,
 249,
 220,
 50257]

In [44]:
dataset[10]['audio_arrayarray']

[-0.0004793127591256052,
 0.0009873341768980026,
 -0.0003688646829687059,
 -0.000492804916575551,
 0.0014002175303176045,
 0.0011864983243867755,
 0.0012432164512574673,
 0.0004993958282284439,
 0.0005047359154559672,
 0.0003564493963494897,
 -0.0014362502843141556,
 -0.000837713829241693,
 -0.0005206537898629904,
 -0.001997085055336356,
 -5.238356243353337e-05,
 0.00041034710011444986,
 -0.0006329532479867339,
 -0.0012253281893208623,
 0.0005767238326370716,
 0.0015554423443973064,
 -0.0007743315654806793,
 0.0009208635310642421,
 0.0008895814535208046,
 0.0014187541091814637,
 0.004489267244935036,
 0.002324931789189577,
 0.0026567992754280567,
 0.004717953037470579,
 0.003074625739827752,
 0.003302796045318246,
 0.00457329535856843,
 0.004614206030964851,
 0.00378182390704751,
 0.0019242517882958055,
 0.003515739692375064,
 0.0047553968615829945,
 0.0008548519108444452,
 0.0009690629667602479,
 0.0030711181461811066,
 0.002405429957434535,
 0.002181709511205554,
 0.00176573195494711

In [43]:
input_str = dataset_preprocessed[0]["text"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 शाखा कहलाउँछ 
Decoded w/ special:    <|startoftranscript|><|ne|><|transcribe|><|notimestamps|>शाखा कहलाउँछ <|endoftext|>
Decoded w/out special: शाखा कहलाउँछ 
Are equal:             True


In [44]:
model = WhisperForConditionalGeneration.from_pretrained(model_id)

model.generation_config.task = 'transcribe'
model.generation_config.language = 'nepali'
model.generation_config.forced_decoder_ids = None

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [45]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [46]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [48]:
import evaluate

In [49]:
metric = evaluate.load('wer')

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [56]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-np-libri",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=0.00001,
    warmup_steps=500,
    bf16=False,
    fp16=True,
    num_train_epochs=epochs,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    predict_with_generate=True,
    generation_max_length=225,
    report_to=['tensorboard'],
    load_best_model_at_end=True,
    metric_for_best_model='wer',
    greater_is_better=False,
    dataloader_num_workers=2,
    save_total_limit=2,
    lr_scheduler_type='constant',
    seed=42,
    data_seed=42
)



In [57]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_preprocessed,
    eval_dataset=test_preprocessed,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [58]:
import wandb
wandb.login(key="d65bb5260632ef5809c7af7276c38ff76b160d31")
trainer.train()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,Wer
1,0.5015,0.351222,63.859649
2,0.2081,0.293922,51.754386
3,0.098,0.302672,52.45614
4,0.0512,0.303137,48.421053
5,0.031,0.347723,50.175439
6,0.0236,0.367548,48.947368
7,0.0172,0.372377,51.052632
8,0.0161,0.377478,48.421053
9,0.0133,0.3761,48.245614
10,0.0112,0.396828,53.859649


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


KeyboardInterrupt: 