In [1]:
!pip install datasets -q
!pip install transformers -q
!pip install jiwer -q

In [2]:
from huggingface_hub import login
login(token="") # your secret token here


In [3]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="train", streaming=True, trust_remote_code=True)
valid = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="validation", streaming=True, trust_remote_code=True)
test = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="test", streaming=True, trust_remote_code=True)



README.md:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

common_voice_17_0.py:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/132k [00:00<?, ?B/s]

In [4]:
dataset=dataset.remove_columns(['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'])
valid=valid.remove_columns(['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'])
test=test.remove_columns(['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'])

In [5]:
import pandas as pd
import numpy as np

In [6]:
data = []
num_examples_to_load = 2200

for i, example in enumerate(dataset):
    # Calculate the duration
    duration = example['audio']['array'].shape[0] / example['audio']['sampling_rate']
    length = len(example.get('sentence', ''))

    if (1 <= duration <= 5) and length >= 3:
        data.append({
            'path': example['path'],
            'audio_array': example['audio']['array'],
            'sampling_rate': example['audio']['sampling_rate'],
            'duration': duration,
            'text': example.get('sentence', ''),
        })

    if len(data) >= num_examples_to_load:
        break

# Convert the list to a DataFrame
df_10 = pd.DataFrame(data)

Reading metadata...: 1101170it [00:27, 39389.91it/s]


In [7]:
data_test = []
num_examples_to_load = 200

for i, example in enumerate(test):
    # Calculate the duration
    duration = example['audio']['array'].shape[0] / example['audio']['sampling_rate']
    length = len(example.get('sentence', ''))

    if (1 <= duration <= 5) and length >= 3:
        data_test.append({
            'path': example['path'],
            'audio_array': example['audio']['array'],
            'sampling_rate': example['audio']['sampling_rate'],
            'duration': duration,
            'text': example.get('sentence', ''),
        })

    if len(data_test) >= num_examples_to_load:
        break

# Convert the list to a dataframe
test = pd.DataFrame(data_test)

Reading metadata...: 16393it [00:00, 23386.41it/s]


In [8]:
data_valid = []
num_examples_to_load = 500

for i, example in enumerate(valid):
    # Calculate the duration
    duration = example['audio']['array'].shape[0] / example['audio']['sampling_rate']
    length = len(example.get('sentence', ''))

    if (1 <= duration <= 5) and length >= 3:
        data_valid.append({
            'path': example['path'],
            'audio_array': example['audio']['array'],
            'sampling_rate': example['audio']['sampling_rate'],
            'duration': duration,
            'text': example.get('sentence', ''),
        })

    if len(data_valid) >= num_examples_to_load:
        break

# Convert the list to a dataframe
valid = pd.DataFrame(data_valid)

Reading metadata...: 16393it [00:00, 27021.45it/s]


In [9]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_10)
valid = Dataset.from_pandas(valid)
test = Dataset.from_pandas(test)

In [10]:
# # Save datasets
# dataset.save_to_disk('dataset_direc')
# test.save_to_disk('test_direc')

# # Compress the directories for download
# !zip -r dataset_direc.zip dataset_direc
# !zip -r test_direc.zip test_direc


In [11]:
import torch
import torchaudio

def resample_audio(batch):
    resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)
    batch['audio_array'] = [resampler(torch.tensor(audio, dtype=torch.float32)).numpy() if len(audio) > 0 else np.array([], dtype=np.float32) for audio in batch['audio_array']]  # Directly create tensor from audio
    batch['sampling_rate'] = [16000] * len(batch['audio_array'])
    return batch

# Apply the resampling function
dataset = dataset.map(resample_audio, batched=True)
valid = valid.map(resample_audio, batched=True)
test = test.map(resample_audio, batched=True)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [12]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

In [13]:
dataset = dataset.map(remove_special_characters)
valid = valid.map(remove_special_characters)
test = test.map(remove_special_characters)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [15]:
import datasets
vocabs_dataset = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=["path", "audio_array", "sampling_rate","duration", "text"])
vocabs_test = valid.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=["path", "audio_array", "sampling_rate","duration", "text"])
vocabs = datasets.concatenate_datasets([vocabs_dataset, vocabs_test])

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [16]:
vocabs

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 2
})

In [17]:
vocab_list = list(set(vocabs["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'r': 0,
 'a': 1,
 's': 2,
 'q': 3,
 ' ': 4,
 't': 5,
 'd': 6,
 'k': 7,
 'z': 8,
 'h': 9,
 '—': 10,
 'n': 11,
 'm': 12,
 'é': 13,
 '‘': 14,
 'y': 15,
 "'": 16,
 'p': 17,
 'g': 18,
 'j': 19,
 'u': 20,
 'c': 21,
 'v': 22,
 'w': 23,
 '’': 24,
 '(': 25,
 'í': 26,
 'b': 27,
 'ê': 28,
 'â': 29,
 'l': 30,
 'i': 31,
 'x': 32,
 'f': 33,
 '“': 34,
 '”': 35,
 ')': 36,
 'o': 37,
 'e': 38}

In [18]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

41


In [19]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [20]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [21]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [22]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [23]:
import random

rand_int = random.randint(0, len(dataset)-1)

print("Target text:", dataset[rand_int]["text"])
print("Input array shape:", np.asarray(dataset[rand_int]["audio_array"]).shape)
print("Sampling rate:", dataset[rand_int]["sampling_rate"])

Target text: now its place in short term forgetting is being questioned 
Input array shape: (66240,)
Sampling rate: 16000


In [24]:
import librosa
def prepare_dataset(batch):
    audio = batch
    audio_arrays = np.array(audio["audio_array"])
    audio["audio_arrayarray"] = librosa.resample(audio_arrays, orig_sr=audio["sampling_rate"], target_sr=16000)
    audio["sampling_rate"] = 16000

    # Process audio input
    batch["input_values"] = processor(audio["audio_array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Process text labels (ensure correct tokenization)
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids

    # Check if the labels exceed the vocab size and adjust if necessary
    max_vocab_size = len(vocab_dict)

    # Ensure labels are within vocab range
    batch["labels"] = [min(label, max_vocab_size - 1) for label in batch["labels"]]
    return batch



In [25]:
dataset = dataset.map(prepare_dataset, remove_columns=["path" , "audio_array", "duration", "sampling_rate", "text"], num_proc=1)
valid = valid.map(prepare_dataset, remove_columns=["path" , "audio_array", "duration", "sampling_rate", "text"], num_proc=1)
test = test.map(prepare_dataset, remove_columns=["path" , "audio_array", "duration", "sampling_rate", "text"], num_proc=1)

Map:   0%|          | 0/2200 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [26]:
dataset

Dataset({
    features: ['audio_arrayarray', 'input_values', 'labels'],
    num_rows: 2200
})

In [27]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [28]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [29]:
!pip install evaluate -q
import evaluate

# Load the WER metric
wer_metric = evaluate.load("wer")

  pid, fd = os.forkpty()


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [30]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [31]:
from transformers import Wav2Vec2ForCTC

# Load the wav2vec2-large-960h model for speech-to-text (CTC)
model_name = "facebook/wav2vec2-large-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_name,ctc_loss_reduction="mean", pad_token_id=processor.tokenizer.pad_token_id)

# Instead of using resize_token_embeddings, directly update the lm_head
model.lm_head = torch.nn.Linear(in_features=model.lm_head.in_features,
                                 out_features=len(vocab_dict),
                                 bias=True)
# Update the config to reflect the vocabulary size change
model.config.vocab_size = len(vocab_dict)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=r"C:\Result",
  group_by_length=True,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  eval_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-5,
  weight_decay=0.001,
  warmup_steps=1000,
  save_total_limit=2,
)

In [36]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset,
    eval_dataset=valid,
    processing_class=processor.feature_extractor,
)

In [37]:
import wandb
wandb.login(key="97a5d116ab3cc1f54945db8b5e4450e3b5bd9d04")
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
500,2.9952,2.735163,1.0
1000,1.8663,0.725641,0.420022
1500,0.9465,0.532839,0.362416
2000,0.7044,0.533998,0.35123
2500,0.5549,0.516883,0.350112
3000,0.4947,0.488916,0.333613
3500,0.4887,0.493502,0.327181
4000,0.4337,0.480674,0.329978


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.

TrainOutput(global_step=4140, training_loss=1.0399618471302272, metrics={'train_runtime': 8469.2396, 'train_samples_per_second': 7.793, 'train_steps_per_second': 0.489, 'total_flos': 8.324643836112479e+18, 'train_loss': 1.0399618471302272, 'epoch': 30.0})

In [None]:
import pickle

# Save model and processor
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("processor.pkl", "wb") as processor_file:
    pickle.dump(processor, processor_file)

print("Model and processor saved using pickle.")


In [1]:
test_results = trainer.evaluate(test)
print("Test Loss:", test_results["eval_loss"])
print("Test WER:", test_results["eval_wer"])

NameError: name 'trainer' is not defined

In [None]:
def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)
  
  return batch

results = test.map(map_to_result, remove_columns=test.column_names)


In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))


In [None]:
results

In [None]:
show_random_elements(results)