In [1]:
!pip install datasets transformers huggingface_hub torchaudio librosa jiwer -q



In [53]:
import os
import torch
import datasets
from datasets import Dataset, DatasetDict, Audio
import random
import json
import librosa
import pandas as pd
from IPython.display import display, HTML

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Feb 27 02:18:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0 Off |                  Off |
|  0%   37C    P8             11W /  450W |       1MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## 1. Load in data and convert to a Dataset for training

In [4]:
audio_data = datasets.load_dataset("csv", data_files="cv-valid-train.csv")
audio_data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'duration'],
        num_rows: 195776
    })
})

#### Common Voice dataset provides a lot more information about each audio file such as accent. However, I have decided to only use the `'filename'` and the `'text'` for fine tuning.

In [6]:
audio_data = audio_data.remove_columns(["accent", "age", "up_votes", "down_votes", "gender", "duration"])

In [7]:
## writing a function to display a random sample of the dataset
from datasets import ClassLabel

def show_random_elements(dataset, num_examples):
    assert num_examples <= len(dataset), "Number of samples more than length of dataset"
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset) - 1)
        picks.append(pick)

    sample_df = pd.DataFrame(dataset[picks])
    display(HTML(sample_df.to_html()))

In [15]:
show_random_elements(audio_data["train"].remove_columns(["filename"]), num_examples=10)

Unnamed: 0,text
0,look at the bottle you took the powder from
1,the boy was sad as he left her that day
2,his soul must be too primitive to understand those things he thought
3,how do you think
4,in a weeks time we're going to america
5,tomorrow morning at ten o'clock
6,here i am between my flock and my treasure the boy thought
7,it was dropping off in flakes and raining down on the sand
8,we are obliged at least once in our lives to visit the holy city of mecca
9,you can't believe it's not butter


## 2. Prepare Dataset

In [28]:
# check if dataset contains special characters
import re

def check_special_char(dataset):
    for text in dataset['text']:
        regex = re.compile('[@_!#$%^&*()<>?/\|}{~:]')
        if regex.search(text) == None:
            continue
        else:
            print(f"This text contains special characters: {text}")
            break
    return "Text does not contain special characters"

In [29]:
check_special_char(audio_data['train'])

'Text does not contain special characters'

#### The transcriptions look very clean and it seems a lot like dialogues. We can see that the transcriptions do not contains special characters such as `,.?!;:`. Since I will not be using a language model, it wont be hard to classify speech chunks in this dataset since they do no have these special characters which relate to characteristic sounds like excitement and confusion

In [24]:
def check_uppercase(dataset):
    for text in dataset['text']:
        for char in text:
            if char.isupper():
                print(text)
                return

    return "All the text are already in lowercase"
check_uppercase(audio_data['train'])

'All the text are already in lowercase'

#### There is no need to normalize the text as the text has already been normalized to lowercase

## 3. Prepare Tokenizer

#### In the transformers library, Wav2Vec2 model is accompanied by a `Wav2Vec2CTCTokenizer`, which helps to process the model's output format to text. The fine-tuned Wav2Vec2 needs to map a sequence of context representaitons to the correct transcription, hence a linear layer needs to be added on top. The linear layer is used to classify context representations to a token class.

#### The output size of this layer should be equal to the number of tokens in the vocabulary from the dataset used for fine-tuning.

#### I have decided to use the CTC algorithm and it classifies the speech chunks into letters, so I extract all the unique letters and build the vocabulary.

In [31]:
def create_vocab(batch):
    all_text = " ".join(batch['text'])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocab = audio_data.map(create_vocab, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=audio_data.column_names["train"])

Map:   0%|          | 0/195776 [00:00<?, ? examples/s]

In [32]:
vocab

DatasetDict({
    train: Dataset({
        features: ['vocab', 'all_text'],
        num_rows: 1
    })
})

In [35]:
vocab_list = list(set(vocab['train']['vocab'][0]))

# visualize the vocab
vocab_dict = {letter: number for number, letter in enumerate(vocab_list)}
vocab_dict

{"'": 0,
 's': 1,
 'e': 2,
 'q': 3,
 'o': 4,
 'w': 5,
 'b': 6,
 'l': 7,
 'x': 8,
 'z': 9,
 'u': 10,
 'y': 11,
 'n': 12,
 'r': 13,
 'a': 14,
 'c': 15,
 'p': 16,
 'd': 17,
 'm': 18,
 't': 19,
 'h': 20,
 'f': 21,
 'j': 22,
 'i': 23,
 ' ': 24,
 'k': 25,
 'v': 26,
 'g': 27}

#### All letters in the alphabet occur in the dataset and there is also the special characters `" "` and `'`. I did not remove these special characters because:
- the model need to predict when a words has finished or else it would just be a sequence of characters with no clear distinction between the words.
- the  `'` character helps to give a difference in meaning between it's and its

To make it clear that the space character has its own token class, i give it a more visible character: `|`

In [37]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

##### I also add a `"UNK"` and `"PAD"` token to help the model deal with characters not encounterd in the trainig set and to have "blank token" for CTC algorithm

In [38]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

30


In [40]:
# save vocab as json file
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [43]:
# use json file to create an object of Wav2Vec2CTCTokenizer class
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

## 4. Create Feature Extractor

#### Wav2Vec2 was pretrained and fine-tuned on 960 hours of Librispeech which was sampled at 16kHz, hence we have to make sure the pseech input is also sampled at 16kHz.

#### Common Voice is sampled at 48kHz, thus it has to downsampled to 16kHz for training. I downsampled the training data 16kHz below

In [46]:
# feature extractor pipeline
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
# featur extractor and tokenizer wrapped into single class
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

#### Preprocess Data

In [47]:
audio_data

DatasetDict({
    train: Dataset({
        features: ['filename', 'text'],
        num_rows: 195776
    })
})

In [50]:
AUDIO_DIR = "./cv-valid-train"
audio_data['train'] = audio_data['train'].map(lambda x: {'filename': os.path.join(AUDIO_DIR, x['filename'])})
audio_data['train']['filename'][0]

Map:   0%|          | 0/195776 [00:00<?, ? examples/s]

'./cv-valid-train/cv-valid-train/sample-000000.mp3'

In [57]:
def resample_audio(dataset):
    # downsample to 16kHz
    audio_input, sr = librosa.load(dataset['filename'], sr=16000)
    dataset['audio'] = audio_input
    return dataset

In [59]:
audio_data['train'] = audio_data['train'].map(resample_audio)

Map:   0%|          | 0/195776 [00:00<?, ? examples/s]

  audio_input, sr = librosa.load(dataset['filename'], sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: './cv-valid-train/cv-valid-train/sample-073801.mp3'

In [None]:
# split into train and validation set
audio_data.train_test_split(test_size=0.3)
train_set = audio_data['train']
eval_set = audio_data['test']