# **PSST spelling-correction dataset generation script**

In [1]:
# Ensure that GPU and RAM is set up: will be needed for training purpose
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Jul 13 14:55:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:AF:00.0 Off |                    0 |
| N/A   53C    P0    29W /  70W |   4801MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Report the machine's total memory so an under-provisioned runtime is noticed
# before a long training run is started.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 201.2 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
# install the libraries
!pip install datasets 
!pip install huggingsound 
!pip install tqdm
!pip install transformers=4.28.0

[0m[31mERROR: Invalid requirement: 'transformers=4.28.0'
Hint: = is not a valid operator. Did you mean == ?[0m[31m
[0m

In [4]:
# import the libraries
import re
from datasets import load_dataset, DatasetDict, Audio
from huggingsound import SpeechRecognitionModel
from tqdm import tqdm
import json
import torch

In [5]:
# Punctuation characters stripped from transcripts.
# Written as a raw string so the backslashes reach the regex engine untouched
# instead of relying on Python passing unknown string escapes such as "\,"
# through (newer interpreters emit a warning for those). The pattern value is
# unchanged.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"]'


def remove_special_characters(batch):
    """Strip punctuation from ``batch["transcript"]`` and append one space.

    Parameters
    ----------
    batch : dict
        A single example containing a string under the key ``"transcript"``.

    Returns
    -------
    dict
        The same ``batch`` object, modified in place: every character matched
        by ``chars_to_ignore_regex`` is removed from the transcript and a
        single trailing space is appended.
    """
    batch["transcript"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]) + " "
    return batch

In [6]:
def save_to_json(data, filename):
    """Serialise ``data`` as JSON (4-space indentation) into ``filename``.

    The target file is opened in write mode, so existing content is replaced.
    Nothing is returned.
    """
    with open(filename, mode='w') as out_handle:
        json.dump(data, fp=out_handle, indent=4)

In [7]:
# load the model and processor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# The CTC model and its processor are both loaded from the same pretrained identifier.
PSST_CHECKPOINT = "monideep2255/finetuning-xlsr-53-PSST_V7"
model = Wav2Vec2ForCTC.from_pretrained(PSST_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(PSST_CHECKPOINT)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Inspection only: evaluating the bare attribute makes the notebook echo the
# bound method's repr, which includes the processor's feature-extractor and
# tokenizer configuration. Nothing is assigned or modified here.
processor.decode

<bound method Wav2Vec2Processor.decode of Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='monideep2255/finetuning-xlsr-53-PSST_V7', vocab_size=46, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<UNK>', 'pad_token': '<PAD>', 'additional_special_tokens': [AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True)]}, clean_up_tokenization_spaces=True)>

In [9]:
# Run inference on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Last expression of the cell, so the notebook also echoes the model's repr.
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen

In [10]:
# Replace the tokenizer's id -> token table so that decoded ids are rendered as
# space-padded phoneme labels (one leading and one trailing space each), which
# keeps adjacent labels separated once the decoded tokens are joined.
#
# Entry 8 was previously '  AE' (two leading spaces, no trailing space). That
# broke the padding convention used by every other phoneme label and could
# leave a run of three spaces in front of AE, which a single
# `.replace('  ', ' ')` pass only shrinks to two. It is now padded like the rest.
processor.tokenizer.decoder = {24: '<???>',
 3: '<PAD>',
 2: '<SIL>',
 18: '<SPN>',
 19: '<UNK>',
 1: ' AA ',
 8: ' AE ',
 6: ' AH ',
 36: ' AO ',
 33: ' AW ',
 17: ' AY ',
 20: ' B ',
 43: ' CH ',
 35: ' D ',
 42: ' DH ',
 10: ' DX ',
 7: ' EH ',
 12: ' ER ',
 44: ' EY ',
 27: ' F ',
 40: ' G ',
 9: ' HH ',
 41: ' IH ',
 14: ' IY ',
 28: ' JH ',
 21: ' K ',
 22: ' L ',
 37: ' M ',
 0: ' N ',
 25: ' NG ',
 16: ' OW ',
 15: ' OY ',
 32: ' P ',
 45: ' R ',
 38: ' S ',
 29: ' SH ',
 5: ' T ',
 31: ' TH ',
 11: ' UH ',
 4: ' UW ',
 34: ' V ',
 30: ' W ',
 39: ' Y ',
 13: ' Z ',
 26: ' ZH ',
 23: '|'}

In [11]:
import librosa
import numpy as np

def prep_training_data(model, dataset):
    """Transcribe every example of ``dataset`` and pair it with its reference.

    Relies on the notebook-level ``processor`` (feature extraction and
    decoding) and ``device`` (where the inputs are moved) objects.

    Parameters
    ----------
    model
        Callable that takes a batch of input values and returns an object
        exposing ``.logits``; it is expected to already live on ``device``.
    dataset
        Iterable of examples. Each example must provide
        ``example["filename_new"]["path"]``, ``"utterance_id"``, ``"prompt"``
        and ``"transcript"``.

    Returns
    -------
    list of dict
        One row per example with the keys ``"id"``, ``"prompt"``,
        ``"actual"`` (stripped reference transcript) and ``"prediction"``
        (decoded model output with whitespace normalised).
    """
    target_sr = 16000
    references = []
    for example in tqdm(dataset):
        audio_path = example["filename_new"]["path"]
        # librosa.load resamples to `sr` while reading, so a separate
        # librosa.resample pass (16 kHz -> 16 kHz) is not needed.
        waveform, _ = librosa.load(audio_path, sr=target_sr)

        input_values = processor(waveform, sampling_rate=target_sr, return_tensors="pt").input_values
        input_values = input_values.to(device)  # Move input to the same device as the model
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy decoding: most likely token id per frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0], clean_up_tokenization_spaces=False)

        row = {
            "id": example["utterance_id"],
            "prompt": example["prompt"],
            "actual": example["transcript"].strip(),
            # split()/join collapses every run of whitespace (spaces, tabs) to
            # one space; a single replace('  ', ' ') pass would leave double
            # spaces behind whenever three or more spaces occur in a row.
            "prediction": " ".join(transcription.split()),
        }
        references.append(row)
    return references

In [12]:
# Inert reference copy of an earlier prep_training_data variant that obtained
# predictions through `model.transcribe([...])` (presumably huggingsound's
# SpeechRecognitionModel - confirm before reviving). Because it is only a
# string literal, running this cell defines nothing; the notebook merely
# echoes the text back as the cell's output.
'''
# Function to preprocess and transcribe the data
def prep_training_data(model, dataset):
    references = []
    for example in tqdm(dataset):
        audio_path = example["filename_new"]["path"]
        prediction = model.transcribe([audio_path])[0]["transcription"]
        row = {
            "id": example["utterance_id"],
            "prompt": example["prompt"],
            "actual": example["transcript"],
            "prediction": prediction,
        }
        references.append(row)
    return references
'''

'\n# Function to preprocess and transcribe the data\ndef prep_training_data(model, dataset):\n    references = []\n    for example in tqdm(dataset):\n        audio_path = example["filename_new"]["path"]\n        prediction = model.transcribe([audio_path])[0]["transcription"]\n        row = {\n            "id": example["utterance_id"],\n            "prompt": example["prompt"],\n            "actual": example["transcript"],\n            "prediction": prediction,\n        }\n        references.append(row)\n    return references\n'

In [13]:
def main(splits=("train",), csv_dir="/work/van-speech-nlp/psst-csv"):
    """Transcribe the requested PSST splits and write one JSON file per split.

    For every split name ``s`` in ``splits`` the utterance table
    ``<csv_dir>/<s>_utterances_excel.csv`` is loaded, punctuation is removed
    from the transcripts, the audio is transcribed with the notebook-level
    ``model`` and the resulting rows are saved to ``<s>_data.json`` in the
    current working directory.

    Parameters
    ----------
    splits : iterable of str, optional
        Split names to process, e.g. ``("train", "valid", "test")``.
        Defaults to ``("train",)``, which reproduces the original behaviour
        (only ``train_data.json`` is written).
    csv_dir : str, optional
        Directory containing the ``*_utterances_excel.csv`` files.
    """
    splits = tuple(splits)
    data_files = {
        split: f"{csv_dir}/{split}_utterances_excel.csv" for split in splits
    }
    data = load_dataset('csv', data_files=data_files)
    # prep_training_data reads example["filename_new"]["path"], so the column
    # is cast to an Audio feature rather than left as a plain CSV string.
    data = data.cast_column("filename_new", Audio(sampling_rate=16_000))

    for split in splits:
        # Data preprocessing
        split_data = data[split].map(remove_special_characters)

        # Prepare and transcribe the data for this split
        split_transcribed = prep_training_data(model, split_data)
        save_to_json(split_transcribed, f"{split}_data.json")


if __name__ == "__main__":
    main()



  0%|          | 0/1 [00:00<?, ?it/s]



100%|██████████| 2298/2298 [03:05<00:00, 12.36it/s]
