In [8]:
from datasets import Dataset
import pandas as pd
import re

import torchaudio
import librosa
import numpy as np
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

# Read each split's CSV manifest into a DataFrame.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Wrap each frame in a `datasets.Dataset` and immediately drop the three
# manifest columns ("full", "x", "filename") that the later cells never read.
train_data = Dataset.from_pandas(train_df).remove_columns(["full", "x", "filename"])
test_data = Dataset.from_pandas(test_df).remove_columns(["full", "x", "filename"])

# Character class of punctuation stripped from every transcript.
# Written as a raw string so the backslashes reach the regex engine untouched;
# in a normal string literal sequences such as `\,` or `\?` are invalid escapes
# that newer Python versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    """Normalise the transcript of a single example in place.

    Every character matched by ``chars_to_ignore_regex`` is deleted from
    ``batch["sentence"]``, the result is lower-cased, and one trailing space
    is appended.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same ``batch`` object, with ``"sentence"`` overwritten.
    """
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

# Clean the transcript of every example in both splits.
train_data = train_data.map(function=remove_special_characters)
test_data = test_data.map(function=remove_special_characters)


  0%|          | 0/132574 [00:00<?, ?ex/s]

  0%|          | 0/23396 [00:00<?, ?ex/s]

In [9]:
def speech_file_to_array_fn(batch):
    """Load one example's audio file and attach waveform, rate and transcript.

    Args:
        batch: Mapping with the audio path under ``"file"`` and the transcript
            under ``"sentence"``.

    Returns:
        The same ``batch`` object with three keys added: ``"speech"``,
        ``"sampling_rate"`` and ``"target_text"``.
    """
    waveform, rate = torchaudio.load(batch["file"])
    # Only index 0 of the loaded tensor is kept.
    first_channel = waveform[0]
    batch["speech"] = first_channel.numpy()
    batch["sampling_rate"] = rate
    # The transcript is copied under the key the later cells read.
    batch["target_text"] = batch["sentence"]
    return batch

# Decode the audio for both splits with 64 worker processes. The columns that
# existed before the map are dropped, so only the keys added by
# speech_file_to_array_fn remain.
train_data = train_data.map(
    function=speech_file_to_array_fn,
    remove_columns=train_data.column_names,
    num_proc=64,
)
test_data = test_data.map(
    function=speech_file_to_array_fn,
    remove_columns=test_data.column_names,
    num_proc=64,
)


tcmalloc: large alloc 1392181248 bytes == 0x2a2914000 @  0x7fe58d618680 0x7fe58d638da2 0x5f24fc 0x64d250 0x5268e9 0x5c3a40 0x56aadf 0x568d9a 0x50b868 0x56bc9b 0x50b6fe 0x56fb87 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332 0x568d9a 0x50b868 0x56aadf 0x5f5956
tcmalloc: large alloc 1408000000 bytes == 0x2a4e68000 @  0x7fe58d618680 0x7fe58d638da2 0x5f24fc 0x64d250 0x5268e9 0x5c3a40 0x56aadf 0x568d9a 0x50b868 0x56bc9b 0x50b6fe 0x56fb87 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332 0x568d9a 0x50b868 0x56aadf 0x5f5956
tcmalloc: large alloc 1393410048 bytes == 0x2a2b6e000 @  0x7fe58d618680 0x7fe58d638da2 0x5f24fc 0x64d250 0x5268e9 0x5c3a40 0x56aadf 0x568d9a 0x50b868 0x56bc9b 0x50b6fe 0x56fb87 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0

In [10]:
def resample(batch, target_sr=16_000):
    """Resample one example's waveform to ``target_sr`` Hz.

    Args:
        batch: Mapping with the waveform samples under ``"speech"`` and the
            rate they were loaded at under ``"sampling_rate"``.
        target_sr: Output sampling rate in Hz. Defaults to 16 kHz.

    Returns:
        The same ``batch`` object, with ``"speech"`` replaced by the resampled
        array and ``"sampling_rate"`` set to ``target_sr``.

    Raises:
        KeyError: If ``"speech"`` or ``"sampling_rate"`` is missing.
    """
    # Use the rate recorded with the example instead of assuming 48 kHz, so a
    # file loaded at any other rate is not resampled by the wrong ratio.
    source_sr = batch["sampling_rate"]
    # The rates are passed by keyword: librosa >= 0.10 rejects them as
    # positional arguments, and the keyword form also works on older releases.
    batch["speech"] = librosa.resample(
        np.asarray(batch["speech"]),
        orig_sr=source_sr,
        target_sr=target_sr,
    )
    batch["sampling_rate"] = target_sr
    return batch

# Resample every example in both splits, using 64 worker processes.
train_data = train_data.map(function=resample, num_proc=64)
test_data = test_data.map(function=resample, num_proc=64)

tcmalloc: large alloc 1796087808 bytes == 0x5f5e7c000 @  0x7fe58d618680 0x7fe58d638da2 0x5f24fc 0x64d250 0x5268e9 0x5c3a40 0x56aadf 0x50b6fe 0x56fb87 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56acb6 0x5f5956 0x56aadf
tcmalloc: large alloc 2245115904 bytes == 0x660f5e000 @  0x7fe58d618680 0x7fe58d638da2 0x5f24fc 0x64d250 0x5268e9 0x5c3a40 0x56aadf 0x50b6fe 0x56fb87 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56acb6 0x5f5956 0x56aadf
tcmalloc: large alloc 2806398976 bytes == 0x5a042e000 @  0x7fe58d618680 0x7fe58d638da2 0x5f24fc 0x64d250 0x5268e9 0x5c3a40 0x56aadf 0x50b6fe 0x56fb87 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x5f5956 0x56aadf 0x568d9a 0x50b868 0x56aadf 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332 0x568d9a 0x50b868 0

KeyboardInterrupt: 

In [None]:
def prepare_dataset(batch):
    """Convert a batch of waveforms and transcripts into model inputs/labels.

    Relies on a module-level ``processor`` object that is not defined in this
    cell.

    Args:
        batch: Batched mapping with list-like ``"speech"``,
            ``"sampling_rate"`` and ``"target_text"`` columns.

    Returns:
        The same ``batch`` object with ``"input_values"`` and ``"labels"``
        added.

    Raises:
        AssertionError: If the batch contains more than one distinct sampling
            rate.
    """
    rates = batch["sampling_rate"]
    # Every example in the batch must share one sampling rate.
    assert len(set(rates)) == 1, f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    audio_features = processor(batch["speech"], sampling_rate=rates[0])
    batch["input_values"] = audio_features.input_values

    # Inside this context the processor is called on the transcripts.
    with processor.as_target_processor():
        encoded_text = processor(batch["target_text"])
        batch["labels"] = encoded_text.input_ids
    return batch

# Run prepare_dataset over both splits in batches of 8 examples with 8 worker
# processes, dropping the columns that existed before the map.
# `return_tensors` is deliberately not passed: Dataset.map has no such
# parameter and raises TypeError on it. Tensor output is selected separately
# through the dataset's format.
train_data = train_data.map(
    prepare_dataset,
    remove_columns=train_data.column_names,
    batch_size=8,
    num_proc=8,
    batched=True,
)
test_data = test_data.map(
    prepare_dataset,
    remove_columns=test_data.column_names,
    batch_size=8,
    num_proc=8,
    batched=True,
)

In [None]:
# Switch both splits to the "torch" output format.
train_data = train_data.with_format(type="torch")
test_data = test_data.with_format(type="torch")

# Serialise each dataset object to a .pt file in the working directory.
# NOTE(review): torch.save pickles the Dataset object itself; confirm that
# downstream loaders expect this rather than `Dataset.save_to_disk` output.
torch.save(obj=train_data, f='train_si_asr.pt')
torch.save(obj=test_data, f='test_si_asr.pt')