In [1]:
from transformers import SpeechT5Model, SpeechT5FeatureExtractor

import numpy as np
import torch
from datasets import load_dataset
from IPython.display import Audio, display

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [2]:
class CommonVoiceDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset wrapping an indexable speech dataset.

    Each row of the wrapped dataset is read as a mapping with an ``'audio'``
    entry (itself a mapping holding ``'array'`` and ``'sampling_rate'``) and
    a ``'sentence'`` entry.
    """

    def __init__(self, dataset):
        """Keep a reference to the wrapped dataset.

        Args:
            dataset: Object supporting ``len()`` and integer indexing, whose
                rows have the layout described in the class docstring.
        """
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return one sample as ``(audio_tensor, sampling_rate, sentence)``.

        Args:
            idx: Row index forwarded to the wrapped dataset.

        Returns:
            A tuple of the waveform as a ``torch.float32`` tensor, the
            sampling rate stored with the audio, and the sentence.
        """
        # Index the wrapped dataset exactly once. Every lookup may re-read and
        # re-decode the audio (Hugging Face `datasets` decodes Audio columns
        # when a row is accessed), so three lookups tripled the cost per item.
        row = self.dataset[idx]
        audio = row['audio']

        # Build the float32 tensor directly rather than creating a tensor in
        # the source dtype and casting it afterwards.
        audio_tensor = torch.tensor(audio['array'], dtype=torch.float32)
        return audio_tensor, audio['sampling_rate'], row['sentence']

In [None]:
# Fetch the English 'test' split of Common Voice 11.0 and wrap it for PyTorch.
# NOTE(review): the wrapper is bound to `train_dataset` even though the split
# requested here is 'test' -- confirm that this is intended.
dataset = load_dataset(
    path="mozilla-foundation/common_voice_11_0",
    name="en",
    split='test',
)
train_dataset = CommonVoiceDataset(dataset=dataset)

Downloading builder script:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice_11_0/en to /export/home/lium/bdos/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/en/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631...


Downloading data:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.85G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.80G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.66G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.64G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.76G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/732M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/722M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/352M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.65G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/460M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]