In [1]:
from transformers import SpeechT5Model, SpeechT5FeatureExtractor

import numpy as np
import torch
from datasets import load_dataset
from IPython.display import Audio, display

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Data

In [3]:
class CommonVoiceDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        audio_array = self.dataset[idx]['audio']['array']
        sampling_rate = self.dataset[idx]['audio']['sampling_rate']
        sentence = self.dataset[idx]['sentence']

        audio_tensor = torch.tensor(audio_array).float()
        return audio_tensor, sampling_rate, sentence

In [4]:
# Load dataset
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split='test')
train_dataset = CommonVoiceDataset(dataset)

Found cached dataset common_voice_11_0 (/export/home/lium/bdos/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/en/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


In [None]:
audio_tensor, sampling_rate, sentence = train_dataset[0]
audio_array = audio_tensor.numpy()
print("Text:", sentence)
display(Audio(audio_array, rate=sampling_rate))

Text: Joe Keaton disapproved of films, and Buster also had reservations about the medium.


## SpeechT5 speech embeddings

In [6]:
# Load pretrained model and feature extractor
model = SpeechT5Model.from_pretrained("microsoft/speecht5_asr")
model = model.to(device)
feature_extractor = SpeechT5FeatureExtractor.from_pretrained("microsoft/speecht5_asr")

Some weights of the model checkpoint at microsoft/speecht5_asr were not used when initializing SpeechT5Model: ['speecht5.encoder.prenet.pos_conv_embed.conv.weight_v', 'speecht5.encoder.prenet.masked_spec_embed', 'speecht5.encoder.prenet.feature_projection.layer_norm.bias', 'speecht5.encoder.prenet.feature_projection.layer_norm.weight', 'speecht5.encoder.prenet.pos_conv_embed.conv.bias', 'speecht5.encoder.prenet.feature_encoder.conv_layers.2.conv.weight', 'speecht5.encoder.prenet.feature_encoder.conv_layers.6.conv.weight', 'speecht5.encoder.prenet.feature_encoder.conv_layers.3.conv.weight', 'speecht5.encoder.prenet.feature_encoder.conv_layers.0.layer_norm.bias', 'speecht5.encoder.prenet.feature_projection.projection.bias', 'speecht5.encoder.prenet.pos_conv_embed.conv.weight_g', 'speecht5.encoder.prenet.feature_encoder.conv_layers.5.conv.weight', 'speecht5.encoder.prenet.feature_encoder.conv_layers.0.conv.weight', 'speecht5.encoder.prenet.feature_encoder.conv_layers.0.layer_norm.weight',

In [7]:
inputs = feature_extractor(audio_tensor.numpy(), return_tensors="pt", sampling_rate=16000).to(device)

In [8]:
inputs

{'input_values': tensor([[-7.9581e-13, -1.5916e-12, -6.2528e-12,  ...,  1.5242e-06,
          1.9465e-06,  1.2574e-06]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32)}

In [9]:
# Compute the embeddings
with torch.no_grad():
    embeddings = model(**inputs).last_hidden_state
    #embeddings = model(inputs.input_values).last_hidden_state

OutOfMemoryError: CUDA out of memory. Tried to allocate 340.66 GiB (GPU 0; 22.17 GiB total capacity; 544.88 MiB already allocated; 21.04 GiB free; 602.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF