In [23]:
import torch
from datasets import load_dataset

from transformers import pipeline
from transformers import SpeechT5Processor, SpeechT5ForSpeechToText, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech

from IPython.display import Audio

## Data

In [26]:
# Load the "clean" configuration (validation split) of the LibriSpeech demo
# set and sort it by its "id" column before indexing, so that `dataset[40]`
# refers to position 40 of the sorted order.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
).sort("id")

# Single row reused as the running example in the sections below.
example = dataset[40]

Found cached dataset librispeech_asr_demo (/export/home/lium/bdos/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


## Automatic Speech Recognition (ASR)

Option 1: the high-level `pipeline` API

In [2]:
# Build an automatic-speech-recognition pipeline around the SpeechT5 ASR
# checkpoint.
generator = pipeline(
    model="microsoft/speecht5_asr",
    task="automatic-speech-recognition",
)

In [4]:
# Run the pipeline on the example's raw sample array.
transcription = generator(
    example["audio"]["array"],
)



In [5]:
# Display the pipeline result: a dict holding the recognised text under "text".
transcription

{'text': 'a man said to the universe sir i exist'}

Option 2: the processor and model used directly

In [7]:
# Load the speech-to-text model and its matching processor from the same
# ASR checkpoint.
model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")

In [8]:
# Take the sampling rate from the dataset's audio feature metadata instead
# of hard-coding it.
sampling_rate = dataset.features["audio"].sampling_rate

# Turn the example's sample array into model inputs, returned as PyTorch
# tensors.
inputs = processor(
    audio=example["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
)

In [9]:
# Generate token ids for the example, limiting generation to max_length=100.
predicted_ids = model.generate(max_length=100, **inputs)

# Decode the ids back to text, leaving special tokens out of the result.
transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)

In [10]:
# Display the decoded result: `batch_decode` returned a list, which here
# holds a single string.
transcription

['a man said to the universe sir i exist']

## Text-to-Speech (TTS)

In [13]:
# Rebind `model` and `processor` to the text-to-speech checkpoint; the ASR
# objects previously held under these names are replaced.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

In [14]:
# Encode the sentence to synthesise as PyTorch tensors.
inputs = processor(
    text="Don't count the days, make the days count.",
    return_tensors="pt",
)

In [15]:
# Dataset of precomputed x-vectors (validation split); one of its rows is
# used below as the speaker embedding.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

Found cached dataset cmu-arctic-xvectors (/export/home/lium/bdos/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


In [16]:
# Use the x-vector stored in row 7306 as the target voice and add a leading
# axis of size 1 (presumably the batch axis expected by `generate_speech`).
# NOTE(review): the index is hard-coded; which speaker it selects is not
# visible here.
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)

In [17]:
# Two-step route, step 1: without a vocoder argument the result is kept as a
# spectrogram, to be converted to audio by the vocoder in a later cell.
spectrogram = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
)

In [18]:
# Load the HiFi-GAN vocoder checkpoint; it is applied to the spectrogram and
# passed to `generate_speech` in the cells that follow.
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
)

In [19]:
# Two-step route, step 2: run the vocoder on the spectrogram produced above.
# Gradient tracking is disabled because this is inference only.
with torch.no_grad():
    speech = vocoder(spectrogram)

In [20]:
# One-step alternative: hand the vocoder to `generate_speech` so a single
# call yields the audio. This rebinds `speech`, replacing the value computed
# from the separate vocoder call.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [21]:
# Render an inline player for the synthesised speech at 16000 samples/second.
Audio(data=speech, rate=16000)

## Voice Conversion (VC)

In [24]:
# Rebind `model` and `processor` to the voice-conversion checkpoint; the
# objects previously held under these names are replaced.
model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")

In [27]:
# Inspect the decoded audio entry of the example: a dict with the source file
# "path", the sample "array" and the "sampling_rate".
example["audio"]

{'path': '/export/home/lium/bdos/.cache/huggingface/datasets/downloads/extracted/e1169fd1dddcf165665bd3c40c38a3f2e7f677cf94c6387cfa40bdaf65d475f6/dev_clean/1272/141231/1272-141231-0000.flac',
 'array': array([-0.00048828, -0.00018311, -0.00137329, ...,  0.00079346,
         0.00091553,  0.00085449]),
 'sampling_rate': 16000}

In [28]:
# Listen to the source recording before converting it. The playback rate is
# read from the decoded audio entry itself rather than repeated as a literal,
# so the clip still plays at the correct speed if a different example or
# dataset (with another sampling rate) is substituted.
Audio(example["audio"]["array"], rate=example["audio"]["sampling_rate"])

In [29]:
# Take the sampling rate from the dataset's audio feature metadata.
sampling_rate = dataset.features["audio"].sampling_rate

# Prepare the source recording as PyTorch tensors for the voice-conversion
# model (this rebinds `inputs`).
inputs = processor(
    audio=example["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
)

In [30]:
# Load the precomputed x-vectors (validation split).
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

# Use the x-vector stored in row 7306 as the target voice and add a leading
# axis of size 1 (presumably the batch axis expected by `generate_speech`).
# NOTE(review): the index is hard-coded; which speaker it selects is not
# visible here.
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)

Found cached dataset cmu-arctic-xvectors (/export/home/lium/bdos/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


In [31]:
# Load the HiFi-GAN vocoder checkpoint that is passed to `generate_speech`
# in the next cell.
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
)

In [32]:
# Convert the source recording to the target voice; the vocoder is supplied
# so the call yields audio in one step.
speech = model.generate_speech(
    inputs["input_values"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [33]:
# Render an inline player for the converted speech at 16000 samples/second.
Audio(data=speech, rate=16000)