In [2]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset
import IPython.display as ipd
import torch

# Set device to "mps" if available
device = torch.device("mps" if torch.backends.mps.is_built() else "cpu")

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="translate")

# load streaming dataset (https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) and read first audio sample
ds = load_dataset("mozilla-foundation/common_voice_11_0", "fr", split="test", streaming=True)
# Load the embeddings dataset for speech synthesis.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

#Get Audio Dateaset.
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

index = 2  # Change this index to get a different sample
for i, sample in enumerate(ds):
    if i == index:
        print(sample["sentence"])
        input_speech = sample["audio"]
        break
# input_speech = next(iter(ds))["audio"]

#Play Audio
ipd.display(ipd.Audio(data=input_speech["array"], autoplay=True, rate=input_speech["sampling_rate"]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Reading metadata...: 16089it [00:00, 33063.77it/s]


Ce site contient quatre tombeaux de la dynastie achéménide et sept des Sassanides.


In [3]:
# Process the input speech and move to the same device as the model
input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features.to(device)

try:
    # Generate token ids on the MPS device
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
except NotImplementedError as e:
    # If NotImplementedError occurs, move model and input_features back to CPU
    # print("Operation not supported on MPS, switching to CPU...")
    device = torch.device("cpu")
    model.to(device)
    input_features = input_features.to(device)
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

# Decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[' This site contains 4 tombs of the dynasty, Hemenid and 7 of the sassanids.']


In [4]:
from transformers import pipeline
from datasets import load_dataset
# import soundfile as sf
import torch
import IPython.display as ipd

# Check if a GPU is available and set the device accordingly
device = torch.device("mps" if torch.backends.mps.is_built() else "cpu")

# Initialize the text-to-speech pipeline with the specified device
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

# Synthesize speech with the given transcription and speaker embedding
output_speech = synthesiser(transcription, forward_params={"speaker_embeddings": speaker_embedding})

# Check if speech is a list and access the first element if so
if isinstance(output_speech, list):
    output_speech = output_speech[0]

# Write the audio to a file or play it
# sf.write("speech.wav", output_speech["audio"], samplerate=output_speech["sampling_rate"])
ipd.display(ipd.Audio(data=output_speech["audio"], autoplay=True, rate=output_speech["sampling_rate"]))