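This notebook follows the Hugging Face SpeechT5 tutorial (https://huggingface.co/blog/speecht5) to chain automatic speech recognition (ASR) and text-to-speech (TTS) into a speech → text → speech round trip.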


In [None]:
!pip install -q torch
!pip install -q sentencepiece
!pip install -q torchaudio 
!pip install -q transformers
!pip install -q soundfile

# there's probably a better library for playing audio 
!pip install -q simpleaudio 

In [None]:
# list the contents of the data directory that the example recording is read from
!ls ../data

In [None]:
# Read the example recording from disk.
import soundfile as sf

example_wav_path = "../data/2086-149220-0033.wav"
# sf.read gives back a pair: (waveform as a NumPy array, sampling rate)
input_sound_tuple = sf.read(example_wav_path)

In [None]:
# inspect the raw (waveform, sampling rate) tuple returned by sf.read
input_sound_tuple

In [None]:
# split the tuple into its two parts: the waveform first, then its sampling rate
input_sound = input_sound_tuple[0]
sampling_rate = input_sound_tuple[1]

In [61]:
from transformers import pipeline

# speech-to-text pipeline backed by the SpeechT5 ASR checkpoint
asr = pipeline(
    "automatic-speech-recognition",
    model="microsoft/speecht5_asr",
)



In [62]:
# Get back a text transcription from the sound.
# Pass the sampling rate alongside the raw waveform: given a bare array the
# pipeline has to assume it is already at the model's expected rate, whereas the
# dict form lets it resample when the file's rate differs.
asr_result = asr({"raw": input_sound, "sampling_rate": sampling_rate})



In [63]:
# the result is a dictionary with a single field called 'text'
asr_result

{'text': "well i don't wish to see it any more observed febric turning away her eyes it is certainly very like the old portrait"}

In [None]:
# keep only the transcribed string; it is the input to the text-to-speech stage
text = asr_result['text']

In [64]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Text-to-speech needs two pieces loaded from the same checkpoint:
#   * a processor (tokenizer) that maps the character stream to a matrix of token IDs
#   * the speech synthesizer model itself
tts_checkpoint = "microsoft/speecht5_tts"
tts_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)


{'input_ids': tensor([[ 4, 20,  5, 15, 15,  4, 10,  4, 14,  8,  9, 31,  6,  4, 20, 10, 12, 11,
          4,  6,  8,  4, 12,  5,  5,  4, 10,  6,  4,  7,  9, 22,  4, 18,  8, 13,
          5,  4,  8, 25, 12,  5, 13, 27,  5, 14,  4, 19,  5, 25, 13, 10, 17,  4,
          6, 16, 13,  9, 10,  9, 21,  4,  7, 20,  7, 22,  4, 11,  5, 13,  4,  5,
         22,  5, 12,  4, 10,  6,  4, 10, 12,  4, 17,  5, 13,  6,  7, 10,  9, 15,
         22,  4, 27,  5, 13, 22,  4, 15, 10, 28,  5,  4,  6, 11,  5,  4,  8, 15,
         14,  4, 24,  8, 13,  6, 13,  7, 10,  6,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# tokenize the transcription; the processor returns PyTorch tensors of token IDs
tts_inputs = tts_processor(
    text=text,
    return_tensors="pt",
)
tts_inputs

In [None]:
# pull out the token-ID tensor, which is what gets passed to generate_speech
tts_input_token_ids = tts_inputs['input_ids']

In [65]:
import torch
from transformers import SpeechT5HifiGan
from datasets import load_dataset

# Speaker voice: take one x-vector from the CMU ARCTIC x-vector dataset and add a
# leading batch dimension so it has the shape the TTS model is given below.
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_index = 7306
speaker_xvector = speaker_embeddings_dataset[speaker_index]["xvector"]
speaker_embeddings = torch.tensor(speaker_xvector).unsqueeze(0)

# Vocoder: the model that produces the final sound.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

Found cached dataset cmu-arctic-xvectors (/Users/iskander/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


In [None]:
# Synthesize audio for the token IDs: the TTS model, conditioned on the speaker
# embedding, generates the speech and the vocoder renders it as a waveform.
output_speech = tts_model.generate_speech(
    tts_input_token_ids,
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [66]:
# write the synthesized waveform to disk as a 16 kHz wav file
import soundfile as sf

output_waveform = output_speech.numpy()
sf.write("round-trip-output.wav", output_waveform, samplerate=16000)

In [68]:
import simpleaudio 
import numpy as np

def normalize_waveform(single_channel_float_waveform, min_int=-32768, max_int=32767, output_dtype=np.int16):
    """Min-max rescale a waveform onto the integer range [min_int, max_int].

    simpleaudio expects 16-bit integer values for wave height, so float sound
    arrays are stretched to fill that range: the smallest sample maps to
    ``min_int`` and the largest to ``max_int``.

    Args:
        single_channel_float_waveform: array-like of samples; it is not modified.
        min_int: lowest integer value of the output range.
        max_int: highest integer value of the output range.
        output_dtype: NumPy dtype of the returned array.

    Returns:
        A NumPy array of ``output_dtype`` with the same shape as the input.
        An empty input yields an empty array. A constant (flat) input has no
        range to stretch, so it yields the midpoint of the output range
        (0 for the default signed 16-bit range) instead of NaN-derived garbage.
    """
    # Work on a float64 copy/view so integer inputs divide correctly and the
    # caller's array is never touched.
    waveform = np.asarray(single_channel_float_waveform, dtype=np.float64)
    if waveform.size == 0:
        return waveform.astype(output_dtype)

    lowest = waveform.min()
    span = waveform.max() - lowest
    if span == 0:
        # flat signal: dividing by the zero span would produce NaNs
        midpoint = (min_int + max_int + 1) // 2
        return np.full(waveform.shape, midpoint, dtype=output_dtype)

    int_range = max_int - min_int
    # scale to [0, int_range], truncate to integers, then shift down to start at min_int
    int64_waveform_from_0 = ((waveform - lowest) / span * int_range).astype(np.int64)
    return (int64_waveform_from_0 + min_int).astype(output_dtype)
    
def play(single_channel_float_waveform, num_channels=1, bytes_per_sample=2, sampling_rate=16000):
    """Play a waveform through simpleaudio and block until playback finishes.

    Args:
        single_channel_float_waveform: samples to play; they are converted to
            integers with normalize_waveform before being handed to simpleaudio.
        num_channels: number of audio channels passed to simpleaudio.
        bytes_per_sample: sample width in bytes passed to simpleaudio.
        sampling_rate: playback rate in samples per second.
    """
    int_waveform = normalize_waveform(single_channel_float_waveform)
    # pass the num_channels parameter (the original referenced an undefined `num_channel`)
    play_obj = simpleaudio.play_buffer(int_waveform, num_channels, bytes_per_sample, sampling_rate)
    # wait for play-back to finish
    play_obj.wait_done()

In [69]:
# Play the original recording. sampling_rate must be passed by keyword: the second
# positional parameter of play() is num_channels, not the sampling rate.
play(input_sound, sampling_rate=sampling_rate)

In [70]:
# play the synthesized speech, converting the tensor to a NumPy array first
play(single_channel_float_waveform=output_speech.numpy())