Following the Hugging Face SpeechT5 tutorial (https://huggingface.co/blog/speecht5) to chain ASR and TTS into a speech → text → speech round trip.


In [1]:
!pip install -q torch
!pip install -q sentencepiece
!pip install -q torchaudio 
!pip install -q transformers
!pip install -q soundfile

# there's probably a better library for playing audio 
!pip install -q simpleaudio 

In [2]:
# list the contents of the data directory that holds the example recording
!ls ../data

2086-149220-0033.wav


In [3]:
# load an example sound file

import soundfile as sf
# sf.read returns a tuple of (NumPy array of the waveform, sampling rate)
input_sound_tuple = sf.read("../data/2086-149220-0033.wav")

In [4]:
# inspect the loaded (waveform, sampling rate) tuple
input_sound_tuple

(array([0.00000000e+00, 9.15527344e-05, 9.15527344e-05, ...,
        1.22070312e-04, 1.22070312e-04, 1.22070312e-04]),
 16000)

In [5]:
# unpack the (waveform, sampling rate) tuple returned by sf.read
input_sound, sampling_rate = input_sound_tuple

In [13]:
from transformers import pipeline
# build a speech-recognition pipeline around the SpeechT5 ASR checkpoint
asr = pipeline(task="automatic-speech-recognition", model="microsoft/speecht5_asr")

Some weights of the model checkpoint at microsoft/speecht5_asr were not used when initializing SpeechT5ForSpeechToText: ['speecht5.encoder.prenet.pos_conv_embed.conv.weight_g', 'speecht5.encoder.prenet.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing SpeechT5ForSpeechToText from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SpeechT5ForSpeechToText from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SpeechT5ForSpeechToText were not initialized from the model checkpoint at microsoft/speecht5_asr and are newly initialized: ['speecht5.encoder.prenet.pos_conv_embed.conv.parametrizations.weight.original1', 'speecht5.encoder.prenet.pos_conv_embed.conv.parametrizations.we

In [14]:
# get back a text transcription from the sound; handing the sampling rate to the
# pipeline together with the samples lets it resample audio that is not already
# at the rate the model expects, instead of silently assuming it matches
asr_result = asr({"raw": input_sound, "sampling_rate": sampling_rate})

In [15]:
# the result is a dictionary with a single field called 'text'
asr_result

{'text': "well i don't wish to see it any more observed febric turning away her eyes it is certainly very like the old portrait"}

In [16]:
# keep just the transcription string; it is the input to the text-to-speech step
text = asr_result['text']

In [17]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# the text-to-speech model has two parts: a tokenizer/processor which turns the
# character stream into a matrix of token IDs, and the actual speech synthesizer;
# both are loaded from the same "microsoft/speecht5_tts" checkpoint
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")


In [18]:
# tokenize the transcription; the processor hands back PyTorch tensors
# (token IDs plus an attention mask), displayed below
tts_inputs = tts_processor(text=text, return_tensors="pt")
tts_inputs

{'input_ids': tensor([[ 4, 20,  5, 15, 15,  4, 10,  4, 14,  8,  9, 31,  6,  4, 20, 10, 12, 11,
          4,  6,  8,  4, 12,  5,  5,  4, 10,  6,  4,  7,  9, 22,  4, 18,  8, 13,
          5,  4,  8, 25, 12,  5, 13, 27,  5, 14,  4, 19,  5, 25, 13, 10, 17,  4,
          6, 16, 13,  9, 10,  9, 21,  4,  7, 20,  7, 22,  4, 11,  5, 13,  4,  5,
         22,  5, 12,  4, 10,  6,  4, 10, 12,  4, 17,  5, 13,  6,  7, 10,  9, 15,
         22,  4, 27,  5, 13, 22,  4, 15, 10, 28,  5,  4,  6, 11,  5,  4,  8, 15,
         14,  4, 24,  8, 13,  6, 13,  7, 10,  6,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [20]:
# pull out just the token-ID tensor; this is what gets passed to generate_speech
tts_input_token_ids = tts_inputs['input_ids']

In [22]:
import torch
from datasets import load_dataset

# fetch the x-vector dataset and pick one entry as the voice to synthesize with;
# a leading batch dimension is added to the vector
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.unsqueeze(
    torch.tensor(speaker_embeddings_dataset[7306]["xvector"]),
    0,
)
speaker_embeddings

Found cached dataset cmu-arctic-xvectors (/Users/iskander/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


tensor([[-7.5731e-02, -2.7370e-02,  1.4933e-02,  4.5861e-02,  8.3840e-03,
         -2.7535e-02, -5.1030e-02, -6.1435e-02,  1.4576e-02,  1.9632e-02,
         -7.7323e-02, -7.8355e-02,  5.8233e-02,  3.7577e-02,  1.4377e-02,
          1.7147e-02, -1.3966e-02,  1.3549e-03,  9.4501e-03,  9.6230e-03,
          3.8752e-02,  2.5284e-03, -1.5207e-02, -4.5730e-02, -7.0040e-02,
         -8.4035e-03, -5.4758e-02,  4.7528e-03,  5.4306e-02,  1.8867e-02,
         -2.7039e-03,  2.1273e-02,  3.8547e-02, -4.7406e-02,  1.2328e-02,
         -6.9829e-02,  2.7079e-02,  5.5035e-02, -6.0107e-02, -6.4483e-02,
          6.6905e-03, -5.0482e-02,  4.0781e-02,  3.7543e-03,  3.2528e-02,
         -1.2350e-01, -1.8370e-02,  1.1340e-02, -5.8363e-02,  4.8635e-02,
          1.9366e-02,  3.4130e-02,  2.4126e-02,  1.6346e-02, -8.6061e-02,
          2.4379e-03,  1.1567e-02,  3.0723e-02,  3.2634e-02,  1.7792e-02,
          3.7105e-02, -1.0155e-02, -1.4131e-02,  4.0054e-02,  2.5886e-03,
          2.7985e-02,  1.8849e-02, -3.

In [46]:
from transformers import SpeechT5HifiGan

# load the HiFi-GAN vocoder checkpoint; it is handed to generate_speech to
# produce the final sound
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [50]:
# synthesize audio for the token IDs: the TTS model is given the speaker
# embedding for the voice and the vocoder for producing the final sound
output_speech = tts_model.generate_speech(
    input_ids=tts_input_token_ids,
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [33]:
# inspect the generated waveform tensor and its shape
output_speech, output_speech.shape

(tensor([ 3.6646e-05,  8.9237e-05,  4.7538e-05,  ..., -1.4577e-03,
         -1.8693e-03, -2.8707e-03]),
 torch.Size([118272]))

In [34]:
# write the synthesized waveform to a WAV file at 16000 samples per second
import soundfile as sf
sf.write(
    file="round-trip-output.wav",
    data=output_speech.numpy(),
    samplerate=16000,
)

In [35]:
import simpleaudio 
import numpy as np

def normalize_waveform(single_channel_float_waveform, min_int=-32768, max_int=32767, output_dtype=np.int16):
    """Rescale a waveform so that it spans the integer range [min_int, max_int].

    simpleaudio expects 16-bit integer values for wave height, so the lowest
    input sample is mapped to ``min_int`` and the highest to ``max_int``
    (min-max scaling). The input array is not modified.

    Parameters
    ----------
    single_channel_float_waveform : array-like
        Samples to rescale. Integer or boolean input is converted to float
        first so the division below is well defined.
    min_int, max_int : int
        Bounds of the target integer range.
    output_dtype : numpy dtype
        Integer dtype of the returned array.

    Returns
    -------
    numpy.ndarray
        Array of ``output_dtype`` with the same shape as the input. An empty
        input yields an empty array. A constant input, for which min-max
        scaling is undefined, yields the midpoint of the target range.
    """
    waveform = np.asarray(single_channel_float_waveform)
    if waveform.dtype.kind in "biu":
        # bool/integer arrays cannot hold the fractional values produced below
        waveform = waveform.astype(np.float64)

    if waveform.size == 0:
        # nothing to scale; min()/max() would raise on an empty array
        return np.zeros(waveform.shape, dtype=output_dtype)

    int_range = max_int - min_int
    shifted = waveform - waveform.min()
    peak = shifted.max()
    if peak == 0:
        # every sample is identical: dividing by the zero span would give NaN
        return np.full(waveform.shape, min_int + int_range // 2, dtype=output_dtype)

    # go through int64 so adding min_int cannot overflow before the final cast
    scaled_from_0 = (shifted / peak * int_range).astype(np.int64)
    return (scaled_from_0 + min_int).astype(output_dtype)
    
def play(single_channel_float_waveform, num_channels=1, bytes_per_sample=2, sampling_rate=16000):
    """Play a waveform through simpleaudio and block until playback is done.

    The samples are converted with ``normalize_waveform`` and passed to
    ``simpleaudio.play_buffer`` together with the channel count, the number of
    bytes per sample and the sampling rate.
    """
    simpleaudio.play_buffer(
        normalize_waveform(single_channel_float_waveform),
        num_channels,
        bytes_per_sample,
        sampling_rate,
    ).wait_done()  # do not return before the audio has finished playing

In [36]:
# play the original recording at the sampling rate it was read with
play(input_sound, sampling_rate=sampling_rate)

In [51]:
# play the re-synthesized speech (play() defaults to a 16000 Hz sampling rate)
play(output_speech.numpy())