**Sources** 
* HuggingFace SpeechT5 tutorial to combine ASR and TTS (https://huggingface.co/blog/speecht5)
* HuggingFace LLM tutorial: https://huggingface.co/docs/transformers/main/llm_tutorial


In [1]:
!pip install -q torch
!pip install -q sentencepiece
!pip install -q torchaudio 
!pip install -q -U transformers
!pip install -q soundfile
!pip install -q bitsandbytes
!pip install -q accelerate 

# there's probably a better library for playing audio 
!pip install -q simpleaudio 

In [11]:
!python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"

Setting ds_accelerator to cuda (auto detect)
Configuration already exists at /Users/iskander/.cache/huggingface/accelerate/default_config.yaml, will not override. Run `accelerate config` manually or pass a different `save_location`.


In [2]:
!ls ../data

2086-149220-0033.wav


In [3]:
# load an example sound file

import soundfile as sf
# returns a tuple of (NumPy array of the waveform, sampling rate)
input_sound_tuple = sf.read("../data/2086-149220-0033.wav")

In [4]:
input_sound_tuple

(array([0.00000000e+00, 9.15527344e-05, 9.15527344e-05, ...,
        1.22070312e-04, 1.22070312e-04, 1.22070312e-04]),
 16000)

In [5]:
input_sound, sampling_rate = input_sound_tuple

In [6]:
from transformers import pipeline
asr = pipeline(task="automatic-speech-recognition", model="microsoft/speecht5_asr")

Some weights of the model checkpoint at microsoft/speecht5_asr were not used when initializing SpeechT5ForSpeechToText: ['speecht5.encoder.prenet.pos_conv_embed.conv.weight_g', 'speecht5.encoder.prenet.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing SpeechT5ForSpeechToText from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SpeechT5ForSpeechToText from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SpeechT5ForSpeechToText were not initialized from the model checkpoint at microsoft/speecht5_asr and are newly initialized: ['speecht5.encoder.prenet.pos_conv_embed.conv.parametrizations.weight.original0', 'speecht5.encoder.prenet.pos_conv_embed.conv.parametrizations.we

In [7]:
# get back a text transcription from the sound
asr_result = asr(input_sound)

In [8]:
# it's dictionary with a single field called 'text'
asr_result

{'text': "well i don't wish to see it any more observed febric turning away her eyes it is certainly very like the old portrait"}

In [25]:
input_text = asr_result['text']

In [55]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_default_device("mps")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)



In [139]:
# give the LLM some instructions and then feed it the text which we recognized from audio

prompt_prefix = """
Narrator: Lady Macbeth has been trapped within the body of cypress tree 
within the Everglades by otherworldly witches. She was cursed into this fate because of her attempts to destroy 
the Everglades ecosystem and kill all of the trees it contained. She is now at peace and has become spooky, mystical, poetrically enchanting. 
Visitor: Are ye Lady Macbeth?
Lady Macbeth: Ich usede bihofþe bæ hē̆r
Visitor: How have you come to such a state?
Lady Macbeth: For my crimes, the blood of groves cries up from the muck. 
Visitor: """
prompt = prompt_prefix + input_text + "\n Lady Macbeth:"
prompt

"\nNarrator: Lady Macbeth has been trapped within the body of cypress tree \nwithin the Everglades by otherworldly witches. She was cursed into this fate because of her attempts to destroy \nthe Everglades ecosystem and kill all of the trees it contained. She is now at peace and has become spooky, mystical, poetrically enchanting. \nVisitor: Are ye Lady Macbeth?\nLady Macbeth: Ich usede bihofþe bæ hē̆r\nVisitor: How have you come to such a state?\nLady Macbeth: For my crimes, the blood of groves cries up from the muck. \nVisitor: well i don't wish to see it any more observed febric turning away her eyes it is certainly very like the old portrait\n Lady Macbeth:"

In [140]:

# create token IDs for the prompt
llm_inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False);
llm_inputs



{'input_ids': tensor([[  198, 45750,  1352,    25, 11182,  4100,    65,  2788,   468,   587,
         13640,  1626,   262,  1767,   286,  3075,  8439,  5509,   220,   198,
         33479,   262, 10776,  4743,  2367,   416,   584, 49366, 34773,    13,
          1375,   373, 25155,   656,   428, 10030,   780,   286,   607,  6370,
           284,  4117,   220,   198,  1169, 10776,  4743,  2367, 13187,   290,
          1494,   477,   286,   262,  7150,   340,  7763,    13,  1375,   318,
           783,   379,  4167,   290,   468,  1716,   599, 29655,    11, 29746,
            11, 21810,    81,  1146, 23260,   278,    13,   220,   198, 15854,
          2072,    25,  4231,  9838, 11182,  4100,    65,  2788,    30,   198,
         38887,  4100,    65,  2788,    25, 26364,   973,    68,  3182, 39891,
           127,   122,    68,   275, 21241,   289, 27092,   136,   228,    81,
           198, 15854,  2072,    25,  1374,   423,   345,  1282,   284,   884,
           257,  1181,    30,   198, 3

In [146]:
# figured this out manually for phi-1.5
system_prompt_length = 91
min_extra_tokens = 10
max_extra_tokens = 200
# sample from the LLM
output_token_ids = model.generate(
    **llm_inputs, 
    min_length = len(llm_inputs) + system_prompt_length + min_extra_tokens,
    max_length=len(llm_inputs) + system_prompt_length + max_extra_tokens,
    do_sample=True);
output_token_ids


tensor([[  198, 45750,  1352,    25, 11182,  4100,    65,  2788,   468,   587,
         13640,  1626,   262,  1767,   286,  3075,  8439,  5509,   220,   198,
         33479,   262, 10776,  4743,  2367,   416,   584, 49366, 34773,    13,
          1375,   373, 25155,   656,   428, 10030,   780,   286,   607,  6370,
           284,  4117,   220,   198,  1169, 10776,  4743,  2367, 13187,   290,
          1494,   477,   286,   262,  7150,   340,  7763,    13,  1375,   318,
           783,   379,  4167,   290,   468,  1716,   599, 29655,    11, 29746,
            11, 21810,    81,  1146, 23260,   278,    13,   220,   198, 15854,
          2072,    25,  4231,  9838, 11182,  4100,    65,  2788,    30,   198,
         38887,  4100,    65,  2788,    25, 26364,   973,    68,  3182, 39891,
           127,   122,    68,   275, 21241,   289, 27092,   136,   228,    81,
           198, 15854,  2072,    25,  1374,   423,   345,  1282,   284,   884,
           257,  1181,    30,   198, 38887,  4100,  

In [147]:
output_text = tokenizer.batch_decode(output_token_ids)[0];
output_text



"\nNarrator: Lady Macbeth has been trapped within the body of cypress tree \nwithin the Everglades by otherworldly witches. She was cursed into this fate because of her attempts to destroy \nthe Everglades ecosystem and kill all of the trees it contained. She is now at peace and has become spooky, mystical, poetrically enchanting. \nVisitor: Are ye Lady Macbeth?\nLady Macbeth: Ich usede bihofþe bæ hē̆r\nVisitor: How have you come to such a state?\nLady Macbeth: For my crimes, the blood of groves cries up from the muck. \nVisitor: well i don't wish to see it any more observed febric turning away her eyes it is certainly very like the old portrait\n Lady Macbeth: If you want to see it see it with your own eyes, \nThe end.\nTopic: Cytoscopy as a diagnostic tool.\n\nCytoscopes can be used to examine tissue samples up to the cells and can be viewed by a trained cytotechnologist, or by a layperson during a routine visit to a community health center. These devices can allow for quick and accu

In [149]:
import re
lady_macbeth = "Lady Macbeth: "
regex = re.compile(lady_macbeth + "[a-zA-Z ,;:'-]+"); print(regex)
longest_llm_output = max(regex.findall(output_text)[2:], key=len)[len(lady_macbeth):];
longest_llm_output

re.compile("Lady Macbeth: [a-zA-Z ,;:'-]+")


'If you want to see it see it with your own eyes, '

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# the text-to-speech model has two parts: a tokenizer/processor which turns the character stream into 
# a matrix of token IDs and an actual speech synthesizer
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")


In [11]:
# run the processor to get back an array of token IDs
tts_inputs = tts_processor(text=output_text, return_tensors="pt");
tts_inputs

{'input_ids': tensor([[ 4, 20,  5, 15, 15,  4, 10,  4, 14,  8,  9, 31,  6,  4, 20, 10, 12, 11,
          4,  6,  8,  4, 12,  5,  5,  4, 10,  6,  4,  7,  9, 22,  4, 18,  8, 13,
          5,  4,  8, 25, 12,  5, 13, 27,  5, 14,  4, 19,  5, 25, 13, 10, 17,  4,
          6, 16, 13,  9, 10,  9, 21,  4,  7, 20,  7, 22,  4, 11,  5, 13,  4,  5,
         22,  5, 12,  4, 10,  6,  4, 10, 12,  4, 17,  5, 13,  6,  7, 10,  9, 15,
         22,  4, 27,  5, 13, 22,  4, 15, 10, 28,  5,  4,  6, 11,  5,  4,  8, 15,
         14,  4, 24,  8, 13,  6, 13,  7, 10,  6,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
tts_input_token_ids = tts_inputs['input_ids']

In [13]:
import torch
from transformers import SpeechT5HifiGan
from datasets import load_dataset

# load vector describing speaker voice
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(speaker_embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# get a vocoder model to generate final sound
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

Found cached dataset cmu-arctic-xvectors (/Users/iskander/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


In [14]:
# combine the TTS model, a speaker embedding, and vocoder to actually generate sounds for the 
# text token IDs
output_speech = tts_model.generate_speech(tts_input_token_ids, speaker_embeddings, vocoder=vocoder)

In [15]:
# save the sound file 
import soundfile as sf
sf.write("round-trip-output.wav", output_speech.numpy(), samplerate=16000)

In [19]:
import simpleaudio 
import numpy as np

def normalize_waveform(single_channel_float_waveform, min_int=-32768, max_int=32767, output_dtype=np.int16):
    # simpleaudio expects 16-bit integer values for wave height
    # so normalize float sound arrays to fit that range
    int_range = max_int - min_int
    normalized_waveform = single_channel_float_waveform - single_channel_float_waveform.min()
    normalized_waveform /= normalized_waveform.max()
    int64_waveform_from_0 = (normalized_waveform * int_range).astype(np.int64)
    int64_waveform_from_min = int64_waveform_from_0 + min_int
    return int64_waveform_from_min.astype(output_dtype)
    
def play(single_channel_float_waveform, num_channels=1, bytes_per_sample=2, sampling_rate=16000):
    int_waveform = normalize_waveform(single_channel_float_waveform)
    play_obj = simpleaudio.play_buffer(int_waveform, num_channels, bytes_per_sample, sampling_rate)
    # wait for play-back to finish
    play_obj.wait_done()

In [21]:
play(input_sound, sampling_rate=sampling_rate)

In [22]:
play(output_speech.numpy())