<a href="https://colab.research.google.com/github/jermwatt/asclepius_dev/blob/main/text2speech2text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install dependencies for text2speech and speech2text
!pip install git+https://github.com/huggingface/transformers sentencepiece datasets soundfile torchaudio

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-fi_y_bwa
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-fi_y_bwa
  Resolved https://github.com/huggingface/transformers to commit f1a1eb4ae10eac56dccfcd55d0f3a65772dc2741
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# 1. setup functions

In [2]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
from transformers import pipeline


def text2speech_setup():
    # load text2speech model
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # load xvector containing speaker's voice characteristics from a dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # load speaker embeddings
    speaker_embeddings = speaker_embeddings.to("cuda")
    return processor, model, vocoder, speaker_embeddings


def speech2text_setup():
    # instantiate pipe
    speech2text_pipe = pipeline(model="openai/whisper-large", device=0)
    return speech2text_pipe


# setup text2speech pipeline
processor, model, vocoder, speaker_embeddings = text2speech_setup()

# setup speech2text pipeline
speech2text_pipe = speech2text_setup()



# 2.  inference functions

In [5]:
# text2speech inference
def text2speech_inference(text,audio_file_name='test.wav'):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write(audio_file_name, speech.numpy(), samplerate=16000)

# speech2text inference
def speech2text_inference(audio_file_name='test.wav', text_file_name='text.txt'):
    output = speech2text_pipe(audio_file_name, chunk_length_s=30)
    with open(text_file_name, 'w') as f:
        f.write(output['text'])


# 3.  Test processors

In [6]:
# test text2speech processor
text='Am I getting out of breath doing things that I can normally do without a problem?'
text2speech_inference(text)

RuntimeError: ignored

In [None]:
# test speech2text processor
speech2text_inference()