In [10]:
import json
import requests
import os
from time import sleep
import os
import logging
import customvoice
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
load_dotenv()
os.environ['SSL_CERT_DIR'] = '/etc/ssl/certs'

In [11]:

def get_personal_voice(config: customvoice.Config, personal_voice_id: str):
    """Return the speaker profile id of an existing personal voice.

    Args:
        config: customvoice configuration passed through to the lookup.
        personal_voice_id: id of the personal voice to fetch.

    Returns:
        The ``speaker_profile_id`` attribute of the object returned by
        ``customvoice.PersonalVoice.get``.
    """
    return customvoice.PersonalVoice.get(config, personal_voice_id).speaker_profile_id


def _report_creation_status(resource, failed_message: str, succeeded_message: str):
    """Print the outcome of a create call and raise if the resource reports Failed."""
    if resource.status == customvoice.Status.Failed:
        print(failed_message)
        # Carry the reason in the exception itself so it is visible in the traceback,
        # not only in the cell's stdout.
        raise Exception(failed_message)
    elif resource.status == customvoice.Status.Succeeded:
        print(succeeded_message)
    # Any other status is passed through silently, as before.


def create_personal_voice(config: customvoice.Config, project_id: str,
                          consent_id: str, consent_file_path: str, voice_talent_name: str, company_name: str,
                          personal_voice_id: str, audio_folder: str):
    """Create a project, upload the consent and create a personal voice.

    Args:
        config: customvoice configuration used for every service call.
        project_id: id of the PersonalVoice project to create.
        consent_id: id to give the uploaded consent.
        consent_file_path: path to the recorded consent audio file.
        voice_talent_name: name of the voice talent, passed to the consent.
        company_name: company name, passed to the consent.
        personal_voice_id: id to give the new personal voice.
        audio_folder: folder with the audio samples used to create the voice.

    Returns:
        The ``speaker_profile_id`` of the created personal voice.

    Raises:
        Exception: if the consent or the personal voice reports
            ``customvoice.Status.Failed``; the message names the failed resource.
    """
    # create project
    project = customvoice.Project.create(config, project_id, customvoice.ProjectKind.PersonalVoice)
    print('Project created. project id: %s' % project.id)

    # upload consent (locale is fixed to 'en-us')
    consent = customvoice.Consent.create(config, project_id, consent_id, voice_talent_name, company_name, consent_file_path, 'en-us')
    _report_creation_status(
        consent,
        'Create consent failed. consent id: %s' % consent.id,
        'Create consent succeeded. consent id: %s' % consent.id)

    # create personal voice
    personal_voice = customvoice.PersonalVoice.create(config, project_id, personal_voice_id, consent_id, audio_folder)
    _report_creation_status(
        personal_voice,
        'Create personal voice failed. personal voice id: %s' % personal_voice.id,
        'Create personal voice succeeded. personal voice id: %s, speaker profile id: %s' % (personal_voice.id, personal_voice.speaker_profile_id))
    return personal_voice.speaker_profile_id

def speech_synthesis_to_wave_file(text: str, output_file_path: str, speaker_profile_id: str):
    """Synthesize ``text`` with a personal voice and save the audio to a wave file.

    NOTE: this function reads the module-level ``config`` object (its ``key``
    and ``region`` attributes), so the configuration cell must have been run
    before calling it.

    Args:
        text: plain text to speak. It is XML-escaped before being placed in
            the SSML document, so characters such as ``&`` and ``<`` are safe.
        output_file_path: path of the wave file to write. The parent folder is
            created if it does not exist.
        speaker_profile_id: speaker profile id of the personal voice to use.

    Returns:
        None. The outcome is reported with ``print``.
    """
    # Imported locally so this cell stays runnable without touching the import cell.
    from xml.sax.saxutils import escape

    # Create the destination folder up front so a missing folder does not
    # surface later as a failure to write the output file.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=config.key, region=config.region)
    file_config = speechsdk.audio.AudioOutputConfig(filename=output_file_path)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config)

    # Escape both interpolated values: the id sits inside a single-quoted
    # attribute, the text inside element content.
    safe_profile_id = escape(speaker_profile_id, {"'": "&apos;"})
    safe_text = escape(text)
    ssml = "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
           "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
           "<voice name='DragonLatestNeural'>" \
           "<mstts:ttsembedding speakerProfileId='%s'/>" \
           "<mstts:express-as style='Prompt'>" \
           "<lang xml:lang='en-US'> %s </lang>" \
           "</mstts:express-as>" \
           "</voice></speak> " % (safe_profile_id, safe_text)
    result = speech_synthesizer.speak_ssml_async(ssml).get()

    # Check result
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text, output_file_path))
        print("result id: {}".format(result.result_id))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("result id: {}".format(result.result_id))


def clean_up(config: customvoice.Config, project_id: str, consent_id: str, personal_voice_id: str):
    """Delete the personal voice, then its consent, then the project.

    Args:
        config: customvoice configuration used for every delete call.
        project_id: id of the project to delete.
        consent_id: id of the consent to delete.
        personal_voice_id: id of the personal voice to delete.
    """
    # Order matters: personal voice first, then consent, then project.
    deletions = (
        (customvoice.PersonalVoice, personal_voice_id),
        (customvoice.Consent, consent_id),
        (customvoice.Project, project_id),
    )
    for resource_type, resource_id in deletions:
        resource_type.delete(config, resource_id)

In [6]:
# Send log records to customvoice.log; filemode='w' truncates the file each
# time this cell configures logging.
logging.basicConfig(filename="customvoice.log",
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    filemode='w')
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

# Fail fast with a clear message when the credentials are missing, instead of
# handing None to customvoice.Config and failing later inside a service call.
speech_key = os.getenv("SPEECH_KEY")
speech_region = os.getenv("SPEECH_REGION")
if not speech_key or not speech_region:
    raise RuntimeError(
        "SPEECH_KEY and SPEECH_REGION must be set in the environment "
        "(for example in a .env file) before running this notebook.")

config = customvoice.Config(speech_key, speech_region, logger)


# Ids of the resources created, reused and deleted by the cells below.
project_id = 'personal-voice-project'
consent_id = 'personal-voice-consent'
personal_voice_id  = 'personal-voice'


Project created. project id: personal-voice-project
Create consent succeeded. consent id: personal-voice-consent
Create personal voice succeeded. personal voice id: personal-voice, speaker profile id: 87c02382-35e9-4484-ac8e-32d2a87cffa6


In [9]:
# step 1: create personal voice
# A consent file and an audio file are needed to create a personal voice.
# This is the consent statement template:
# I [voice talent name] am aware that recordings of my voice will be used by [company name] to create and use a synthetic version of my voice.
# You can find a sample consent file here:
# https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/VoiceTalentVerbalStatement.wav
consent_file_path = r'./consent/consent.wav'
voice_talent_name = 'Yingting Huang'
company_name = 'Microsoft'
# A 50 - 90 second audio sample is needed.
# You can find sample audio files here:
# https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/SampleAudios.zip
# Please unzip the archive and put the first 14 files in the folder below.
audio_folder = r'./samples/'
speaker_profile_id = create_personal_voice(config, project_id, 
                                           consent_id, consent_file_path, voice_talent_name, company_name,
                                           personal_voice_id, audio_folder)

# step 2: synthesize speech to a wave file using the new speaker profile
text = """
How hot is too hot to work? It is a question researchers have found the answer to here, in Cambodia's brick kilns, where people toil in some of the hottest working conditions in the world, fuelled in part by the scraps of fast fashion.
"""
output_wave_file_path = './output/001.wav'
speech_synthesis_to_wave_file(text, output_wave_file_path, speaker_profile_id)

Speech synthesized for text [
How hot is too hot to work? It is a question researchers have found the answer to here, in Cambodia's brick kilns, where people toil in some of the hottest working conditions in the world, fuelled in part by the scraps of fast fashion.
], and the audio was saved to [./output/001.wav]
result id: a0ed5363c13b413098782bcd012d2b09


In [15]:
# step 3: reuse an existing personal voice to synthesize more content.
# The voice is looked up by personal_voice_id, so it must still exist:
# run this cell before the clean-up cell, not after it.
speaker_profile_id = get_personal_voice(config, personal_voice_id)
text = """
The BBC spoke to several workers who said they sweat so much through the day that it felt like they were in a hot bath. Fainting is common too, possibly because they become dehydrated. Their names have been changed because they fear reprisals from their employers
"""
output_wave_file_path = './output/002.wav'
speech_synthesis_to_wave_file(text, output_wave_file_path, speaker_profile_id)

Exception: Service return error
Request URL: GET https://southeastasia.api.cognitive.microsoft.com/customvoice/personalvoices/personal-voice?api-version=2023-12-01-preview
status code: 404
response:
{
    "error": {
        "code": "NotFound",
        "message": "Resource not found"
    }
}

In [14]:
# Optional step 4: clean up, if you don't need this voice to synthesize more content.
# This deletes the personal voice, the consent and the project; cells that look
# the voice up by id are presumably unusable afterwards until it is recreated.
clean_up(config, project_id, consent_id, personal_voice_id)